Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 1718
1 file changed, 823 insertions, 895 deletions
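The patch below converts the btrfs buffered write path from batching an array of pages to preparing and copying into one folio at a time. As a condensed sketch of the resulting top-level loop, assuming the copy_one_range() signature introduced in the hunks that follow (locking, generic_write_checks() and error paths elided):

/*
 * Condensed sketch of the reworked btrfs_buffered_write() loop, based on
 * the hunks below; copy_one_range() returns the number of bytes copied
 * into a single folio, 0 to ask the caller to retry the same range, or
 * a negative errno.
 */
while (iov_iter_count(iter) > 0) {
	ssize_t ret;

	ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
	if (ret < 0)
		break;
	pos += ret;
	num_written += ret;
	cond_resched();
}

A zero return simply loops again at the same position, which is how short copies caused by page faults are retried without the old page-at-a-time fallback mode.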
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f9d76072398d..8ce6f45f45e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -17,8 +17,8 @@ #include <linux/uio.h> #include <linux/iversion.h> #include <linux/fsverity.h> -#include <linux/iomap.h> #include "ctree.h" +#include "direct-io.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" @@ -36,100 +36,42 @@ #include "ioctl.h" #include "file.h" #include "super.h" - -/* simple helper to fault in pages and copy. This should go away - * and be replaced with calls into generic code. - */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct page **prepared_pages, - struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int pg = 0; - int offset = offset_in_page(pos); - - while (write_bytes > 0) { - size_t count = min_t(size_t, - PAGE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[pg]; - /* - * Copy data from userspace to the current page - */ - copied = copy_page_from_iter_atomic(page, offset, count, i); - - /* Flush processor's dcache for this page */ - flush_dcache_page(page); - - /* - * if we get a partial write, we can end up with - * partially up to date pages. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (unlikely(copied < count)) { - if (!PageUptodate(page)) { - iov_iter_revert(i, copied); - copied = 0; - } - if (!copied) - break; - } - - write_bytes -= copied; - total_copied += copied; - offset += copied; - if (offset == PAGE_SIZE) { - pg++; - offset = 0; - } - } - return total_copied; -} +#include "print-tree.h" /* - * unlocks pages after btrfs_file_write is done with them + * Unlock folio after btrfs_file_write() is done with it. */ -static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, - struct page **pages, size_t num_pages, +static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, u64 pos, u64 copied) { - size_t i; u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); - for (i = 0; i < num_pages; i++) { - /* page checked is some magic around finding pages that - * have been modified without going through btrfs_set_page_dirty - * clear it here. There should be no need to mark the pages - * accessed as prepare_pages should have marked them accessed - * in prepare_pages via find_or_create_page() - */ - btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), - block_start, block_len); - unlock_page(pages[i]); - put_page(pages[i]); - } + /* + * Folio checked is some magic around finding folios that have been + * modified without going through btrfs_dirty_folio(). Clear it here. + * There should be no need to mark the pages accessed as + * prepare_one_folio() should have marked them accessed in + * prepare_one_folio() via find_or_create_page() + */ + btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); + folio_unlock(folio); + folio_put(folio); } /* - * After btrfs_copy_from_user(), update the following things for delalloc: - * - Mark newly dirtied pages as DELALLOC in the io tree. + * After copy_folio_from_iter_atomic(), update the following things for delalloc: + * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. 
- * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve) +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int err = 0; - int i; + int ret = 0; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -147,6 +89,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); + ASSERT(folio_pos(folio) <= pos && + folio_pos(folio) + folio_size(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -154,25 +98,18 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ - clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - cached); + btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + cached); - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); - if (err) - return err; - - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; + if (ret) + return ret; - btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), - start_pos, num_bytes); - } + btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated @@ -206,7 +143,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); @@ -243,10 +179,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; - update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -264,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } ret = btrfs_next_leaf(root, path); if (ret < 0) break; @@ -340,7 +280,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end < 
extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -370,18 +314,19 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, - disk_bytenr, num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - new_key.objectid, - args->start - extent_offset, - 0, false); + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, new_key.objectid, + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -414,7 +359,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -426,7 +370,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -434,7 +382,6 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -454,7 +401,11 @@ delete_extent_item: del_slot = path->slots[0]; del_nr = 1; } else { - BUG_ON(del_slot + del_nr != path->slots[0]); + if (WARN_ON(del_slot + del_nr != path->slots[0])) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } del_nr++; } @@ -464,15 +415,17 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_DROP_DELAYED_REF, - disk_bytenr, num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - key.objectid, - key.offset - extent_offset, 0, - false); + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, key.objectid, + key.offset - extent_offset, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -555,20 +508,19 @@ out: return ret; } -static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 orig_offset, - u64 *start, u64 *end) +static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid, + u64 bytenr, u64 orig_offset, u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 extent_end; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || @@ -577,15 +529,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) - return 0; + return false; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end)) - return 0; + return false; *start = key.offset; *end = extent_end; - return 1; + return true; } /* @@ -600,7 +552,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; @@ -683,7 +635,6 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -712,7 +663,6 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -746,12 +696,14 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(trans, leaf); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, - num_bytes, 0, root->root_key.objectid); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset, 0, false); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -774,10 +726,14 @@ again: other_start = end; other_end = 0; - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, 0, root->root_key.objectid); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, - 0, false); + + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -818,7 +774,6 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -827,7 +782,6 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -836,58 +790,51 @@ again: } } out: - btrfs_free_path(path); return ret; } /* - * on error we return an unlocked page and the error value - * on success we return a locked page and 0 + * On error return an unlocked folio and the error value + * On success return a locked folio and 0 */ -static int prepare_uptodate_page(struct inode *inode, - struct page *page, u64 pos, - bool force_uptodate) +static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, + u64 len) { - struct folio *folio = 
page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } - - /* - * Since btrfs_read_folio() will unlock the folio before it - * returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. - */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } - } - return 0; -} + if (folio_test_uptodate(folio)) + return 0; -static fgf_t get_prepare_fgp_flags(bool nowait) -{ - fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + if (IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) + return 0; - if (nowait) - fgp_flags |= FGP_NOWAIT; + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } - return fgp_flags; + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private. + */ + if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; + } + return 0; } static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) @@ -904,89 +851,62 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) } /* - * this just gets pages into the page cache and locks them down. + * Get folio into the page cache and lock it. */ -static noinline int prepare_pages(struct inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate, - bool nowait) +static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, + loff_t pos, size_t write_bytes, + bool nowait) { - int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = get_prepare_fgp_flags(nowait); - int err = 0; - int faili; + fgf_t fgp_flags = (nowait ? 
FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | + fgf_set_order(write_bytes); + struct folio *folio; + int ret = 0; - for (i = 0; i < num_pages; i++) { again: - pages[i] = pagecache_get_page(inode->i_mapping, index + i, - fgp_flags, mask | __GFP_WRITE); - if (!pages[i]) { - faili = i - 1; - if (nowait) - err = -EAGAIN; - else - err = -ENOMEM; - goto fail; - } - - err = set_page_extent_mapped(pages[i]); - if (err < 0) { - faili = i; - goto fail; - } - - if (i == 0) - err = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!err && i == num_pages - 1) - err = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); - if (err) { - put_page(pages[i]); - if (!nowait && err == -EAGAIN) { - err = 0; - goto again; - } - faili = i - 1; - goto fail; + folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + folio_unlock(folio); + folio_put(folio); + return ret; + } + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); + if (ret) { + /* The folio is already unlocked. */ + folio_put(folio); + if (!nowait && ret == -EAGAIN) { + ret = 0; + goto again; } - wait_on_page_writeback(pages[i]); + return ret; } - + *folio_ret = folio; return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - put_page(pages[faili]); - faili--; - } - return err; - } /* - * This function locks the extent and properly waits for data=ordered extents - * to finish before allowing the pages to be modified if need. + * Locks the extent and properly waits for data=ordered extents to finish + * before allowing the folios to be modified if need. * - * The return value: + * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK - * -EAGAIN - need re-prepare the pages - * the other < 0 number - Something wrong happens + * -EAGAIN - need to prepare the folios again */ static noinline int -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, + loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; - int i; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); @@ -996,18 +916,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, - cached_state)) { - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - pages[i] = NULL; - } - + if (!btrfs_try_lock_extent(&inode->io_tree, start_pos, + last_pos, cached_state)) { + folio_unlock(folio); + folio_put(folio); return -EAGAIN; } } else { - lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + btrfs_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); } ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -1015,12 +932,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent(&inode->io_tree, start_pos, last_pos, - cached_state); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - } + 
btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; @@ -1034,11 +949,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * We should be called after prepare_pages() which should have locked + * We should be called after prepare_one_folio() which should have locked * all pages in the range. */ - for (i = 0; i < num_pages; i++) - WARN_ON(!PageLocked(pages[i])); + WARN_ON(!folio_test_locked(folio)); return ret; } @@ -1093,14 +1007,13 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL, nowait, false); + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } @@ -1110,28 +1023,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now, ts; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - ts = inode_get_mtime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_mtime_to_ts(inode, now); - - ts = inode_get_ctime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_ctime_to_ts(inode, now); - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - -static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, - size_t count) +int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1139,7 +1031,6 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1161,11 +1052,13 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. 
*/ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + inode_inc_iversion(inode); + } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); @@ -1177,456 +1070,316 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, return 0; } -static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, - struct iov_iter *i) +static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, + u64 start, u64 len, bool only_release_metadata) { - struct file *file = iocb->ki_filp; - loff_t pos; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct page **pages = NULL; - struct extent_changeset *data_reserved = NULL; - u64 release_bytes = 0; - u64 lockstart; - u64 lockend; - size_t num_written = 0; - int nrptrs; - ssize_t ret; - bool only_release_metadata = false; - bool force_page_uptodate = false; - loff_t old_isize = i_size_read(inode); - unsigned int ilock_flags = 0; - const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); - - if (nowait) - ilock_flags |= BTRFS_ILOCK_TRY; + if (len == 0) + return; - ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (ret < 0) - return ret; + if (only_release_metadata) { + btrfs_check_nocow_unlock(inode); + btrfs_delalloc_release_metadata(inode, len, true); + } else { + const struct btrfs_fs_info *fs_info = inode->root->fs_info; - ret = generic_write_checks(iocb, i); - if (ret <= 0) - goto out; + btrfs_delalloc_release_space(inode, data_reserved, + round_down(start, fs_info->sectorsize), + len, true); + } +} - ret = btrfs_write_check(iocb, i, ret); - if (ret < 0) - goto out; +/* + * Reserve data and metadata space for this buffered write range. + * + * Return >0 for the number of bytes reserved, which is always block aligned. + * Return <0 for error. + */ +static ssize_t reserve_space(struct btrfs_inode *inode, + struct extent_changeset **data_reserved, + u64 start, size_t *len, bool nowait, + bool *only_release_metadata) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + const unsigned int block_offset = (start & (fs_info->sectorsize - 1)); + size_t reserve_bytes; + int ret; - pos = iocb->ki_pos; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), - PAGE_SIZE / (sizeof(struct page *))); - nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); - nrptrs = max(nrptrs, 8); - pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out; - } + ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait); + if (ret < 0) { + int can_nocow; - while (iov_iter_count(i) > 0) { - struct extent_state *cached_state = NULL; - size_t offset = offset_in_page(pos); - size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_SIZE - - offset); - size_t num_pages; - size_t reserve_bytes; - size_t dirty_pages; - size_t copied; - size_t dirty_sectors; - size_t num_sectors; - int extents_locked; + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) + return -EAGAIN; /* - * Fault pages before locking them in prepare_pages - * to avoid recursive lock + * If we don't have to COW at the offset, reserve metadata only. 
+ * write_bytes may get smaller than requested here. */ - if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { - ret = -EFAULT; - break; - } - - only_release_metadata = false; - sector_offset = pos & (fs_info->sectorsize - 1); - - extent_changeset_release(data_reserved); - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &data_reserved, pos, - write_bytes, nowait); - if (ret < 0) { - int can_nocow; - - if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { - ret = -EAGAIN; - break; - } + can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) + return ret; + *only_release_metadata = true; + } - /* - * If we don't have to COW at the offset, reserve - * metadata only. write_bytes may get smaller than - * requested here. - */ - can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes, nowait); - if (can_nocow < 0) - ret = can_nocow; - if (can_nocow > 0) - ret = 0; - if (ret) - break; - only_release_metadata = true; - } + reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize); + WARN_ON(reserve_bytes == 0); + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes, + reserve_bytes, nowait); + if (ret) { + if (!*only_release_metadata) + btrfs_free_reserved_data_space(inode, *data_reserved, + start, *len); + else + btrfs_check_nocow_unlock(inode); - num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); - WARN_ON(num_pages > nrptrs); - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); - WARN_ON(reserve_bytes == 0); - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes, - reserve_bytes, nowait); - if (ret) { - if (!only_release_metadata) - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, pos, - write_bytes); - else - btrfs_check_nocow_unlock(BTRFS_I(inode)); + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; + return ret; + } + return reserve_bytes; +} - if (nowait && ret == -ENOSPC) - ret = -EAGAIN; - break; - } +/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ +static void shrink_reserved_space(struct btrfs_inode *inode, + struct extent_changeset *data_reserved, + u64 reserved_start, u64 reserved_len, + u64 new_len, bool only_release_metadata) +{ + const u64 diff = reserved_len - new_len; - release_bytes = reserve_bytes; -again: - ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - break; - } + ASSERT(new_len <= reserved_len); + btrfs_delalloc_shrink_extents(inode, reserved_len, new_len); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, diff, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + reserved_start + new_len, diff, true); +} - /* - * This is going to setup the pages array with the number of - * pages we want, so we don't really need to worry about the - * contents of pages from loop to loop - */ - ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, force_page_uptodate, false); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - break; - } +/* Calculate the maximum amount of bytes we can write into one folio. 
*/ +static size_t calc_write_bytes(const struct btrfs_inode *inode, + const struct iov_iter *iter, u64 start) +{ + const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping); - extents_locked = lock_and_cleanup_extent_if_need( - BTRFS_I(inode), pages, - num_pages, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); - if (extents_locked < 0) { - if (!nowait && extents_locked == -EAGAIN) - goto again; + return min(max_folio_size - (start & (max_folio_size - 1)), + iov_iter_count(iter)); +} - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - ret = extents_locked; - break; - } +/* + * Do the heavy-lifting work to copy one range into one folio of the page cache. + * + * Return > 0 in case we copied all bytes or just some of them. + * Return 0 if no bytes were copied, in which case the caller should retry. + * Return <0 on error. + */ +static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, + struct extent_changeset **data_reserved, u64 start, + bool nowait) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_state *cached_state = NULL; + size_t write_bytes = calc_write_bytes(inode, iter, start); + size_t copied; + const u64 reserved_start = round_down(start, fs_info->sectorsize); + u64 reserved_len; + struct folio *folio = NULL; + int extents_locked; + u64 lockstart; + u64 lockend; + bool only_release_metadata = false; + const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + int ret; - copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + /* + * Fault all pages before locking them in prepare_one_folio() to avoid + * recursive lock. + */ + if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) + return -EFAULT; + extent_changeset_release(*data_reserved); + ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait, + &only_release_metadata); + if (ret < 0) + return ret; + reserved_len = ret; + /* Write range must be inside the reserved range. */ + ASSERT(reserved_start <= start); + ASSERT(start + write_bytes <= reserved_start + reserved_len); - num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - fs_info->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); +again: + ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping, + bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - /* - * if we have trouble faulting in the pages, fall - * back to one page at a time - */ - if (copied < write_bytes) - nrptrs = 1; + ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - if (copied == 0) { - force_page_uptodate = true; - dirty_sectors = 0; - dirty_pages = 0; - } else { - force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); - } + /* + * The reserved range goes beyond the current folio, shrink the reserved + * space to the folio boundary. 
+ */ + if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) { + const u64 last_block = folio_pos(folio) + folio_size(folio); + + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + write_bytes = last_block - start; + reserved_len = last_block - reserved_start; + } + + extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start, + write_bytes, &lockstart, + &lockend, nowait, + &cached_state); + if (extents_locked < 0) { + if (!nowait && extents_locked == -EAGAIN) + goto again; - if (num_sectors > dirty_sectors) { - /* release everything except the sectors we dirtied */ - release_bytes -= dirty_sectors << fs_info->sectorsize_bits; - if (only_release_metadata) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, - release_bytes, true); - } - } + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + ret = extents_locked; + return ret; + } - release_bytes = round_up(copied + sector_offset, - fs_info->sectorsize); + copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), + write_bytes, iter); + flush_dcache_folio(folio); - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, - &cached_state, only_release_metadata); + if (unlikely(copied < write_bytes)) { + u64 last_block; /* - * If we have not locked the extent range, because the range's - * start offset is >= i_size, we might still have a non-NULL - * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_pages(). Therefore free any - * possible cached extent state to avoid a memory leak. + * The original write range doesn't need an uptodate folio as + * the range is block aligned. But now a short copy happened. + * We cannot handle it without an uptodate folio. + * + * So just revert the range and we will retry. */ - if (extents_locked) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); - else - free_extent_state(cached_state); - - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - if (ret) { - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); - break; + if (!folio_test_uptodate(folio)) { + iov_iter_revert(iter, copied); + copied = 0; } - release_bytes = 0; - if (only_release_metadata) - btrfs_check_nocow_unlock(BTRFS_I(inode)); - - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + /* No copied bytes, unlock, release reserved space and exit. */ + if (copied == 0) { + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, + &cached_state); + else + btrfs_free_extent_state(cached_state); + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + btrfs_drop_folio(fs_info, folio, start, copied); + return 0; + } - cond_resched(); + /* Release the reserved space beyond the last block. 
*/ + last_block = round_up(start + copied, fs_info->sectorsize); - pos += copied; - num_written += copied; + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + reserved_len = last_block - reserved_start; } - kfree(pages); - - if (release_bytes) { - if (only_release_metadata) { - btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, - round_down(pos, fs_info->sectorsize), - release_bytes, true); - } - } + ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state, + only_release_metadata); + /* + * If we have not locked the extent range, because the range's start + * offset is >= i_size, we might still have a non-NULL cached extent + * state, acquired while marking the extent range as delalloc through + * btrfs_dirty_page(). Therefore free any possible cached extent state + * to avoid a memory leak. + */ + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + else + btrfs_free_extent_state(cached_state); - extent_changeset_free(data_reserved); - if (num_written > 0) { - pagecache_isize_extended(inode, old_isize, iocb->ki_pos); - iocb->ki_pos += num_written; + btrfs_delalloc_release_extents(inode, reserved_len); + if (ret) { + btrfs_drop_folio(fs_info, folio, start, copied); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; } -out: - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - return num_written ? num_written : ret; -} - -static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - const u32 blocksize_mask = fs_info->sectorsize - 1; - - if (offset & blocksize_mask) - return -EINVAL; - - if (iov_iter_alignment(iter) & blocksize_mask) - return -EINVAL; + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); - return 0; + btrfs_drop_folio(fs_info, folio, start, copied); + return copied; } -static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos; - ssize_t written = 0; - ssize_t written_buffered; - size_t prev_left = 0; - loff_t endbyte; - ssize_t err; + struct inode *inode = file_inode(file); + struct extent_changeset *data_reserved = NULL; + size_t num_written = 0; + ssize_t ret; + loff_t old_isize; unsigned int ilock_flags = 0; - struct iomap_dio *dio; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - if (iocb->ki_flags & IOCB_NOWAIT) + if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; - /* - * If the write DIO is within EOF, use a shared lock and also only if - * security bits will likely not be dropped by file_remove_privs() called - * from btrfs_write_check(). Either will need to be rechecked after the - * lock was acquired. - */ - if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) - ilock_flags |= BTRFS_ILOCK_SHARED; - -relock: - err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (err < 0) - return err; - - /* Shared lock cannot be used with security bits set. 
*/ - if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - ilock_flags &= ~BTRFS_ILOCK_SHARED; - goto relock; - } - - err = generic_write_checks(iocb, from); - if (err <= 0) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - return err; - } - - err = btrfs_write_check(iocb, from, err); - if (err < 0) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - goto out; - } - - pos = iocb->ki_pos; - /* - * Re-check since file size may have changed just before taking the - * lock or pos may have changed because of O_APPEND in generic_write_check() - */ - if ((ilock_flags & BTRFS_ILOCK_SHARED) && - pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - ilock_flags &= ~BTRFS_ILOCK_SHARED; - goto relock; - } - - if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - goto buffered; - } - - /* - * The iov_iter can be mapped to the same file range we are writing to. - * If that's the case, then we will deadlock in the iomap code, because - * it first calls our callback btrfs_dio_iomap_begin(), which will create - * an ordered extent, and after that it will fault in the pages that the - * iov_iter refers to. During the fault in we end up in the readahead - * pages code (starting at btrfs_readahead()), which will lock the range, - * find that ordered extent and then wait for it to complete (at - * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since - * obviously the ordered extent can never complete as we didn't submit - * yet the respective bio(s). This always happens when the buffer is - * memory mapped to the same file range, since the iomap DIO code always - * invalidates pages in the target file range (after starting and waiting - * for any writeback). - * - * So here we disable page faults in the iov_iter and then retry if we - * got -EFAULT, faulting in the pages before the retry. - */ - from->nofault = true; - dio = btrfs_dio_write(iocb, from, written); - from->nofault = false; + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; /* - * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync - * iocb, and that needs to lock the inode. So unlock it before calling - * iomap_dio_complete() to avoid a deadlock. + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. */ - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - - if (IS_ERR_OR_NULL(dio)) - err = PTR_ERR_OR_ZERO(dio); - else - err = iomap_dio_complete(dio); - - /* No increment (+=) because iomap returns a cumulative value. */ - if (err > 0) - written = err; + old_isize = i_size_read(inode); - if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { - const size_t left = iov_iter_count(from); - /* - * We have more data left to write. Try to fault in as many as - * possible of the remainder pages and retry. We do this without - * releasing and locking again the inode, to prevent races with - * truncate. - * - * Also, in case the iov refers to pages in the file range of the - * file we want to write to (due to a mmap), we could enter an - * infinite loop if we retry after faulting the pages in, since - * iomap will invalidate any pages in the range early on, before - * it tries to fault in the pages of the iov. 
So we keep track of - * how much was left of iov in the previous EFAULT and fallback - * to buffered IO in case we haven't made any progress. - */ - if (left == prev_left) { - err = -ENOTBLK; - } else { - fault_in_iov_iter_readable(from, left); - prev_left = left; - goto relock; - } - } - - /* - * If 'err' is -ENOTBLK or we have not written all data, then it means - * we must fallback to buffered IO. - */ - if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) + ret = generic_write_checks(iocb, iter); + if (ret <= 0) goto out; -buffered: - /* - * If we are in a NOWAIT context, then return -EAGAIN to signal the caller - * it must retry the operation in a context where blocking is acceptable, - * because even if we end up not blocking during the buffered IO attempt - * below, we will block when flushing and waiting for the IO. - */ - if (iocb->ki_flags & IOCB_NOWAIT) { - err = -EAGAIN; + ret = btrfs_write_check(iocb, ret); + if (ret < 0) goto out; - } pos = iocb->ki_pos; - written_buffered = btrfs_buffered_write(iocb, from); - if (written_buffered < 0) { - err = written_buffered; - goto out; + while (iov_iter_count(iter) > 0) { + ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); + if (ret < 0) + break; + pos += ret; + num_written += ret; + cond_resched(); + } + + extent_changeset_free(data_reserved); + if (num_written > 0) { + pagecache_isize_extended(inode, old_isize, iocb->ki_pos); + iocb->ki_pos += num_written; } - /* - * Ensure all data is persisted. We want the next direct IO read to be - * able to read what was just written. - */ - endbyte = pos + written_buffered - 1; - err = btrfs_fdatawrite_range(inode, pos, endbyte); - if (err) - goto out; - err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); - if (err) - goto out; - written += written_buffered; - iocb->ki_pos = pos + written_buffered; - invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); out: - return err < 0 ? err : written; + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + return num_written ? 
num_written : ret; } static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, @@ -1650,7 +1403,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (ret || encoded->len == 0) goto out; - ret = btrfs_write_check(iocb, from, encoded->len); + ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; @@ -1711,7 +1464,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) if (private) { kfree(private->filldir_buf); - free_extent_state(private->llseek_cached_state); + btrfs_free_extent_state(private->llseek_cached_state); kfree(private); filp->private_data = NULL; } @@ -1728,7 +1481,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) return 0; } -static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end) { int ret; struct blk_plug plug; @@ -1748,7 +1501,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) { - struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_inode *inode = ctx->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && @@ -1784,14 +1537,21 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; u64 len; bool full_sync; + bool skip_ilock = false; + + if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { + skip_ilock = true; + current->journal_info = NULL; + btrfs_assert_inode_locked(inode); + } trace_btrfs_sync_file(file, datasync); @@ -1819,7 +1579,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + down_write(&inode->i_mmap_lock); + else + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -1843,7 +1606,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -1855,8 +1621,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * running delalloc the full sync flag may be set if we need to drop * extra extent map ranges due to temporary memory allocation failures. 
*/ - full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We have to do this here to avoid the priority inversion of waiting on @@ -1875,15 +1640,29 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); + clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing * checksum lookups in the csum tree, and use instead the * checksums attached to the ordered extents. */ - btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), - &ctx.ordered_extents); - ret = filemap_fdatawait_range(inode->i_mapping, start, end); + btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end); + if (ret) + goto out_release_extents; + + /* + * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after + * starting and waiting for writeback, because for buffered IO + * it may have been set during the end IO callback + * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in + * case an error happened and we need to wait for ordered + * extents to complete so that any extent maps that point to + * unwritten locations are dropped and we don't log them. + */ + if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags)) + ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) @@ -1897,8 +1676,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ - clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * An ordered extent might have started before and completed * already with io errors, in which case the inode was not @@ -1906,7 +1684,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * for any errors that might have happened since we last * checked called fsync. */ - ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); + ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err); goto out_release_extents; } @@ -1956,7 +1734,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2025,10 +1806,172 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. 
+ * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct folio *folio = page_folio(page); + struct inode *inode = file_inode(vmf->vma->vm_file); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + unsigned long zero_start; + loff_t size; + size_t fsize = folio_size(folio); + int ret; + u64 reserved_space; + u64 page_start; + u64 page_end; + u64 end; + + reserved_space = fsize; + + sb_start_pagefault(inode->i_sb); + page_start = folio_pos(folio); + page_end = page_start + folio_size(folio) - 1; + end = page_end; + + /* + * Reserving delalloc space after obtaining the page lock can lead to + * deadlock. For example, if a dirty page is locked by this function + * and the call to btrfs_delalloc_reserve_space() ends up triggering + * dirty page write out, then the btrfs_writepages() function could + * end up waiting indefinitely to get a lock on the page currently + * being processed by btrfs_page_mkwrite() function. + */ + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (ret < 0) + goto out_noreserve; + + ret = file_update_time(vmf->vma->vm_file); + if (ret < 0) + goto out; +again: + down_read(&BTRFS_I(inode)->i_mmap_lock); + folio_lock(folio); + size = i_size_read(inode); + + if ((folio->mapping != inode->i_mapping) || + (page_start >= size)) { + /* Page got truncated out from underneath us. */ + goto out_unlock; + } + folio_wait_writeback(folio); + + btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } + + /* + * We can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish. + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); + if (ordered) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); + folio_unlock(folio); + up_read(&BTRFS_I(inode)->i_mmap_lock); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } + + if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { + reserved_space = round_up(size - page_start, fs_info->sectorsize); + if (reserved_space < fsize) { + end = page_start + reserved_space - 1; + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, end + 1, + fsize - reserved_space, true); + } + } + + /* + * page_mkwrite gets called when the page is firstly dirtied after it's + * faulted in, but write(2) could also dirty a page and set delalloc + * bits, thus in this case for space account reason, we still need to + * clear any delalloc bits within this page range since we have to + * reserve data&meta space before lock_page() (see above comments). 
+ */ + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); + + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + &cached_state); + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } + + /* Page is wholly or partially inside EOF. */ + if (page_start + folio_size(folio) > size) + zero_start = offset_in_folio(folio, size); + else + zero_start = fsize; + + if (zero_start != fsize) + folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); + + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); + btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); + btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); + + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); + + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; + +out_unlock: + folio_unlock(folio); + up_read(&BTRFS_I(inode)->i_mmap_lock); +out: + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, + reserved_space, true); + extent_changeset_free(data_reserved); +out_noreserve: + sb_end_pagefault(inode->i_sb); + + if (ret < 0) + return vmf_error(ret); + + /* Make the VM retry the fault. */ + return VM_FAULT_NOPAGE; +} + static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -2048,33 +1991,33 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, - int slot, u64 start, u64 end) +static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; + return false; if (btrfs_file_extent_disk_bytenr(leaf, fi)) - return 0; + return false; if (key.offset == end) - return 1; + return true; if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) - return 1; - return 0; + return true; + return false; } static int fill_holes(struct btrfs_trans_handle *trans, @@ -2120,7 +2063,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2137,7 +2079,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2150,7 +2091,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); - hole_em = alloc_extent_map(); + hole_em = 
btrfs_alloc_extent_map();
 if (!hole_em) {
 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
 btrfs_set_inode_full_sync(inode);
@@ -2158,15 +2099,13 @@ out:
 hole_em->start = offset;
 hole_em->len = end - offset;
 hole_em->ram_bytes = hole_em->len;
- hole_em->orig_start = offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
 hole_em->generation = trans->transid;
 
 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ btrfs_free_extent_map(hole_em);
 if (ret)
 btrfs_set_inode_full_sync(inode);
 }
@@ -2193,21 +2132,39 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
 return PTR_ERR(em);
 
 /* Hole or vacuum extent (only exists in no-hole mode) */
- if (em->block_start == EXTENT_MAP_HOLE) {
+ if (em->disk_bytenr == EXTENT_MAP_HOLE) {
 ret = 1;
 *len = em->start + em->len > *start + *len ?
 0 : *start + *len - em->start - em->len;
 *start = em->start + em->len;
 }
- free_extent_map(em);
+ btrfs_free_extent_map(em);
 return ret;
 }
 
-static void btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+/*
+ * Check if there is no folio in the range.
+ *
+ * We cannot utilize filemap_range_has_page() in a filemap with large folios
+ * as we can hit the following false positive:
+ *
+ *   start                         end
+ *     |                            |
+ * |//|//|//|//| | | | | | | | |//|//|
+ *  \        /                  \    /
+ *   Folio A                    Folio B
+ *
+ * That is, large folios A and B cover the start and end indexes.
+ * In that case, filemap_range_has_page() will always return true, but the
+ * above case is fine for btrfs_punch_hole_lock_range() usage.
+ *
+ * So here we only ensure that no other folio is in the range, excluding the
+ * head/tail large folios.
+ */
+static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
 {
+ struct folio_batch fbatch;
+ bool ret = false;
 /*
 * For subpage case, if the range is not at page boundary, we could
 * have pages at the leading/tailing part of the range.
@@ -2215,15 +2172,48 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
 * will always return true.
 * So here we need to do extra page alignment for
 * filemap_range_has_page().
+ *
+ * And do not decrease page_lockend right now, as it can be 0.
 */
- const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockstart = round_up(start, PAGE_SIZE);
+ const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
+ const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
+ const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
+ pgoff_t tmp = start_index;
+ int found_folios;
+
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ return false;
+
+ folio_batch_init(&fbatch);
+ found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
+ for (int i = 0; i < found_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ /* A large folio begins before the start. Not a target. */
+ if (folio->index < start_index)
+ continue;
+ /* A large folio extends beyond the end. Not a target. */
+ if (folio->index + folio_nr_pages(folio) > end_index)
+ continue;
+ /* A folio doesn't cover the head/tail index. Found a target. 
*/ + ret = true; + break; + } + folio_batch_release(&fbatch); + return ret; +} +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, const u64 lockend, + struct extent_state **cached_state) +{ while (1) { truncate_pagecache_range(inode, lockstart, lockend); - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2234,12 +2224,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * locking the range check if we have pages in the range, and if * we do, unlock the range and retry. */ - if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + if (!check_range_has_page(inode, lockstart, lockend)) break; - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); @@ -2258,7 +2247,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; int slot; - struct btrfs_ref ref = { 0 }; int ret; if (replace_len == 0) @@ -2288,7 +2276,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, @@ -2314,15 +2301,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->qgroup_reserved, &key); } else { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = extent_info->disk_offset, + .num_bytes = extent_info->disk_len, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; u64 ref_offset; - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - extent_info->disk_offset, - extent_info->disk_len, 0, - root->root_key.objectid); ref_offset = extent_info->file_offset - extent_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset, 0, false); + btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2612,7 +2601,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) u64 lockend; u64 tail_start; u64 tail_len; - u64 orig_start = offset; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; int ret = 0; bool same_block; u64 ino_size; @@ -2621,7 +2611,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); - ret = btrfs_wait_ordered_range(inode, offset, len); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); if (ret) goto out_only_mutex; @@ -2644,18 +2634,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* - * We needn't truncate any block which is beyond the end of the file - * because we are sure there is no data there. - */ - /* * Only do this if we are in the same block and we aren't doing the * entire block. 
*/ if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); } else { ret = 0; } @@ -2665,7 +2651,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) { btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; @@ -2702,8 +2688,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) if (tail_start + tail_len < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), - tail_start + tail_len, - 0, 1); + tail_start + tail_len - 1, + orig_start, orig_end); if (ret) goto out_only_mutex; } @@ -2737,8 +2723,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -2849,14 +2835,14 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start == EXTENT_MAP_HOLE) + if (em->disk_bytenr == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } @@ -2871,6 +2857,8 @@ static int btrfs_zero_range(struct inode *inode, int ret; u64 alloc_hint = 0; const u64 sectorsize = fs_info->sectorsize; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -2900,7 +2888,7 @@ static int btrfs_zero_range(struct inode *inode, * do nothing except updating the inode's i_size if * needed. 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; @@ -2913,9 +2901,9 @@ static int btrfs_zero_range(struct inode *inode, ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; - alloc_hint = em->block_start + em->len; + alloc_hint = btrfs_extent_map_block_start(em) + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { @@ -2926,22 +2914,22 @@ static int btrfs_zero_range(struct inode *inode, } if (em->flags & EXTENT_FLAG_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } - if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { - free_extent_map(em); - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { + btrfs_free_extent_map(em); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, mode); return ret; } - free_extent_map(em); + btrfs_free_extent_map(em); alloc_start = round_down(offset, sectorsize); alloc_end = alloc_start + sectorsize; goto reserve_space; @@ -2965,7 +2953,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, + orig_start, orig_end); if (ret) goto out; } else { @@ -2982,8 +2971,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, - 0, 1); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (ret) goto out; } else { @@ -3008,16 +2997,16 @@ reserve_space: ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, fs_info->sectorsize, offset + len, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -3103,7 +3092,8 @@ static long btrfs_fallocate(struct file *file, int mode, * need to zero out the end of the block if i_size lands in the * middle of a block. */ - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, + inode->i_size, (u64)-1); if (ret) goto out; } @@ -3116,7 +3106,7 @@ static long btrfs_fallocate(struct file *file, int mode, * the file range and, due to the previous locking we did, we know there * can't be more delalloc or ordered extents in the range. 
*/
- ret = btrfs_wait_ordered_range(inode, alloc_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
 alloc_end - alloc_start);
 if (ret)
 goto out;
@@ -3128,8 +3118,8 @@ static long btrfs_fallocate(struct file *file, int mode,
 }
 
 locked_end = alloc_end - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
 
 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
 
@@ -3141,29 +3131,29 @@ static long btrfs_fallocate(struct file *file, int mode,
 ret = PTR_ERR(em);
 break;
 }
- last_byte = min(extent_map_end(em), alloc_end);
- actual_end = min_t(u64, extent_map_end(em), offset + len);
+ last_byte = min(btrfs_extent_map_end(em), alloc_end);
+ actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
 last_byte = ALIGN(last_byte, blocksize);
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
 (cur_offset >= inode->i_size &&
 !(em->flags & EXTENT_FLAG_PREALLOC))) {
 const u64 range_len = last_byte - cur_offset;
 
 ret = add_falloc_range(&reserve_list, cur_offset, range_len);
 if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
 break;
 }
 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
 &data_reserved, cur_offset, range_len);
 if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
 break;
 }
 qgroup_reserved += range_len;
 data_space_needed += range_len;
 }
- free_extent_map(em);
+ btrfs_free_extent_map(em);
 cur_offset = last_byte;
 }
 
@@ -3217,8 +3207,8 @@ static long btrfs_fallocate(struct file *file, int mode,
 */
 ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
 out_unlock:
- unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
 out:
 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
 extent_changeset_free(data_reserved);
@@ -3252,10 +3242,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
 if (inode->delalloc_bytes > 0) {
 spin_unlock(&inode->lock);
 *delalloc_start_ret = start;
- delalloc_len = count_range_bits(&inode->io_tree,
- delalloc_start_ret, end,
- len, EXTENT_DELALLOC, 1,
- cached_state);
+ delalloc_len = btrfs_count_range_bits(&inode->io_tree,
+ delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1,
+ cached_state);
 } else {
 spin_unlock(&inode->lock);
 }
@@ -3469,7 +3459,7 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
 {
 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
- struct btrfs_file_private *private = file->private_data;
+ struct btrfs_file_private *private;
 struct btrfs_fs_info *fs_info = inode->root->fs_info;
 struct extent_state *cached_state = NULL;
 struct extent_state **delalloc_cached_state;
@@ -3497,7 +3487,19 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
 inode_get_bytes(&inode->vfs_inode) == i_size)
 return i_size;
 
- if (!private) {
+ spin_lock(&inode->lock);
+ private = file->private_data;
+ spin_unlock(&inode->lock);
+
+ if (private && private->owner_task != current) {
+ /*
+ * Not allocated by us, so don't use it, as its cached state is
+ * used by the task that allocated it. We want neither to mess
+ * with it nor to get incorrect results, because it reflects an
+ * invalid state for the current task.
+ */ + private = NULL; + } else if (!private) { private = kzalloc(sizeof(*private), GFP_KERNEL); /* * No worries if memory allocation failed. @@ -3505,7 +3507,23 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, * so everything will still be correct. */ - file->private_data = private; + if (private) { + bool free = false; + + private->owner_task = current; + + spin_lock(&inode->lock); + if (file->private_data) + free = true; + else + file->private_data = private; + spin_unlock(&inode->lock); + + if (free) { + kfree(private); + private = NULL; + } + } } if (private) @@ -3536,7 +3554,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) last_extent_end = lockstart; - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -3682,7 +3700,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) } out: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_free_path(path); if (ret < 0) @@ -3719,8 +3737,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_CAN_ODIRECT; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) @@ -3728,97 +3745,6 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } -static int check_direct_read(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - int ret; - int i, seg; - - ret = check_direct_IO(fs_info, iter, offset); - if (ret < 0) - return ret; - - if (!iter_is_iovec(iter)) - return 0; - - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - const struct iovec *iov1 = iter_iov(iter) + seg; - const struct iovec *iov2 = iter_iov(iter) + i; - - if (iov1->iov_base == iov2->iov_base) - return -EINVAL; - } - } - return 0; -} - -static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) -{ - struct inode *inode = file_inode(iocb->ki_filp); - size_t prev_left = 0; - ssize_t read = 0; - ssize_t ret; - - if (fsverity_active(inode)) - return 0; - - if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) - return 0; - - btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); -again: - /* - * This is similar to what we do for direct IO writes, see the comment - * at btrfs_direct_write(), but we also disable page faults in addition - * to disabling them only at the iov_iter level. This is because when - * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), - * which can still trigger page fault ins despite having set ->nofault - * to true of our 'to' iov_iter. - * - * The difference to direct IO writes is that we deadlock when trying - * to lock the extent range in the inode's tree during he page reads - * triggered by the fault in (while for writes it is due to waiting for - * our own ordered extent). This is because for direct IO reads, - * btrfs_dio_iomap_begin() returns with the extent range locked, which - * is only unlocked in the endio callback (end_bio_extent_readpage()). 
- */
- pagefault_disable();
- to->nofault = true;
- ret = btrfs_dio_read(iocb, to, read);
- to->nofault = false;
- pagefault_enable();
-
- /* No increment (+=) because iomap returns a cumulative value. */
- if (ret > 0)
- read = ret;
-
- if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
- const size_t left = iov_iter_count(to);
-
- if (left == prev_left) {
- /*
- * We didn't make any progress since the last attempt,
- * fallback to a buffered read for the remainder of the
- * range. This is just to avoid any possibility of looping
- * for too long.
- */
- ret = read;
- } else {
- /*
- * We made some progress since the last retry or this is
- * the first time we are retrying. Fault in as many pages
- * as possible and retry.
- */
- fault_in_iov_iter_writeable(to, left);
- prev_left = left;
- goto again;
- }
- }
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- return ret < 0 ? ret : read;
-}
-
 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 ssize_t ret = 0;
@@ -3850,10 +3776,13 @@ const struct file_operations btrfs_file_operations = {
 .compat_ioctl = btrfs_compat_ioctl,
 #endif
 .remap_file_range = btrfs_remap_file_range,
+ .uring_cmd = btrfs_uring_cmd,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
 };
 
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
 {
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
 int ret;
 
 /*
@@ -3870,10 +3799,9 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
 * know better and pull this out at some point in the future, it is
 * right and you are wrong.
 */
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ ret = filemap_fdatawrite_range(mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
+ ret = filemap_fdatawrite_range(mapping, start, end);
 return ret;
 }
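
The paths reworked above are easiest to sanity-check from user space. The following minimal demos drive the main code paths this patch touches; the mount point /mnt/btrfs and the file names are assumptions, and error handling is trimmed to the essentials.

The write-fault path (btrfs_page_mkwrite() above) is driven entirely by ordinary shared-mmap writes: the first store to a clean mapped page faults, the kernel calls ->page_mkwrite, and the handler reserves delalloc space before locking the folio, waits out any overlapping ordered extent, then dirties the folio and returns VM_FAULT_LOCKED. A minimal sketch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 4096;	/* one page; btrfs may back it with a larger folio */
	int fd = open("/mnt/btrfs/mkwrite-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || ftruncate(fd, len) != 0) {
		perror("setup");
		return 1;
	}

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * First store to the clean page: the resulting write fault goes
	 * through ->page_mkwrite (btrfs_page_mkwrite() in the diff), which
	 * reserves delalloc space, waits for overlapping ordered extents
	 * and marks the folio dirty before returning VM_FAULT_LOCKED.
	 */
	memset(p, 'x', len);

	if (msync(p, len, MS_SYNC) != 0)	/* force writeback of the dirtied page */
		perror("msync");

	munmap(p, len);
	close(fd);
	return 0;
}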
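btrfs_punch_hole() above implements the FALLOC_FL_PUNCH_HOLE half of btrfs_fallocate(). Note the reworked btrfs_truncate_block() calls, which now take the last byte of the block plus the original range (orig_start/orig_end): partial blocks at the edges of an unaligned range are zeroed in place, and only whole blocks in between are actually dropped. A sketch of the triggering call, using a block-aligned range:

#define _GNU_SOURCE		/* for fallocate() and the FALLOC_FL_* flags */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/punch-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || ftruncate(fd, 1 << 20) != 0) {
		perror("setup");
		return 1;
	}

	/*
	 * Punch a hole over [4096, 4096 + 65536). PUNCH_HOLE must be
	 * combined with KEEP_SIZE: i_size is left untouched and the
	 * blocks inside the range are released. An unaligned head or
	 * tail would instead be zeroed via btrfs_truncate_block().
	 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 65536) != 0)
		perror("fallocate");

	close(fd);
	return 0;
}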
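btrfs_zero_range() above handles FALLOC_FL_ZERO_RANGE. For an unaligned start or end it first classifies the boundary block (RANGE_BOUNDARY_HOLE / _PREALLOC_EXTENT / _WRITTEN_EXTENT) and only falls back to in-place zeroing through btrfs_truncate_block() for written extents; the aligned middle becomes a preallocated extent that reads back as zeroes. A sketch with a deliberately unaligned range to exercise the boundary logic:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/zero-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || ftruncate(fd, 1 << 20) != 0) {
		perror("setup");
		return 1;
	}

	/*
	 * Zero an unaligned range. The head/tail blocks shared with data
	 * outside the range go through btrfs_truncate_block(); the aligned
	 * middle is turned into a preallocated (unwritten) extent.
	 */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 100, 50000) != 0)
		perror("fallocate");

	close(fd);
	return 0;
}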
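The main loop of btrfs_fallocate() above (mode 0, plain preallocation) walks the extent maps, queues every hole or beyond-EOF range on reserve_list, reserves qgroup and data space for the total, and then preallocates each queued range with btrfs_prealloc_file_range(). The simplest trigger from user space:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/prealloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Mode 0: allocate 16 MiB of unwritten extents and extend i_size.
	 * Later data writes into the range should not hit ENOSPC for the
	 * data itself (metadata is a separate matter on btrfs).
	 */
	if (fallocate(fd, 0, 0, 16 << 20) != 0)
		perror("fallocate");

	close(fd);
	return 0;
}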
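find_desired_extent() above backs lseek(SEEK_DATA/SEEK_HOLE), and find_delalloc_subrange() is what lets it report not-yet-flushed delalloc as data; the new owner_task handshake around file->private_data only makes the cached extent state safe when several tasks share one struct file. A single-task sketch:

#define _GNU_SOURCE		/* for SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);
	char buf[4096];

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(buf, 'x', sizeof(buf));

	/* Layout: a 1 MiB hole, then 4 KiB of data. */
	if (pwrite(fd, buf, sizeof(buf), 1 << 20) != (ssize_t)sizeof(buf)) {
		perror("pwrite");
		return 1;
	}

	/*
	 * Even before writeback, the delalloc range from pwrite() counts
	 * as data, courtesy of find_delalloc_subrange().
	 */
	off_t data = lseek(fd, 0, SEEK_DATA);	 /* first data byte at/after 0 */
	off_t hole = lseek(fd, data, SEEK_HOLE); /* next hole at/after the data */

	/* Expect roughly 1 MiB and 1 MiB + 4 KiB (block-rounded). */
	printf("data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);

	close(fd);
	return 0;
}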
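Finally, this patch removes btrfs_direct_read() and its check_direct_read() helper (which, among other things, rejected iovecs aliasing the same base address) from this file, while btrfs_file_open() keeps advertising FMODE_CAN_ODIRECT, so direct I/O itself stays available, presumably served from elsewhere in the same series. The user-side contract is unchanged: O_DIRECT wants a sector-aligned offset, length and buffer. A sketch assuming 4096-byte alignment suffices:

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const size_t align = 4096;	/* assumed >= the fs sector size */
	void *buf = NULL;
	ssize_t n;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file on btrfs>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* O_DIRECT needs an aligned buffer as well as aligned offset/length. */
	if (posix_memalign(&buf, align, align) != 0) {
		fprintf(stderr, "posix_memalign failed\n");
		return 1;
	}

	n = pread(fd, buf, align, 0);	/* bypasses the page cache */
	if (n < 0)
		perror("pread");
	else
		printf("read %zd bytes directly\n", n);

	free(buf);
	close(fd);
	return 0;
}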