diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 4436 |
1 files changed, 1921 insertions, 2515 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7fed887e700c..c0c778243bf1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,7 @@ #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" @@ -70,31 +70,13 @@ #include "orphan.h" #include "backref.h" #include "raid-stripe-tree.h" +#include "fiemap.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; -struct btrfs_dio_data { - ssize_t submitted; - struct extent_changeset *data_reserved; - struct btrfs_ordered_extent *ordered; - bool data_space_reserved; - bool nocow_done; -}; - -struct btrfs_dio_private { - /* Range of I/O */ - u64 file_offset; - u32 bytes; - - /* This must be last */ - struct btrfs_bio bbio; -}; - -static struct bio_set btrfs_dio_bioset; - struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; @@ -134,14 +116,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) @@ -254,7 +231,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, btrfs_ino(inode), file_off, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); @@ -264,7 +241,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -331,15 +308,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -347,7 +324,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -416,37 +393,16 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct page *locked_page, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; - struct page *page; - - if (locked_page) { - page_start = page_offset(locked_page); - page_end = page_start + PAGE_SIZE - 1; - } + struct folio *folio; while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. - */ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } - page = find_get_page(inode->vfs_inode.i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; - if (!page) + if (IS_ERR(folio)) continue; /* @@ -454,25 +410,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_folio_clamp_clear_ordered(inode->root->fs_info, - page_folio(page), offset, bytes); - put_page(page); - } - - if (locked_page) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; - } + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, + offset, bytes); + folio_put(folio); } return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); @@ -512,12 +452,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; + const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; @@ -525,10 +465,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; u64 i_size; - ASSERT((compressed_size > 0 && compressed_pages) || - (compressed_size == 0 && !compressed_pages)); + /* + * The decompressed size must still be no larger than a sector. Under + * heavy race, we can have size == 0 passed in, but that shouldn't be a + * big deal and we can continue the insertion. + */ + ASSERT(size <= sectorsize); + + /* + * The compressed size also needs to be no larger than a sector. + * That's also why we only need one page as the parameter. + */ + if (compressed_folio) + ASSERT(compressed_size <= sectorsize); + else + ASSERT(compressed_size == 0); - if (compressed_size && compressed_pages) + if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { @@ -536,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t datasize; key.objectid = btrfs_ino(inode); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -556,32 +509,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_SIZE); - - kaddr = kmap_local_page(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_local(kaddr); + kaddr = kmap_local_folio(compressed_folio, 0); + write_extent_buffer(leaf, kaddr, ptr, compressed_size); + kunmap_local(kaddr); - i++; - ptr += cur_size; - compressed_size -= cur_size; - } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -611,17 +555,53 @@ fail: return ret; } +static bool can_cow_file_range_inline(struct btrfs_inode *inode, + u64 offset, u64 size, + size_t compressed_size) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 data_len = (compressed_size ?: size); + + /* Inline extents must start at offset 0. */ + if (offset != 0) + return false; + + /* Inline extents are limited to sectorsize. */ + if (size > fs_info->sectorsize) + return false; + + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + + /* We cannot exceed the maximum inline data size. */ + if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + return false; + + /* We cannot exceed the user specified max_inline size. */ + if (data_len > fs_info->max_inline) + return false; + + /* Inline extents must be the entirety of the file. */ + if (size < i_size_read(&inode->vfs_inode)) + return false; + + return true; +} /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. + * + * If being used directly, you must have already checked we're allowed to cow + * the range by getting true from can_cow_file_range_inline(). */ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, - size_t compressed_size, - int compress_type, - struct page **compressed_pages, - bool update_i_size) +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, + u64 size, size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; @@ -631,18 +611,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, int ret; struct btrfs_path *path; - /* - * We can create an inline extent if it ends at or beyond the current - * i_size, is no larger than a sector (decompressed), and the (possibly - * compressed) data fits in a leaf and the configured maximum inline - * size. - */ - if (size < i_size_read(&inode->vfs_inode) || - size > fs_info->sectorsize || - data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - data_len > fs_info->max_inline) - return 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -668,7 +636,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_pages, update_i_size); + compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -695,25 +663,74 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; } +static noinline int cow_file_range_inline(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 offset, u64 end, + size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) +{ + struct extent_state *cached = NULL; + unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; + u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); + int ret; + + if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) + return 1; + + btrfs_lock_extent(&inode->io_tree, offset, end, &cached); + ret = __cow_file_range_inline(inode, size, compressed_size, + compress_type, compressed_folio, + update_i_size); + if (ret > 0) { + btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); + return ret; + } + + /* + * In the successful case (ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in extent_writepage(), ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. + * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. + */ + if (ret == 0) + locked_folio = NULL; + + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + return ret; +} + struct async_extent { u64 start; u64 ram_size; u64 compressed_size; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; int compress_type; struct list_head list; }; struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -731,8 +748,8 @@ struct async_cow { static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, - struct page **pages, - unsigned long nr_pages, + struct folio **folios, + unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; @@ -743,8 +760,8 @@ static noinline int add_async_extent(struct async_chunk *cow, async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; + async_extent->folios = folios; + async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; @@ -760,42 +777,9 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(inode)); + DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); return 0; } - /* - * Special check for subpage. - * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: - * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. - * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. - * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. - * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. - * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. - */ - if (fs_info->sectorsize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(end + 1)) - return 0; - } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -809,7 +793,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) - return btrfs_compress_heuristic(&inode->vfs_inode, start, end); + return btrfs_compress_heuristic(inode, start, end); return 0; } @@ -819,7 +803,28 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); +} + +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) +{ + unsigned long end_index = end >> PAGE_SHIFT; + struct folio *folio; + int ret = 0; + + for (unsigned long index = start >> PAGE_SHIFT; + index <= end_index; index++) { + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); + if (IS_ERR(folio)) { + if (!ret) + ret = PTR_ERR(folio); + continue; + } + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, + end + 1 - start); + folio_put(folio); + } + return ret; } /* @@ -848,13 +853,14 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -863,7 +869,16 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); + + /* + * All the folios should have been locked thus no failure. + * + * And even if some folios are missing, btrfs_compress_folios() + * would handle them correctly, so here just do an ASSERT() check for + * early logic errors. + */ + ASSERT(ret == 0); /* * We need to save i_size before now because it could change in between @@ -879,9 +894,9 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - pages = NULL; - nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + folios = NULL; + nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -906,17 +921,6 @@ again: (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -930,8 +934,8 @@ again: if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. @@ -939,15 +943,17 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (inode->defrag_compress) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), - mapping, start, pages, &nr_pages, &total_in, - &total_compressed); + ret = btrfs_compress_folios(compress_type, compress_level, + mapping, start, folios, &nr_folios, &total_in, + &total_compressed); if (ret) goto mark_incompressible; @@ -957,7 +963,7 @@ again: */ poff = offset_in_page(total_compressed); if (poff) - memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. @@ -968,43 +974,16 @@ again: * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - if (total_in < actual_end) { - ret = cow_file_range_inline(inode, actual_end, 0, - BTRFS_COMPRESS_NONE, NULL, - false); - } else { - ret = cow_file_range_inline(inode, actual_end, - total_compressed, - compress_type, pages, - false); - } - if (ret <= 0) { - unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - - if (ret < 0) - mapping_set_error(mapping, -EIO); - - /* - * inline extent creation worked or returned error, - * we don't need to create any more async work items. - * Unlock and free up our temp pages. - * - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be done _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - NULL, - clear_flags, - PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - goto free_pages; - } + if (total_in < actual_end) + ret = cow_file_range_inline(inode, NULL, start, end, 0, + BTRFS_COMPRESS_NONE, NULL, false); + else + ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, + compress_type, folios[0], false); + if (ret <= 0) { + if (ret < 0) + mapping_set_error(mapping, -EIO); + goto free_pages; } /* @@ -1026,8 +1005,8 @@ again: * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ - ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, + nr_folios, compress_type); BUG_ON(ret); if (start + total_in < end) { start += total_in; @@ -1044,12 +1023,12 @@ cleanup_and_bail_uncompressed: BTRFS_COMPRESS_NONE); BUG_ON(ret); free_pages: - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - btrfs_free_compr_page(pages[i]); + if (folios) { + for (i = 0; i < nr_folios; i++) { + WARN_ON(folios[i]->mapping); + btrfs_free_compr_folio(folios[i]); } - kfree(pages); + kfree(folios); } } @@ -1057,21 +1036,21 @@ static void free_async_extent_pages(struct async_extent *async_extent) { int i; - if (!async_extent->pages) + if (!async_extent->folios) return; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - btrfs_free_compr_page(async_extent->pages[i]); + for (i = 0; i < async_extent->nr_folios; i++) { + WARN_ON(async_extent->folios[i]->mapping); + btrfs_free_compr_folio(async_extent->folios[i]); } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + kfree(async_extent->folios); + async_extent->nr_folios = 0; + async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1084,21 +1063,17 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, locked_folio, start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); - - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); - } + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1111,10 +1086,13 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; + struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1122,20 +1100,23 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. */ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } - lock_extent(io_tree, start, end, NULL); if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); + submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1150,34 +1131,30 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } + btrfs_lock_extent(io_tree, start, end, &cached); + /* Here we're doing allocation and writeback of the compressed pages */ - em = create_io_em(inode, start, - async_extent->ram_size, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.ram_bytes = async_extent->ram_size; + file_extent.num_bytes = async_extent->ram_size; + file_extent.offset = 0; + file_extent.compression = async_extent->compress_type; + + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - async_extent->ram_size, /* ram_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* disk_num_bytes */ - 0, /* offset */ - 1 << BTRFS_ORDERED_COMPRESSED, - async_extent->compress_type); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1187,25 +1164,28 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, /* Clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, - async_extent->pages, /* compressed_pages */ - async_extent->nr_pages, + async_extent->folios, /* compressed_folios */ + async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + NULL, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | @@ -1215,36 +1195,36 @@ out_free_reserve: kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - root->root_key.objectid, btrfs_ino(inode), start, + btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } -static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, - u64 num_bytes) +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; read_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, start, num_bytes); + em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + btrfs_free_extent_map(em); + em = btrfs_search_extent_mapping(em_tree, 0, 0); + if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + alloc_hint = btrfs_extent_map_block_start(em); if (em) - free_extent_map(em); + btrfs_free_extent_map(em); } else { - alloc_hint = em->block_start; - free_extent_map(em); + alloc_hint = btrfs_extent_map_block_start(em); + btrfs_free_extent_map(em); } } read_unlock(&em_tree->lock); @@ -1258,39 +1238,36 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. So, the caller must clean them up by calling - * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for - * example. + * while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_state *cached = NULL; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1298,7 +1275,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1312,57 +1288,36 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { - u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), - end + 1); - + if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, actual_end, 0, + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); - if (ret == 0) { - /* - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be run _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + if (ret <= 0) { /* - * locked_page is locked by the caller of - * writepage_delalloc(), not locked by - * __process_pages_contig(). - * - * We can't let __process_pages_contig() to unlock it, - * as it doesn't have any subpage::writers recorded. + * We succeeded, return 1 so the caller knows we're done + * with this page and already handled the IO. * - * Here we manually unlock the page, since the caller - * can't determine if it's an inline extent or a - * compressed extent. + * If there was an error then cow_file_range_inline() has + * already done the cleanup. */ - unlock_page(locked_page); - ret = 1; + if (ret == 0) + ret = 1; goto done; - } else if (ret < 0) { - goto out_unlock; } } - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); + + /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; /* * Relocation relies on the relocated extents to have exactly the same @@ -1382,9 +1337,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1407,36 +1362,49 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - - ram_size = ins.offset; - em = create_io_em(inode, start, ins.offset, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - ram_size, /* ram_bytes */ - BTRFS_COMPRESS_NONE, /* compress_type */ - BTRFS_ORDERED_REGULAR /* type */); + + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = ins.offset; + file_extent.ram_bytes = ins.offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ + btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); + + em = btrfs_create_io_em(inode, start, &file_extent, + BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, - ram_size, ins.objectid, cur_alloc_size, - 0, 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1457,35 +1425,20 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered (Private2) bit so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1495,16 +1448,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1515,29 +1470,31 @@ out_unlock: * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents - * for this region. They are cleaned up by - * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * for this region, thus we need to cleanup those ordered extents. + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the ordered extents cleanup. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { - if (!locked_page) + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); + + btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, @@ -1548,13 +1505,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, - clear_bits, + locked_folio, &cached, clear_bits, page_ops); - start += cur_alloc_size; + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } /* @@ -1563,11 +1519,18 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); - } + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, + &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); + } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1589,10 +1552,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ u64 alloc_hint = 0; if (do_free) { - struct async_chunk *async_chunk; struct async_cow *async_cow; - async_chunk = container_of(work, struct async_chunk, work); btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1607,8 +1568,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); + async_extent = list_first_entry(&async_chunk->extents, + struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } @@ -1620,7 +1581,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1639,7 +1600,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, if (!ctx) return false; - unlock_extent(&inode->io_tree, start, end, NULL); set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; @@ -1661,15 +1621,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to @@ -1679,12 +1640,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_folio = locked_folio; + locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { @@ -1713,7 +1674,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1721,48 +1682,27 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, locked_folio, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } -static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes, bool nowait) -{ - struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); - struct btrfs_ordered_sum *sums; - int ret; - LIST_HEAD(list); - - ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, - &list, 0, nowait); - if (ret == 0 && list_empty(&list)) - return 0; - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); - } - if (ret < 0) - return ret; - return 1; -} - -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_state *cached_state = NULL; u64 range_start = start; u64 count; int ret; @@ -1799,8 +1739,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ - count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0, NULL); + btrfs_lock_extent(io_tree, start, end, &cached_state); + count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1810,20 +1751,21 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) - clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - NULL); + btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE); } + btrfs_unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -1836,20 +1778,17 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. */ bool free_path; - /* Output fields. Only set when can_nocow_file_extent() returns 1. */ - - u64 disk_bytenr; - u64 disk_num_bytes; - u64 extent_offset; - /* Number of bytes that can be written to in NOCOW mode. */ - u64 num_bytes; + /* + * Output fields. Only set when can_nocow_file_extent() returns 1. + * The expected file extent for the NOCOW write. + */ + struct btrfs_file_extent file_extent; }; /* @@ -1870,6 +1809,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; + struct btrfs_root *csum_root; + u64 io_start; u64 extent_end; u8 extent_type; int can_nocow = 0; @@ -1882,11 +1823,6 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; - /* Can't access these fields unless we know it's not an inline extent. */ - args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - args->extent_offset = btrfs_file_extent_offset(leaf, fi); - if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; @@ -1896,13 +1832,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; /* An explicit hole, must COW. */ - if (args->disk_bytenr == 0) + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ @@ -1913,6 +1848,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, extent_end = btrfs_file_extent_end(path); + args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); + args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); + /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid @@ -1920,9 +1861,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->extent_offset, - args->disk_bytenr, args->strict, path); + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1930,7 +1870,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (args->free_path) { /* * We don't need the path anymore, plus through the - * csum_exist_in_range() call below we will end up allocating + * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. */ @@ -1943,16 +1883,19 @@ static int can_nocow_file_extent(struct btrfs_path *path, atomic_read(&root->snapshot_force_cow)) goto out; - args->disk_bytenr += args->extent_offset; - args->disk_bytenr += args->start - key->offset; - args->num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.offset += args->start - key->offset; + io_start = args->file_extent.disk_bytenr + args->file_extent.offset; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ - ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, - nowait); + + csum_root = btrfs_csum_root(root->fs_info, io_start); + ret = btrfs_lookup_csums_list(csum_root, io_start, + io_start + args->file_extent.num_bytes - 1, + NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1966,6 +1909,112 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + btrfs_free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. + */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + /* + * On error, we need to cleanup the ordered extents we created. + * + * We do not clear the folio Dirty flags because they are set and + * cleaered by the caller. + */ + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, file_pos, end); + return ret; +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -1973,13 +2022,18 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. + */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2002,17 +2056,14 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.end = end; nocow_args.writeback_path = true; - while (1) { + while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct extent_state *cached_state = NULL; u64 extent_end; - u64 ram_bytes; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2067,12 +2118,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -2088,7 +2140,6 @@ next_slot: ret = -EUCLEAN; goto error; } - ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = btrfs_file_extent_end(path); /* @@ -2108,7 +2159,9 @@ next_slot: goto must_cow; ret = 0; - nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + nocow_bg = btrfs_inc_nocow_writers(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset); if (!nocow_bg) { must_cow: /* @@ -2135,79 +2188,23 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1); - cow_start = (u64)-1; + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } + cow_start = (u64)-1; } - nocow_end = cur_offset + nocow_args.num_bytes - 1; - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - u64 orig_start = found_key.offset - nocow_args.extent_offset; - struct extent_map *em; - - em = create_io_em(inode, cur_offset, nocow_args.num_bytes, - orig_start, - nocow_args.disk_bytenr, /* block_start */ - nocow_args.num_bytes, /* block_len */ - nocow_args.disk_num_bytes, /* orig_block_len */ - ram_bytes, BTRFS_COMPRESS_NONE, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, nocow_args.num_bytes, - nocow_args.disk_bytenr, nocow_args.num_bytes, 0, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW), - BTRFS_COMPRESS_NONE); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; - if (cur_offset > end) - break; } btrfs_release_path(path); @@ -2215,11 +2212,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); - cow_start = (u64)-1; - if (ret) + ret = fallback_to_cow(inode, locked_folio, cow_start, end); + if (ret) { + cow_end = end; goto error; + } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2227,20 +2225,81 @@ must_cow: error: /* - * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * In this case, cow_start should be (u64)-1. + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Need to clear the dirty flags and finish the ordered extents. + * + * 2) Failed with error before calling fallback_to_cow() + * + * start cow_start end + * |/////////////| | + * + * In this case, only @cow_start is set, @cur_offset is between + * [cow_start, end) + * + * It's mostly the same as case 1), just replace @cur_offset with + * @cow_start. + * + * 3) Failed with error from fallback_to_cow() + * + * start cow_start cow_end end + * |/////////////|-----------| | + * + * In this case, both @cow_start and @cow_end is set. + * + * For range [start, cow_start) it's the same as case 1). + * But for range [cow_start, cow_end), all the cleanup is handled by + * cow_file_range(), we should not touch anything in that range. + * + * So for all above cases, if @cow_start is set, cleanup ordered extents + * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). */ if (cow_start != (u64)-1) cur_offset = cow_start; - if (cur_offset < end) + + if (cur_offset > start) { + btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + } + + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. + */ + if (cow_end) + cur_offset = cow_end + 1; + + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. + */ + if (cur_offset < end) { + struct extent_state *cached = NULL; + + btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DEFRAG | + locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2248,7 +2307,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) + btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2259,40 +2318,35 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); - goto out; + ret = run_delalloc_nocow(inode, locked_folio, start, end); + return ret; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); - -out: - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, - end - start + 1); return ret; } @@ -2542,7 +2596,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -2575,44 +2629,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) -{ - u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_ordered_extent *new; - int ret; - - /* Must always be called for the beginning of an ordered extent. */ - if (WARN_ON_ONCE(start != ordered->disk_bytenr)) - return -EINVAL; - - /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) { - refcount_inc(&ordered->refs); - bbio->ordered = ordered; - return 0; - } - - /* - * Don't split the extent_map for NOCOW extents, as we're writing into - * a pre-existing one. - */ - if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); - if (ret) - return ret; - } - - new = btrfs_split_ordered_extent(ordered, len); - if (IS_ERR(new)) - return PTR_ERR(new); - bbio->ordered = new; - return 0; -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. @@ -2655,7 +2671,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start != EXTENT_MAP_HOLE) + if (em->disk_bytenr != EXTENT_MAP_HOLE) goto next; em_len = em->len; @@ -2664,12 +2680,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (em_len > search_len) em_len = search_len; - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state); + ret = btrfs_set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, cached_state); next: - search_start = extent_map_end(em); - free_extent_map(em); + search_start = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); if (ret) return ret; } @@ -2699,13 +2715,13 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC | extra_bits, cached_state); + return btrfs_set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2717,50 +2733,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2768,23 +2785,23 @@ again: } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) goto out_page; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); - unlock_page(page); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2802,28 +2819,28 @@ again: * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); - } - btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); + } + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* @@ -2836,33 +2853,49 @@ out_page: /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. */ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * For experimental build, we error out instead of EAGAIN. * - * The extent_io writepage code will redirty the page if we send back + * We should not hit such out-of-band dirty folios anymore. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { + DEBUG_WARN(); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + BTRFS_I(inode)->root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio)); + return -EUCLEAN; + } + + /* + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. + * + * The extent_io writepage code will redirty the foio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2872,14 +2905,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * folio->mapping outside of the folio lock. */ ihold(inode); - btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); - get_page(page); + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = page; + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -2894,7 +2927,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); @@ -2929,8 +2962,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2943,7 +2976,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -2966,8 +2998,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -2977,8 +3009,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, file_pos - offset, qgroup_reserved, &ins); out: - btrfs_free_path(path); - return ret; } @@ -3011,10 +3041,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) num_bytes = oe->truncated_len; - ram_bytes = num_bytes; - } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3030,7 +3058,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); - return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + return insert_reserved_file_extent(trans, oe->inode, oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } @@ -3042,7 +3070,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { - struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; @@ -3087,29 +3115,21 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - goto out; + /* + * If it's a COW write we need to lock the extent range as we will be + * inserting/replacing file extent items and unpinning an extent map. + * This must be taken before joining a transaction, as it's a higher + * level lock (like the inode's VFS lock), otherwise we can run into an + * ABBA deadlock with other tasks (transactions work like a lock, + * depending on their current state). + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; + btrfs_lock_extent_bits(io_tree, start, end, + EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, + &cached_state); } - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3123,8 +3143,28 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } goto out; + } if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -3151,8 +3191,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3171,9 +3211,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) - clear_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); @@ -3181,17 +3221,14 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_abort_transaction(trans, ret); goto out; } - ret = 0; out: - clear_extent_bit(&inode->io_tree, start, end, clear_bits, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 unwritten_start = start; - /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered @@ -3200,13 +3237,8 @@ out: * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ - if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, - &ordered_extent->flags)) - mapping_set_error(ordered_extent->inode->i_mapping, -EIO); - - if (truncated) - unwritten_start += logical_len; - clear_extent_uptodate(io_tree, unwritten_start, end, NULL); + if (ret) + btrfs_mark_ordered_extent_error(ordered_extent); /* * Drop extent maps for the part of the extent we didn't write. @@ -3222,9 +3254,15 @@ out: * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ - if (!btrfs_is_free_space_inode(inode)) + if (!btrfs_is_free_space_inode(inode)) { + u64 unwritten_start = start; + + if (truncated) + unwritten_start += logical_len; + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + } /* * If the ordered extent had an IOERR or something else went @@ -3251,12 +3289,12 @@ out: NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, 1); + ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. */ - btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } @@ -3278,7 +3316,7 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) && + if (btrfs_is_zoned(ordered->inode->root->fs_info) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); @@ -3288,20 +3326,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. + * + * @kaddr must be a properly kmapped address. */ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected) +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; - - ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; - - kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; @@ -3330,6 +3364,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; + void *kaddr; ASSERT(bv->bv_len == fs_info->sectorsize); @@ -3337,19 +3372,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; if (btrfs_is_data_reloc_root(inode->root) && - test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - NULL)) { + btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + NULL)) { /* Skip the range without csum for data reloc inode */ - clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, - csum_expected)) + kaddr = bvec_kmap_local(bv); + if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { + kunmap_local(kaddr); goto zeroit; + } + kunmap_local(kaddr); return true; zeroit: @@ -3379,6 +3417,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3495,11 +3534,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3518,6 +3556,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3572,7 +3612,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, last_objectid, root); + inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; @@ -3641,10 +3681,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3667,7 +3707,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3684,19 +3724,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); - btrfs_free_path(path); return ret; } /* - * very simple check to peek ahead in the leaf looking for xattrs. If we - * don't find any xattrs, we know there can't be any acls. + * Look ahead in the leaf for xattrs. If we don't find any then we know there + * can't be any ACLs. * - * slot is the slot the inode is in, objectid is the objectid of the inode + * @leaf: the eb leaf where to search + * @slot: the slot the inode is in + * @objectid: the objectid of the inode + * + * Return true if there is xattr/ACL, false otherwise. */ -static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid, - int *first_xattr_slot) +static noinline bool acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3716,58 +3759,120 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* we found a different objectid, there must not be acls */ + /* We found a different objectid, there must be no ACLs. */ if (found_key.objectid != objectid) - return 0; + return false; - /* we found an xattr, assume we've got an acl */ + /* We found an xattr, assume we've got an ACL. */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) - return 1; + return true; } /* - * we found a key greater than an xattr key, there can't - * be any acls later on + * We found a key greater than an xattr key, there can't be any + * ACLs later on. */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) - return 0; + return false; slot++; scanned++; /* - * it goes inode, inode backrefs, xattrs, extents, - * so if there are a ton of hard links to an inode there can - * be a lot of backrefs. Don't waste time searching too hard, - * this is just an optimization + * The item order goes like: + * - inode + * - inode backrefs + * - xattrs + * - extents, + * + * so if there are lots of hard links to an inode there can be + * a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization. */ if (scanned >= 8) break; } - /* we hit the end of the leaf before we found an xattr or - * something larger than an xattr. We have to assume the inode - * has acls + /* + * We hit the end of the leaf before we found an xattr or something + * larger than an xattr. We have to assume the inode has ACLs. */ if (*first_xattr_slot == -1) *first_xattr_slot = slot; - return 1; + return true; +} + +static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (WARN_ON_ONCE(inode->file_extent_tree)) + return 0; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + if (!S_ISREG(inode->vfs_inode.i_mode)) + return 0; + if (btrfs_is_free_space_inode(inode)) + return 0; + + inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!inode->file_extent_tree) + return -ENOMEM; + + btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. */ + lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); + + return 0; +} + +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; } /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3776,23 +3881,27 @@ static int btrfs_read_locked_inode(struct inode *inode, bool filled = false; int first_xattr_slot; + ret = btrfs_init_file_extent_tree(inode); + if (ret) + goto out; + ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } + ASSERT(path); - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. + */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3802,39 +3911,41 @@ static int btrfs_read_locked_inode(struct inode *inode, inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); cache_index: /* @@ -3846,9 +3957,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3877,7 +3987,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3885,15 +3995,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3901,13 +4011,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -3915,45 +4024,50 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = &btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(inode, true); + if (ret) + goto out; + return 0; +out: + iget_failed(vfs_inode); + return ret; } /* @@ -4012,19 +4126,21 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; + struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); + btrfs_get_inode_key(inode, &key); + ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); if (ret) { if (ret > 0) ret = -ENOENT; - goto failed; + return ret; } leaf = path->nodes[0]; @@ -4032,12 +4148,8 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); - ret = 0; -failed: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -4182,6 +4294,7 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode(trans, dir); @@ -4282,9 +4395,9 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { - objectid = inode->root->root_key.objectid; + objectid = btrfs_root_id(inode->root); } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - objectid = inode->location.objectid; + objectid = inode->ref_root_id; } else { WARN_ON(1); fscrypt_free_filename(&fname); @@ -4325,11 +4438,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); - if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + if (IS_ERR(di)) { + ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } @@ -4340,7 +4450,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, - root->root_key.objectid, dir_ino, + btrfs_root_id(root), dir_ino, &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4373,7 +4483,7 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); @@ -4390,42 +4500,39 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.objectid == root->root_key.objectid) { + if (key.objectid == btrfs_root_id(root)) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); - goto out; + return ret; } btrfs_release_path(path); } - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = 0; if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) + if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } -out: - btrfs_free_path(path); + return ret; } @@ -4433,64 +4540,26 @@ out: static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; + struct btrfs_inode *inode; + u64 min_ino = 0; if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = btrfs_find_first_inode(root, min_ino); + while (inode) { + if (atomic_read(&inode->vfs_inode.i_count) > 1) + d_prune_aliases(&inode->vfs_inode); - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(entry) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from the inode - * cache when its usage count hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); + min_ino = btrfs_ino(inode) + 1; + /* + * btrfs_drop_inode() will have it removed from the inode + * cache when its usage count hits zero. + */ + iput(&inode->vfs_inode); + cond_resched(); + inode = btrfs_find_first_inode(root, min_ino); } - spin_unlock(&root->inode_lock); } int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) @@ -4517,7 +4586,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", - dest->root_key.objectid); + btrfs_root_id(dest)); ret = -EPERM; goto out_up_write; } @@ -4525,7 +4594,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_up_write; } @@ -4554,11 +4623,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = PTR_ERR(trans); goto out_release; } - ret = btrfs_record_root_in_trans(trans, root); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_end_trans; - } btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->block_rsv = &block_rsv; @@ -4586,7 +4650,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4594,8 +4658,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - dest->root_key.objectid); + BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4604,7 +4667,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4645,7 +4708,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - int err = 0; + int ret = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; @@ -4661,33 +4724,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } - err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); - if (err) - return err; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_notrans; } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } - err = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) goto out; last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!err) { + if (!ret) { btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to @@ -4709,23 +4772,83 @@ out_notrans: btrfs_btree_balance_dirty(fs_info); fscrypt_free_filename(&fname); - return err; + return ret; +} + +static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize) +{ + ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u", + blockstart, blocksize); + + if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1) + return true; + return false; +} + +static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) +{ + const pgoff_t index = (start >> PAGE_SHIFT); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct folio *folio; + u64 zero_start; + u64 zero_end; + int ret = 0; + +again: + folio = filemap_lock_folio(mapping, index); + /* No folio present. */ + if (IS_ERR(folio)) + return 0; + + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto out_unlock; + } + } + folio_wait_writeback(folio); + + /* + * We do not need to lock extents nor wait for OE, as it's already + * beyond EOF. + */ + + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = folio_pos(folio) + folio_size(folio) - 1; + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + +out_unlock: + folio_unlock(folio); + folio_put(folio); + return ret; } /* - * Read, zero a chunk and write a block. + * Handle the truncation of a fs block. * - * @inode - inode that we're zeroing - * @from - the offset to start zeroing - * @len - the length to zero, 0 to zero the entire range respective to the - * offset - * @front - zero up to the offset instead of from the offset on + * @inode - inode that we're zeroing + * @offset - the file offset of the block to truncate + * The value must be inside [@start, @end], and the function will do + * extra checks if the block that covers @offset needs to be zeroed. + * @start - the start file offset of the range we want to zero + * @end - the end (inclusive) file offset of the range we want to zero. * - * This will find the block for the "from" offset and cow the block and zero the - * part we want to zero. This is used with truncate and hole punching. + * If the range is not block aligned, read out the folio that covers @offset, + * and if needed zero blocks that are inside the folio and covered by [@start, @end). + * If @start or @end + 1 lands inside a block, that block will be marked dirty + * for writeback. + * + * This is utilized by hole punch, zero range, file expansion. */ -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -4735,20 +4858,56 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (blocksize - 1); + pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; + const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), + blocksize); + const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize), + blocksize); + bool need_truncate_head = false; + bool need_truncate_tail = false; + u64 zero_start; + u64 zero_end; u64 block_start; u64 block_end; - if (IS_ALIGNED(offset, blocksize) && - (!len || IS_ALIGNED(len, blocksize))) + /* @offset should be inside the range. */ + ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu", + offset, start, end); + + /* The range is aligned at both ends. */ + if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { + /* + * For block size < page size case, we may have polluted blocks + * beyond EOF. So we also need to zero them out. + */ + if (end == (u64)-1 && blocksize < PAGE_SIZE) + ret = truncate_block_zero_beyond_eof(inode, start); goto out; + } - block_start = round_down(from, blocksize); + /* + * @offset may not be inside the head nor tail block. In that case we + * don't need to do anything. + */ + if (!in_head_block && !in_tail_block) + goto out; + + /* + * Skip the truncatioin if the range in the target block is already aligned. + * The seemingly complex check will also handle the same block case. + */ + if (in_head_block && !IS_ALIGNED(start, blocksize)) + need_truncate_head = true; + if (in_tail_block && !IS_ALIGNED(end + 1, blocksize)) + need_truncate_tail = true; + if (!need_truncate_head && !need_truncate_tail) + goto out; + + block_start = round_down(offset, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, @@ -4772,10 +4931,13 @@ again: folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out; } @@ -4805,11 +4967,11 @@ again: folio_wait_writeback(folio); - lock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -4817,37 +4979,46 @@ again: goto again; } - clear_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } - if (offset != blocksize) { - if (!len) - len = blocksize - offset; - if (front) - folio_zero_range(folio, block_start - folio_pos(folio), - offset); - else - folio_zero_range(folio, - (block_start - folio_pos(folio)) + offset, - len); + if (end == (u64)-1) { + /* + * We're truncating beyond EOF, the remaining blocks normally are + * already holes thus no need to zero again, but it's possible for + * fs block size < page size cases to have memory mapped writes + * to pollute ranges beyond EOF. + * + * In that case although such polluted blocks beyond EOF will + * not reach disk, it still affects our page caches. + */ + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, + end); + } else { + zero_start = max_t(u64, block_start, start); + zero_end = min_t(u64, block_end, end); } + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, NULL); out_unlock: if (ret) { @@ -4933,16 +5104,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err = 0; + int ret = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_block(inode, oldsize, 0, 0); - if (err) - return err; + ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); + if (ret) + return ret; if (size <= hole_start) return 0; @@ -4953,27 +5124,27 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) while (1) { em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { - err = PTR_ERR(em); + ret = PTR_ERR(em); em = NULL; break; } - last_byte = min(extent_map_end(em), block_end); + last_byte = min(btrfs_extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; - err = maybe_insert_hole(inode, cur_offset, hole_size); - if (err) + ret = maybe_insert_hole(inode, cur_offset, hole_size); + if (ret) break; - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, @@ -4983,32 +5154,30 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) } hole_em->start = cur_offset; hole_em->len = hole_size; - hole_em->orig_start = cur_offset; - hole_em->block_start = EXTENT_MAP_HOLE; - hole_em->block_len = 0; - hole_em->orig_block_len = 0; + hole_em->disk_bytenr = EXTENT_MAP_HOLE; + hole_em->disk_num_bytes = 0; hole_em->ram_bytes = hole_size; hole_em->generation = btrfs_get_fs_generation(fs_info); - err = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); + btrfs_free_extent_map(hole_em); } else { - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; } next: - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); - unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); - return err; + btrfs_free_extent_map(em); + btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); + return ret; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) @@ -5065,7 +5234,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { - ret = btrfs_wait_ordered_range(inode, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) @@ -5095,7 +5264,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. */ - err = btrfs_wait_ordered_range(inode, 0, (u64)-1); + err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); @@ -5191,7 +5360,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5205,9 +5374,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); - clear_extent_bit(io_tree, start, end, - EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, - &cached_state); + btrfs_clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5284,7 +5453,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto out; @@ -5296,7 +5465,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); goto out; } @@ -5393,7 +5562,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; @@ -5404,7 +5573,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) - goto out; + return ret; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. @@ -5433,7 +5602,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); - btrfs_free_path(path); return ret; } @@ -5448,7 +5616,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key *location, struct btrfs_root **sub_root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -5468,7 +5636,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = dir->root->root_key.objectid; + key.objectid = btrfs_root_id(dir->root); key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5504,64 +5672,28 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, location->offset = 0; err = 0; out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } -static void inode_tree_add(struct btrfs_inode *inode) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *entry; - struct rb_node **p; - struct rb_node *parent; - struct rb_node *new = &inode->rb_node; - u64 ino = btrfs_ino(inode); - - if (inode_unhashed(&inode->vfs_inode)) - return; - parent = NULL; - spin_lock(&root->inode_lock); - p = &root->inode_tree.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_inode, rb_node); - if (ino < btrfs_ino(entry)) - p = &parent->rb_left; - else if (ino > btrfs_ino(entry)) - p = &parent->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING))); - rb_replace_node(parent, new, &root->inode_tree); - RB_CLEAR_NODE(parent); - spin_unlock(&root->inode_lock); - return; - } - } - rb_link_node(new, parent, p); - rb_insert_color(new, &root->inode_tree); - spin_unlock(&root->inode_lock); -} -static void inode_tree_del(struct btrfs_inode *inode) +static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; - int empty = 0; + struct btrfs_inode *entry; + bool empty = false; - spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&inode->rb_node)) { - rb_erase(&inode->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&inode->rb_node); - empty = RB_EMPTY_ROOT(&root->inode_tree); - } - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + if (entry == inode) + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty && btrfs_root_refs(&root->root_item) == 0) { - spin_lock(&root->inode_lock); - empty = RB_EMPTY_ROOT(&root->inode_tree); - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty) btrfs_add_dead_root(root); } @@ -5572,10 +5704,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->location.objectid = args->ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; + btrfs_set_inode_number(BTRFS_I(inode), args->ino); BTRFS_I(inode)->root = btrfs_grab_root(args->root); if (args->root && args->root == args->root->fs_info->tree_root && @@ -5589,12 +5718,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == BTRFS_I(inode)->location.objectid && + return args->ino == btrfs_ino(BTRFS_I(inode)) && args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, - struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5603,87 +5731,106 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, args.ino = ino; args.root = root; - inode = iget5_locked(s, hashval, btrfs_find_actor, + inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; + int ret; - inode = btrfs_iget_locked(s, ino, root); + inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - int ret; + if (!(inode->vfs_inode.i_state & I_NEW)) + return inode; - ret = btrfs_read_locked_inode(inode, path); - if (!ret) { - inode_tree_add(BTRFS_I(inode)); - unlock_new_inode(inode); - } else { - iget_failed(inode); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode, this means the inode item - * was not found. - */ - if (ret > 0) - ret = -ENOENT; - inode = ERR_PTR(ret); - } - } + ret = btrfs_read_locked_inode(inode, path); + if (ret) + return ERR_PTR(ret); + unlock_new_inode(&inode->vfs_inode); return inode; } -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) +/* + * Get an inode object given its inode number and corresponding root. + */ +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(s, ino, root, NULL); + struct btrfs_inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->vfs_inode.i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) { + iget_failed(&inode->vfs_inode); + return ERR_PTR(-ENOMEM); + } + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(&inode->vfs_inode); + return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5697,18 +5844,18 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; - struct btrfs_key location; + struct btrfs_key location = { 0 }; u8 di_type = 0; int ret = 0; @@ -5720,20 +5867,20 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, location.objectid, root); + inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5744,23 +5891,26 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir, &location, root); } else { - inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); + inode = btrfs_iget(location.objectid, sub_root); btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -5800,7 +5950,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; @@ -5814,15 +5964,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; /* FIXME: we should be able to handle this */ if (ret == 0) - goto out; - ret = 0; + return ret; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } path->slots[0]--; @@ -5833,13 +5982,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; + + return 0; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) @@ -5942,7 +6090,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); @@ -5964,7 +6112,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - put = btrfs_readdir_get_delayed_items(inode, private->last_index, + put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index, &ins_list, &del_list); again: @@ -6037,7 +6185,7 @@ again: * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. @@ -6054,8 +6202,7 @@ nopos: ret = 0; err: if (put) - btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); - btrfs_free_path(path); + btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); return ret; } @@ -6139,7 +6286,7 @@ static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; - args.ino = BTRFS_I(inode)->location.objectid; + args.ino = btrfs_ino(BTRFS_I(inode)); args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, @@ -6233,7 +6380,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6246,7 +6393,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; - struct btrfs_key *location; struct btrfs_path *path; u64 objectid; struct btrfs_inode_ref *ref; @@ -6255,6 +6401,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_item_batch batch; unsigned long ptr; int ret; + bool xa_reserved = false; path = btrfs_alloc_path(); if (!path) @@ -6264,10 +6411,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); root = BTRFS_I(inode)->root; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + goto out; + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; - inode->i_ino = objectid; + btrfs_set_inode_number(BTRFS_I(inode), objectid); + + ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); + if (ret) + goto out; + xa_reserved = true; if (args->orphan) { /* @@ -6282,8 +6438,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) goto out; } - /* index_cnt is ignored for everything but a dir. */ - BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -6308,13 +6466,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); } - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan) @@ -6397,7 +6551,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6407,28 +6560,28 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in. */ - parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - BTRFS_I(dir)->root); + parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, - ret); + btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } /* @@ -6443,7 +6596,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(BTRFS_I(inode)); + ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); + if (WARN_ON(ret)) { + /* Shouldn't happen, we used xa_reserve() before. */ + btrfs_abort_transaction(trans, ret); + goto discard; + } trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6471,6 +6629,9 @@ discard: ihold(inode); discard_new_inode(inode); out: + if (xa_reserved) + xa_release(&root->inodes, objectid); + btrfs_free_path(path); return ret; } @@ -6501,7 +6662,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, index, name); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, @@ -6513,7 +6674,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6544,7 +6705,7 @@ fail_dir_item: u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, &local_index, name); if (err) btrfs_abort_transaction(trans, err); @@ -6642,7 +6803,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) + if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6713,26 +6874,27 @@ fail: return err; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6749,8 +6911,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + max_size = min_t(unsigned long, blocksize, max_size); + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6760,37 +6923,37 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -6812,7 +6975,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6829,26 +6992,25 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) - free_extent_map(em); - else if (em->block_start == EXTENT_MAP_INLINE && page) - free_extent_map(em); + btrfs_free_extent_map(em); + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) + btrfs_free_extent_map(em); else goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; - em->orig_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; em->len = (u64)-1; - em->block_len = (u64)-1; path = btrfs_alloc_path(); if (!path) { @@ -6938,9 +7100,8 @@ next: /* New extent overlaps with existing one */ em->start = start; - em->orig_start = start; em->len = found_key.offset - start; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; goto insert; } @@ -6964,23 +7125,22 @@ next: * * Other members are not utilized for inline extents. */ - ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; } not_found: em->start = start; - em->orig_start = start; em->len = len; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); - if (em->start > start || extent_map_end(em) <= start) { + if (em->start > start || btrfs_extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -6989,7 +7149,7 @@ insert: } write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); @@ -6997,90 +7157,12 @@ out: trace_btrfs_get_extent(root, inode, em); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } return em; } -static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - const u64 start, - const u64 len, - const u64 orig_start, - const u64 block_start, - const u64 block_len, - const u64 orig_block_len, - const u64 ram_bytes, - const int type) -{ - struct extent_map *em = NULL; - struct btrfs_ordered_extent *ordered; - - if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, block_start, - block_len, orig_block_len, ram_bytes, - BTRFS_COMPRESS_NONE, /* compress_type */ - type); - if (IS_ERR(em)) - goto out; - } - ordered = btrfs_alloc_ordered_extent(inode, start, len, len, - block_start, block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (IS_ERR(ordered)) { - if (em) { - free_extent_map(em); - btrfs_drop_extent_map_range(inode, start, - start + len - 1, false); - } - em = ERR_CAST(ordered); - } else { - ASSERT(!dio_data->ordered); - dio_data->ordered = ordered; - } - out: - - return em; -} - -static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 len) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_map *em; - struct btrfs_key ins; - u64 alloc_hint; - int ret; - - alloc_hint = get_extent_allocation_hint(inode, start, len); -again: - ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); - if (ret == -EAGAIN) { - ASSERT(btrfs_is_zoned(fs_info)); - wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - goto again; - } - if (ret) - return ERR_PTR(ret); - - em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, - ins.objectid, ins.offset, ins.offset, - ins.offset, BTRFS_ORDERED_REGULAR); - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); - - return em; -} - static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; @@ -7103,8 +7185,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write @@ -7114,17 +7194,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. */ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict) +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, + struct btrfs_file_extent *file_extent, + bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7134,806 +7214,146 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) - goto out; + return ret; if (ret == 1) { if (path->slots[0] == 0) { - /* can't find the item, must cow */ - ret = 0; - goto out; + /* Can't find the item, must COW. */ + return 0; } path->slots[0]--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { - /* not our file or wrong item type, must cow */ - goto out; + /* Not our file or wrong item type, must COW. */ + return 0; } if (key.offset > offset) { - /* Wrong offset, must cow */ - goto out; + /* Wrong offset, must COW. */ + return 0; } if (btrfs_file_extent_end(path) <= offset) - goto out; + return 0; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); - if (ram_bytes) - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; if (ret != 1) { /* Treat errors as not being able to NOCOW. */ - ret = 0; - goto out; + return 0; } - ret = 0; - if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) - goto out; + if (btrfs_extent_readonly(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset)) + return 0; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + nocow_args.num_bytes, + range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); - if (ret) { - ret = -EAGAIN; - goto out; - } + ret = btrfs_test_range_bit_exists(io_tree, offset, range_end, + EXTENT_DELALLOC); + if (ret) + return -EAGAIN; } - if (orig_start) - *orig_start = key.offset - nocow_args.extent_offset; - if (orig_block_len) - *orig_block_len = nocow_args.disk_num_bytes; - - *len = nocow_args.num_bytes; - ret = 1; -out: - btrfs_free_path(path); - return ret; -} - -static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, - unsigned int iomap_flags) -{ - const bool writing = (iomap_flags & IOMAP_WRITE); - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - int ret = 0; - - while (1) { - if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; - } else { - lock_extent(io_tree, lockstart, lockend, cached_state); - } - /* - * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure there's no ordered - * extents in this range. - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, - lockend - lockstart + 1); - - /* - * We need to make sure there are no buffered pages in this - * range either, we could have raced between the invalidate in - * generic_file_direct_write and locking the extent. The - * invalidate needs to happen so that reads after a write do not - * get stale data. - */ - if (!ordered && - (!writing || !filemap_range_has_page(inode->i_mapping, - lockstart, lockend))) - break; + if (file_extent) + memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); - unlock_extent(io_tree, lockstart, lockend, cached_state); + *len = nocow_args.file_extent.num_bytes; - if (ordered) { - if (nowait) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - break; - } - /* - * If we are doing a DIO read and the ordered extent we - * found is for a buffered write, we can not wait for it - * to complete and retry, because if we do so we can - * deadlock with concurrent buffered writes on page - * locks. This happens only if our DIO read covers more - * than one extent map, if at this point has already - * created an ordered extent for a previous extent map - * and locked its range in the inode's io tree, and a - * concurrent write against that previous extent map's - * range and this range started (we unlock the ranges - * in the io tree only when the bios complete and - * buffered writes always lock pages before attempting - * to lock range in the io tree). - */ - if (writing || - test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered); - else - ret = nowait ? -EAGAIN : -ENOTBLK; - btrfs_put_ordered_extent(ordered); - } else { - /* - * We could trigger writeback for this range (and wait - * for it to complete) and then invalidate the pages for - * this range (through invalidate_inode_pages2_range()), - * but that can lead us to a deadlock with a concurrent - * call to readahead (a buffered read or a defrag call - * triggered a readahead) on a page lock due to an - * ordered dio extent we created before but did not have - * yet a corresponding bio submitted (whence it can not - * complete), which makes readahead wait for that - * ordered extent to complete while holding a lock on - * that page. - */ - ret = nowait ? -EAGAIN : -ENOTBLK; - } - - if (ret) - break; - - cond_resched(); - } - - return ret; + return 1; } /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type) +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, + const struct btrfs_file_extent *file_extent, + int type) { struct extent_map *em; int ret; + /* + * Note the missing NOCOW type. + * + * For pure NOCOW writes, we should not create an io extent map, but + * just reusing the existing one. + * Only PREALLOC writes (NOCOW write into preallocated range) can + * create an io extent map. + */ ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || - type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); - em = alloc_extent_map(); + switch (type) { + case BTRFS_ORDERED_PREALLOC: + /* We're only referring part of a larger preallocated extent. */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + case BTRFS_ORDERED_REGULAR: + /* COW results a new extent matching our file extent size. */ + ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); + ASSERT(file_extent->ram_bytes == file_extent->num_bytes); + + /* Since it's a new extent, we should not have any offset. */ + ASSERT(file_extent->offset == 0); + break; + case BTRFS_ORDERED_COMPRESSED: + /* Must be compressed. */ + ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE); + + /* + * Encoded write can make us to refer to part of the + * uncompressed extent. + */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + } + + em = btrfs_alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); em->start = start; - em->orig_start = orig_start; - em->len = len; - em->block_len = block_len; - em->block_start = block_start; - em->orig_block_len = orig_block_len; - em->ram_bytes = ram_bytes; + em->len = file_extent->num_bytes; + em->disk_bytenr = file_extent->disk_bytenr; + em->disk_num_bytes = file_extent->disk_num_bytes; + em->ram_bytes = file_extent->ram_bytes; em->generation = -1; + em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; - if (type == BTRFS_ORDERED_PREALLOC) - em->flags |= EXTENT_FLAG_FILLING; - else if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, compress_type); + if (type == BTRFS_ORDERED_COMPRESSED) + btrfs_extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } - /* em got 2 refs now, callers needs to do free_extent_map once. */ + /* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */ return em; } - -static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 *lenp, - unsigned int iomap_flags) -{ - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_map *em = *map; - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - struct btrfs_block_group *bg; - bool can_nocow = false; - bool space_reserved = false; - u64 len = *lenp; - u64 prev_len; - int ret = 0; - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. - * - */ - if ((em->flags & EXTENT_FLAG_PREALLOC) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - if (em->flags & EXTENT_FLAG_PREALLOC) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false, false) == 1) { - bg = btrfs_inc_nocow_writers(fs_info, block_start); - if (bg) - can_nocow = true; - } - } - - prev_len = len; - if (can_nocow) { - struct extent_map *em2; - - /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - nowait); - if (ret < 0) { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - btrfs_dec_nocow_writers(bg); - if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) - ret = -EAGAIN; - goto out; - } - space_reserved = true; - - em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); - btrfs_dec_nocow_writers(bg); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em2; - em = em2; - } - - if (IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - - dio_data->nocow_done = true; - } else { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - - if (nowait) { - ret = -EAGAIN; - goto out; - } - - /* - * If we could not allocate data space before locking the file - * range and we can't do a NOCOW write, then we have to fail. - */ - if (!dio_data->data_space_reserved) { - ret = -ENOSPC; - goto out; - } - - /* - * We have to COW and we have already reserved data space before, - * so now we reserve only metadata. - */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - false); - if (ret < 0) - goto out; - space_reserved = true; - - em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - *map = em; - len = min(len, em->len - (start - em->start)); - if (len < prev_len) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - prev_len - len, true); - } - - /* - * We have created our ordered extent, so we can now release our reservation - * for an outstanding extent. - */ - btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); - - /* - * Need to update the i_size under the extent lock so buffered - * readers will get the updated i_size when we unlock. - */ - if (start + len > i_size_read(inode)) - i_size_write(inode, start + len); -out: - if (ret && space_reserved) { - btrfs_delalloc_release_extents(BTRFS_I(inode), len); - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } - *lenp = len; - return ret; -} - -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - loff_t length, unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_map *em; - struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = iter->private; - u64 lockstart, lockend; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - u64 len = length; - const u64 data_alloc_len = length; - bool unlock_extents = false; - - /* - * We could potentially fault if we have a buffer > PAGE_SIZE, and if - * we're NOWAIT we may submit a bio for a partial range and return - * EIOCBQUEUED, which would result in an errant short read. - * - * The best way to handle this would be to allow for partial completions - * of iocb's, so we could submit the partial bio, return and fault in - * the rest of the pages, and then submit the io for the rest of the - * range. However we don't have that currently, so simply return - * -EAGAIN at this point so that the normal path is used. - */ - if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) - return -EAGAIN; - - /* - * Cap the size of reads to that usually seen in buffered I/O as we need - * to allocate a contiguous array for the checksums. - */ - if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); - - lockstart = start; - lockend = start + len - 1; - - /* - * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't - * enough if we've written compressed pages to this area, so we need to - * flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk - the first flush only starts - * compression on the data, while keeping the pages locked, so by the - * time the second flush returns we know bios for the compressed pages - * were submitted and finished, and the pages no longer under writeback. - * - * If we have a NOWAIT request and we have any pages in the range that - * are locked, likely due to compression still in progress, we don't want - * to block on page locks. We also don't want to block on pages marked as - * dirty or under writeback (same as for the non-compression case). - * iomap_dio_rw() did the same check, but after that and before we got - * here, mmap'ed writes may have happened or buffered reads started - * (readpage() and readahead(), which lock pages), as we haven't locked - * the file range yet. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - if (flags & IOMAP_NOWAIT) { - if (filemap_range_needs_writeback(inode->i_mapping, - lockstart, lockend)) - return -EAGAIN; - } else { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; - } - } - - memset(dio_data, 0, sizeof(*dio_data)); - - /* - * We always try to allocate data space and must do it before locking - * the file range, to avoid deadlocks with concurrent writes to the same - * range if the range has several extents and the writes don't expand the - * current i_size (the inode lock is taken in shared mode). If we fail to - * allocate data space here we continue and later, after locking the - * file range, we fail with ENOSPC only if we figure out we can not do a - * NOCOW write. - */ - if (write && !(flags & IOMAP_NOWAIT)) { - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, data_alloc_len, false); - if (!ret) - dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) - goto err; - } - - /* - * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered IO, or we are doing a - * NOWAIT read/write and we need to block. - */ - ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); - if (ret < 0) - goto err; - - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } - - /* - * Ok for INLINE and COMPRESSED extents we need to fallback on buffered - * io. INLINE is special, and we could probably kludge it in here, but - * it's still buffered so for safety lets just fall back to the generic - * buffered path. - * - * For COMPRESSED we _have_ to read the entire extent in so we can - * decompress it, so there will be buffering required no matter what we - * do, so go ahead and fallback to buffered. - * - * We return -ENOTBLK because that's what makes DIO go ahead and go back - * to buffered IO. Don't blame me, this is the price we pay for using - * the generic code. - */ - if (extent_map_is_compressed(em) || - em->block_start == EXTENT_MAP_INLINE) { - free_extent_map(em); - /* - * If we are in a NOWAIT context, return -EAGAIN in order to - * fallback to buffered IO. This is not only because we can - * block with buffered IO (no support for NOWAIT semantics at - * the moment) but also to avoid returning short reads to user - * space - this happens if we were able to read some data from - * previous non-compressed extents and then when we fallback to - * buffered IO, at btrfs_file_read_iter() by calling - * filemap_read(), we fail to fault in pages for the read buffer, - * in which case filemap_read() returns a short read (the number - * of bytes previously read is > 0, so it does not return -EFAULT). - */ - ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; - goto unlock_err; - } - - len = min(len, em->len - (start - em->start)); - - /* - * If we have a NOWAIT request and the range contains multiple extents - * (or a mix of extents and holes), then we return -EAGAIN to make the - * caller fallback to a context where it can do a blocking (without - * NOWAIT) request. This way we avoid doing partial IO and returning - * success to the caller, which is not optimal for writes and for reads - * it can result in unexpected behaviour for an application. - * - * When doing a read, because we use IOMAP_DIO_PARTIAL when calling - * iomap_dio_rw(), we can end up returning less data then what the caller - * asked for, resulting in an unexpected, and incorrect, short read. - * That is, the caller asked to read N bytes and we return less than that, - * which is wrong unless we are crossing EOF. This happens if we get a - * page fault error when trying to fault in pages for the buffer that is - * associated to the struct iov_iter passed to iomap_dio_rw(), and we - * have previously submitted bios for other extents in the range, in - * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of - * those bios have completed by the time we get the page fault error, - * which we return back to our caller - we should only return EIOCBQUEUED - * after we have submitted bios for all the extents in the range. - */ - if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); - ret = -EAGAIN; - goto unlock_err; - } - - if (write) { - ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, &len, flags); - if (ret < 0) - goto unlock_err; - unlock_extents = true; - /* Recalc len in case the new em is smaller than requested */ - len = min(len, em->len - (start - em->start)); - if (dio_data->data_space_reserved) { - u64 release_offset; - u64 release_len = 0; - - if (dio_data->nocow_done) { - release_offset = start; - release_len = data_alloc_len; - } else if (len < data_alloc_len) { - release_offset = start + len; - release_len = data_alloc_len - len; - } - - if (release_len > 0) - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - release_offset, - release_len); - } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; - } - - if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - else - free_extent_state(cached_state); - - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does - * that, since we have locked only the parts we are performing I/O in. - */ - if ((em->block_start == EXTENT_MAP_HOLE) || - ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - } else { - iomap->addr = em->block_start + (start - em->start); - iomap->type = IOMAP_MAPPED; - } - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_dev->bdev; - iomap->length = len; - free_extent_map(em); - - return 0; - -unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); -err: - if (dio_data->data_space_reserved) { - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - start, data_alloc_len); - extent_changeset_free(dio_data->data_reserved); - } - - return ret; -} - -static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - ssize_t written, unsigned int flags, struct iomap *iomap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_dio_data *dio_data = iter->private; - size_t submitted = dio_data->submitted; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - - if (!write && (iomap->type == IOMAP_HOLE)) { - /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); - return 0; - } - - if (submitted < length) { - pos += submitted; - length -= submitted; - if (write) - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - pos, length, false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); - ret = -ENOTBLK; - } - if (write) { - btrfs_put_ordered_extent(dio_data->ordered); - dio_data->ordered = NULL; - } - - if (write) - extent_changeset_free(dio_data->data_reserved); - return ret; -} - -static void btrfs_dio_end_io(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_inode *inode = bbio->inode; - struct bio *bio = &bbio->bio; - - if (bio->bi_status) { - btrfs_warn(inode->root->fs_info, - "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", - btrfs_ino(inode), bio->bi_opf, - dip->file_offset, dip->bytes, bio->bi_status); - } - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_finish_ordered_extent(bbio->ordered, NULL, - dip->file_offset, dip->bytes, - !bio->bi_status); - } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - bbio->bio.bi_private = bbio->private; - iomap_dio_bio_end_io(bio); -} - -static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, - loff_t file_offset) -{ - struct btrfs_bio *bbio = btrfs_bio(bio); - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_dio_data *dio_data = iter->private; - - btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, - btrfs_dio_end_io, bio->bi_private); - bbio->inode = BTRFS_I(iter->inode); - bbio->file_offset = file_offset; - - dip->file_offset = file_offset; - dip->bytes = bio->bi_iter.bi_size; - - dio_data->submitted += bio->bi_iter.bi_size; - - /* - * Check if we are doing a partial write. If we are, we need to split - * the ordered extent to match the submitted bio. Hang on to the - * remaining unfinishable ordered_extent in dio_data so that it can be - * cancelled in iomap_end to avoid a deadlock wherein faulting the - * remaining pages is blocked on the outstanding ordered extent. - */ - if (iter->flags & IOMAP_WRITE) { - int ret; - - ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); - if (ret) { - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - file_offset, dip->bytes, - !ret); - bio->bi_status = errno_to_blk_status(ret); - iomap_dio_bio_end_io(bio); - return; - } - } - - btrfs_submit_bio(bbio, 0); -} - -static const struct iomap_ops btrfs_dio_iomap_ops = { - .iomap_begin = btrfs_dio_iomap_begin, - .iomap_end = btrfs_dio_iomap_end, -}; - -static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_dio_submit_io, - .bio_set = &btrfs_dio_bioset, -}; - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - int ret; - - ret = fiemap_prep(inode, fieinfo, start, &len, 0); - if (ret) - return ret; - - /* - * fiemap_prep() called filemap_write_and_wait() for the whole possible - * file range (0 to LLONG_MAX), but that is not enough if we have - * compression enabled. The first filemap_fdatawrite_range() only kicks - * in the compression of data (in an async thread) and will return - * before the compression is done and writeback is started. A second - * filemap_fdatawrite_range() is needed to wait for the compression to - * complete and writeback to start. We also need to wait for ordered - * extents to complete, because our fiemap implementation uses mainly - * file extent items to list the extents, searching for extent maps - * only for file ranges with holes or prealloc extents to figure out - * if we have delalloc in those ranges. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) - return ret; - } - - btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); - - /* - * We did an initial flush to avoid holding the inode's lock while - * triggering writeback and waiting for the completion of IO and ordered - * extents. Now after we locked the inode we do it again, because it's - * possible a new write may have happened in between those two steps. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) { - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - return ret; - } - } - - ret = extent_fiemap(btrfs_inode, fieinfo, start, len); - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - - return ret; -} - -static int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return extent_writepages(mapping, wbc); -} - -static void btrfs_readahead(struct readahead_control *rac) -{ - extent_readahead(rac); -} - /* * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. @@ -7941,13 +7361,12 @@ static void btrfs_readahead(struct readahead_control *rac) * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7968,15 +7387,20 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } -static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +static int btrfs_launder_folio(struct folio *folio) { - int ret = try_release_extent_mapping(&folio->page, gfp_flags); + return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), + folio_size(folio), NULL); +} - if (ret == 1) { - wait_subpage_spinlock(&folio->page); - clear_page_extent_mapped(&folio->page); +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +{ + if (try_release_extent_mapping(folio, gfp_flags)) { + wait_subpage_spinlock(folio); + clear_folio_extent_mapped(folio); + return true; } - return ret; + return false; } static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -8025,7 +7449,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered - * (Private2) already cleared, so it's possible for endio and + * already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * @@ -8033,7 +7457,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like @@ -8053,7 +7477,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent(tree, page_start, page_end, &cached_state); + btrfs_lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { @@ -8091,7 +7515,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* - * If Ordered (Private2) is cleared, it means endio has + * If Ordered is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. @@ -8109,10 +7533,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, cur, range_end, - EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(tree, cur, range_end, + EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8154,191 +7578,23 @@ next: * Since the IO will never happen for this page. */ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); - if (!inode_evicting) { - clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | - extra_flags, &cached_state); - } + if (!inode_evicting) + btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG | extra_flags, + &cached_state); cur = range_end + 1; } /* * We have iterated through all ordered extents of the page, the page - * should not have Ordered (Private2) anymore, or the above iteration + * should not have Ordered anymore, or the above iteration * did something wrong. */ ASSERT(!folio_test_ordered(folio)); btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); -} - -/* - * btrfs_page_mkwrite() is not allowed to change the file size as it gets - * called from a page fault handler when a page is first dirtied. Hence we must - * be careful to check for EOF conditions here. We set the page up correctly - * for a written page which means we get ENOSPC checking when writing into - * holes and correct delalloc and unwritten extent mapping on filesystems that - * support these features. - * - * We are not allowed to take the i_mutex here so we have to play games to - * protect against truncate races as the page could now be beyond EOF. Because - * truncate_setsize() writes the inode size before removing pages, once we have - * the page lock we can determine safely if the page is beyond EOF. If it is not - * beyond EOF, then the page is guaranteed safe against truncation until we - * unlock the page. - */ -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct folio *folio = page_folio(page); - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_changeset *data_reserved = NULL; - unsigned long zero_start; - loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; - u64 reserved_space; - u64 page_start; - u64 page_end; - u64 end; - - ASSERT(folio_order(folio) == 0); - - reserved_space = PAGE_SIZE; - - sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - end = page_end; - - /* - * Reserving delalloc space after obtaining the page lock can lead to - * deadlock. For example, if a dirty page is locked by this function - * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepages() function could - * end up waiting indefinitely to get a lock on the page currently - * being processed by btrfs_page_mkwrite() function. - */ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; - goto out_noreserve; - } - - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ -again: - down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); - size = i_size_read(inode); - - if ((page->mapping != inode->i_mapping) || - (page_start >= size)) { - /* page got truncated out from underneath us */ - goto out_unlock; - } - wait_on_page_writeback(page); - - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); - goto out_unlock; - } - - /* - * we can't set the delalloc bits if there are pending ordered - * extents. Drop our locks and wait for them to finish - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } - - if (page->index == ((size - 1) >> PAGE_SHIFT)) { - reserved_space = round_up(size - page_start, - fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { - end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); - } - } - - /* - * page_mkwrite gets called when the page is firstly dirtied after it's - * faulted in, but write(2) could also dirty a page and set delalloc - * bits, thus in this case for space account reason, we still need to - * clear any delalloc bits within this page range since we have to - * reserve data&meta space before lock_page() (see above comments). - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); - - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; - goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); - else - zero_start = PAGE_SIZE; - - if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); - - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); - btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); - btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); - - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - - unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); - - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - -out_unlock: - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); -out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); -out_noreserve: - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) @@ -8358,7 +7614,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(&inode->vfs_inode, + ret = btrfs_wait_ordered_range(inode, inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) @@ -8430,7 +7686,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last @@ -8445,7 +7701,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -8489,7 +7745,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, + inode->vfs_inode.i_size, (u64)-1); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -8559,20 +7816,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; - struct extent_io_tree *file_extent_tree = NULL; - - /* Self tests may pass a NULL fs_info. */ - if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) { - file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); - if (!file_extent_tree) - return NULL; - } ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); - if (!ei) { - kfree(file_extent_tree); + if (!ei) return NULL; - } ei->root = NULL; ei->generation = 0; @@ -8585,8 +7832,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->disk_i_size = 0; ei->flags = 0; ei->ro_flags = 0; + /* + * ->index_cnt will be properly initialized later when creating a new + * inode (btrfs_create_new_inode()) or when reading an existing inode + * from disk (btrfs_read_locked_inode()). + */ ei->csum_bytes = 0; - ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_reflink_trans = 0; @@ -8607,26 +7858,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_otime_nsec = 0; inode = &ei->vfs_inode; - extent_map_tree_init(&ei->extent_tree); + btrfs_extent_map_tree_init(&ei->extent_tree); /* This io tree sets the valid inode. */ - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; - ei->file_extent_tree = file_extent_tree; - if (file_extent_tree) { - extent_io_tree_init(fs_info, ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); - /* Lockdep class is set only for the file extent tree. */ - lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class); - } + ei->file_extent_tree = NULL; + mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); - RB_CLEAR_NODE(&ei->rb_node); init_rwsem(&ei->i_mmap_lock); return inode; @@ -8662,9 +7907,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!S_ISDIR(vfs_inode->i_mode)) { WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); } - WARN_ON(inode->csum_bytes); - WARN_ON(inode->defrag_bytes); + if (!root || !btrfs_is_data_reloc_root(root)) + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8698,7 +7944,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) } } btrfs_qgroup_check_reserved_leak(inode); - inode_tree_del(inode); + btrfs_del_inode_from_root(inode); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); @@ -8732,7 +7978,6 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. */ rcu_barrier(); - bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); } @@ -8743,17 +7988,9 @@ int __init btrfs_init_cachep(void) SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) - goto fail; - - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bbio.bio), - BIOSET_NEED_BVECS)) - goto fail; + return -ENOMEM; return 0; -fail: - btrfs_destroy_cachep(); - return -ENOMEM; } static int btrfs_getattr(struct mnt_idmap *idmap, @@ -8789,6 +8026,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; + stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->result_mask |= STATX_SUBVOL; + spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; inode_bytes = inode_get_bytes(inode); @@ -8952,31 +8192,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -9212,16 +8466,23 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -9229,18 +8490,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } @@ -9355,8 +8625,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -9367,30 +8635,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; + + inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -9398,8 +8666,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -9516,7 +8784,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. + */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -9555,8 +8828,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); @@ -9580,7 +8853,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -9668,7 +8940,7 @@ free_qgroup: * or we leak qgroup data reservation. */ btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, qgroup_released, + btrfs_root_id(inode->root), qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } @@ -9730,11 +9002,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 0); + ins.offset, false); break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); @@ -9743,17 +9015,16 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } em->start = cur_offset; - em->orig_start = cur_offset; em->len = ins.offset; - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; + em->disk_bytenr = ins.objectid; + em->offset = 0; + em->disk_num_bytes = ins.offset; em->ram_bytes = ins.offset; em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); - free_extent_map(em); + btrfs_free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; @@ -9888,28 +9159,6 @@ out_inode: return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(page_folio(page)) == 0); - btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); - put_page(page); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { @@ -9947,27 +9196,29 @@ static ssize_t btrfs_encoded_read_inline( struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; unsigned long ptr; void *tmp; ssize_t ret; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; + + path->nowait = nowait; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { if (ret > 0) { /* The extent item disappeared? */ - ret = -EIO; + return -EIO; } - goto out; + return ret; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -9980,17 +9231,16 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) - goto out; + return ret; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); - if (inline_size > count) { - ret = -ENOBUFS; - goto out; - } + if (inline_size > count) + return -ENOBUFS; + count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; @@ -10002,13 +9252,12 @@ static ssize_t btrfs_encoded_read_inline( } tmp = kmalloc(count, GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out; - } + if (!tmp) + return -ENOMEM; + read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -10016,14 +9265,14 @@ static ssize_t btrfs_encoded_read_inline( if (ret != count) ret = -EFAULT; kfree(tmp); -out: - btrfs_free_path(path); + return ret; } struct btrfs_encoded_read_private { - wait_queue_head_t wait; - atomic_t pending; + struct completion *sync_reads; + void *uring_ctx; + refcount_t pending_refs; blk_status_t status; }; @@ -10033,35 +9282,58 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); + if (refcount_dec_and_test(&priv->pending_refs)) { + int err = blk_status_to_errno(READ_ONCE(priv->status)); + + if (priv->uring_ctx) { + btrfs_uring_read_extent_endio(priv->uring_ctx, err); + kfree(priv); + } else { + complete(priv->sync_reads); + } + } bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; - init_waitqueue_head(&priv.wait); + /* + * Fast path for synchronous reads which completes in this call, io_uring + * needs longer time span. + */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } + + refcount_set(&priv->pending_refs, 1); + priv->status = 0; + priv->uring_ctx = uring_ctx; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -10069,11 +9341,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -10084,22 +9356,31 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); + + if (uring_ctx) { + if (refcount_dec_and_test(&priv->pending_refs)) { + ret = blk_status_to_errno(READ_ONCE(priv->status)); + btrfs_uring_read_extent_endio(uring_ctx, ret); + kfree(priv); + return ret; + } - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. */ - return blk_status_to_errno(READ_ONCE(priv.status)); + return -EIOCBQUEUED; + } else { + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&sync_reads); + /* See btrfs_encoded_read_endio() for ordering. */ + return blk_status_to_errno(READ_ONCE(priv->status)); + } } -static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, - struct iov_iter *iter, - u64 start, u64 lockend, - struct extent_state **cached_state, - u64 disk_bytenr, u64 disk_io_size, - size_t count, bool compressed, - bool *unlocked) +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; @@ -10113,18 +9394,18 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - ret = btrfs_alloc_page_array(nr_pages, pages, 0); + ret = btrfs_alloc_page_array(nr_pages, pages, false); if (ret) { ret = -ENOMEM; goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, - disk_io_size, pages); + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, NULL); if (ret) goto out; - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -10160,21 +9441,26 @@ out: } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded) + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); - u64 start, lockend, disk_bytenr, disk_io_size; - struct extent_state *cached_state = NULL; + u64 start, lockend; struct extent_map *em; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); bool unlocked = false; file_accessed(iocb->ki_filp); - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = btrfs_inode_lock(inode, + BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); + if (ret) + return ret; if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -10187,21 +9473,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - for (;;) { + if (nowait) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, - lockend - start + 1); - if (ret) + if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, + start, lockend)) { + ret = -EAGAIN; goto out_unlock_inode; - lock_extent(io_tree, start, lockend, &cached_state); + } + + if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) { + ret = -EAGAIN; + goto out_unlock_inode; + } + ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); - if (!ordered) - break; - btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, &cached_state); - cond_resched(); + if (ordered) { + btrfs_put_ordered_extent(ordered); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); + ret = -EAGAIN; + goto out_unlock_inode; + } + } else { + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + + btrfs_lock_extent(io_tree, start, lockend, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); + cond_resched(); + } } em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); @@ -10210,94 +9521,89 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock_extent; } - if (em->block_start == EXTENT_MAP_INLINE) { + if (em->disk_bytenr == EXTENT_MAP_INLINE) { u64 extent_start = em->start; /* * For inline extents we get everything we need out of the * extent item. */ - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, - &cached_state, extent_start, + cached_state, extent_start, count, encoded, &unlocked); - goto out; + goto out_unlock_extent; } /* * We only want to return up to EOF even if the extent extends beyond * that. */ - encoded->len = min_t(u64, extent_map_end(em), + encoded->len = min_t(u64, btrfs_extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; - if (em->block_start == EXTENT_MAP_HOLE || + if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { - disk_bytenr = EXTENT_MAP_HOLE; + *disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (extent_map_is_compressed(em)) { - disk_bytenr = em->block_start; + } else if (btrfs_extent_map_is_compressed(em)) { + *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. */ - if (em->block_len > count) { + if (em->disk_num_bytes > count) { ret = -ENOBUFS; goto out_em; } - disk_io_size = em->block_len; - count = em->block_len; + *disk_io_size = em->disk_num_bytes; + count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; - encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, - extent_map_compression(em)); + btrfs_extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - disk_bytenr = em->block_start + (start - em->start); + *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. */ - disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; - count = start + disk_io_size - iocb->ki_pos; + *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + *disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; - disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; - if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, &cached_state); + if (*disk_bytenr == EXTENT_MAP_HOLE) { + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { - ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, - &cached_state, disk_bytenr, - disk_io_size, count, - encoded->compression, - &unlocked); + ret = -EIOCBQUEUED; + goto out_unlock_extent; } -out: - if (ret >= 0) - iocb->ki_pos += encoded->len; out_em: - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock_extent: - if (!unlocked) - unlock_extent(io_tree, start, lockend, &cached_state); + /* Leave inode and extent locked if we need to do a read. */ + if (!unlocked && ret != -EIOCBQUEUED) + btrfs_unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: - if (!unlocked) + if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -10312,12 +9618,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; int compression; size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_pages, i; - struct page **pages; + unsigned long nr_folios, i; + struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -10406,24 +9713,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); - if (!pages) + nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); + if (!folios) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; - pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pages[i]) { + folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!folios[i]) { ret = -ENOMEM; - goto out_pages; + goto out_folios; } - kaddr = kmap_local_page(pages[i]); + kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; - goto out_pages; + goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); @@ -10433,22 +9740,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, for (;;) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) - goto out_pages; + goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_pages; - lock_extent(io_tree, start, end, &cached_state); + goto out_folios; + btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -10468,10 +9775,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_qgroup_free_data; /* Try an inline extent first. */ - if (start == 0 && encoded->unencoded_len == encoded->len && - encoded->unencoded_offset == 0) { - ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages, true); + if (encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0 && + can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { + ret = __cow_file_range_inline(inode, encoded->len, + orig_count, compression, folios[0], + true); if (ret <= 0) { if (ret == 0) ret = orig_count; @@ -10485,22 +9794,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_delalloc_release; extent_reserved = true; - em = create_io_em(inode, start, num_bytes, - start - encoded->unencoded_offset, ins.objectid, - ins.offset, ins.offset, ram_bytes, compression, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = num_bytes; + file_extent.ram_bytes = ram_bytes; + file_extent.offset = encoded->unencoded_offset; + file_extent.compression = compression; + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserved; } - free_extent_map(em); + btrfs_free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, - ins.objectid, ins.offset, - encoded->unencoded_offset, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED), - compression); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -10511,17 +9820,17 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); @@ -10534,15 +9843,15 @@ out_free_data_space: * bytes_may_use. */ if (!extent_reserved) - btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); + btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: - unlock_extent(io_tree, start, end, &cached_state); -out_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - __free_page(pages[i]); + btrfs_unlock_extent(io_tree, start, end, &cached_state); +out_folios: + for (i = 0; i < nr_folios; i++) { + if (folios[i]) + folio_put(folios[i]); } - kvfree(pages); + kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; @@ -10689,39 +9998,59 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care. */ - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -10736,7 +10065,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -10750,7 +10080,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -10766,36 +10097,53 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", - root->root_key.objectid); - return -EPERM; + btrfs_root_id(root)); + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state); + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->block_start == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). + */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->block_start == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -10807,23 +10155,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = em->block_start + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -10858,7 +10228,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10899,24 +10268,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); - unlock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); @@ -10925,6 +10297,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; @@ -10933,7 +10309,6 @@ out: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else @@ -10995,7 +10370,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en if (ordered) { btrfs_err(root->fs_info, "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", - start, end, btrfs_ino(inode), root->root_key.objectid, + start, end, btrfs_ino(inode), btrfs_root_id(root), ordered->file_offset, ordered->file_offset + ordered->num_bytes - 1); btrfs_put_ordered_extent(ordered); @@ -11004,6 +10379,36 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en ASSERT(ordered == NULL); } +/* + * Find the first inode with a minimum number. + * + * @root: The root to search for. + * @min_ino: The minimum inode number. + * + * Find the first inode in the @root with a number >= @min_ino and return it. + * Returns NULL if no such inode found. + */ +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + if (igrab(&inode->vfs_inode)) + break; + + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -11056,6 +10461,7 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, + .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, |