diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 3392 |
1 files changed, 1331 insertions, 2061 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7fed887e700c..a9322601ab5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,7 @@ #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" @@ -70,31 +70,13 @@ #include "orphan.h" #include "backref.h" #include "raid-stripe-tree.h" +#include "fiemap.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; -struct btrfs_dio_data { - ssize_t submitted; - struct extent_changeset *data_reserved; - struct btrfs_ordered_extent *ordered; - bool data_space_reserved; - bool nocow_done; -}; - -struct btrfs_dio_private { - /* Range of I/O */ - u64 file_offset; - u32 bytes; - - /* This must be last */ - struct btrfs_bio bbio; -}; - -static struct bio_set btrfs_dio_bioset; - struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; @@ -134,14 +116,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) @@ -254,7 +231,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, btrfs_ino(inode), file_off, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); @@ -264,7 +241,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -331,15 +308,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -347,7 +324,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -416,37 +393,16 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct page *locked_page, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; - struct page *page; - - if (locked_page) { - page_start = page_offset(locked_page); - page_end = page_start + PAGE_SIZE - 1; - } + struct folio *folio; while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. - */ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } - page = find_get_page(inode->vfs_inode.i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; - if (!page) + if (IS_ERR(folio)) continue; /* @@ -454,25 +410,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_folio_clamp_clear_ordered(inode->root->fs_info, - page_folio(page), offset, bytes); - put_page(page); - } - - if (locked_page) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; - } + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, + offset, bytes); + folio_put(folio); } return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); @@ -512,12 +452,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; + const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; @@ -525,10 +465,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; u64 i_size; - ASSERT((compressed_size > 0 && compressed_pages) || - (compressed_size == 0 && !compressed_pages)); + /* + * The decompressed size must still be no larger than a sector. Under + * heavy race, we can have size == 0 passed in, but that shouldn't be a + * big deal and we can continue the insertion. + */ + ASSERT(size <= sectorsize); - if (compressed_size && compressed_pages) + /* + * The compressed size also needs to be no larger than a sector. + * That's also why we only need one page as the parameter. + */ + if (compressed_folio) + ASSERT(compressed_size <= sectorsize); + else + ASSERT(compressed_size == 0); + + if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { @@ -556,32 +509,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_SIZE); - - kaddr = kmap_local_page(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_local(kaddr); + kaddr = kmap_local_folio(compressed_folio, 0); + write_extent_buffer(leaf, kaddr, ptr, compressed_size); + kunmap_local(kaddr); - i++; - ptr += cur_size; - compressed_size -= cur_size; - } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -611,17 +555,62 @@ fail: return ret; } +static bool can_cow_file_range_inline(struct btrfs_inode *inode, + u64 offset, u64 size, + size_t compressed_size) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 data_len = (compressed_size ?: size); + + /* Inline extents must start at offset 0. */ + if (offset != 0) + return false; + + /* + * Due to the page size limit, for subpage we can only trigger the + * writeback for the dirty sectors of page, that means data writeback + * is doing more writeback than what we want. + * + * This is especially unexpected for some call sites like fallocate, + * where we only increase i_size after everything is done. + * This means we can trigger inline extent even if we didn't want to. + * So here we skip inline extent creation completely. + */ + if (fs_info->sectorsize != PAGE_SIZE) + return false; + + /* Inline extents are limited to sectorsize. */ + if (size > fs_info->sectorsize) + return false; + + /* We cannot exceed the maximum inline data size. */ + if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + return false; + + /* We cannot exceed the user specified max_inline size. */ + if (data_len > fs_info->max_inline) + return false; + + /* Inline extents must be the entirety of the file. */ + if (size < i_size_read(&inode->vfs_inode)) + return false; + + return true; +} /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. + * + * If being used directly, you must have already checked we're allowed to cow + * the range by getting true from can_cow_file_range_inline(). */ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, - size_t compressed_size, - int compress_type, - struct page **compressed_pages, - bool update_i_size) +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, + u64 size, size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; @@ -631,18 +620,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, int ret; struct btrfs_path *path; - /* - * We can create an inline extent if it ends at or beyond the current - * i_size, is no larger than a sector (decompressed), and the (possibly - * compressed) data fits in a leaf and the configured maximum inline - * size. - */ - if (size < i_size_read(&inode->vfs_inode) || - size > fs_info->sectorsize || - data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - data_len > fs_info->max_inline) - return 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -668,7 +645,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_pages, update_i_size); + compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -701,19 +678,68 @@ out: return ret; } +static noinline int cow_file_range_inline(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 offset, u64 end, + size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) +{ + struct extent_state *cached = NULL; + unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; + u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); + int ret; + + if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) + return 1; + + lock_extent(&inode->io_tree, offset, end, &cached); + ret = __cow_file_range_inline(inode, size, compressed_size, + compress_type, compressed_folio, + update_i_size); + if (ret > 0) { + unlock_extent(&inode->io_tree, offset, end, &cached); + return ret; + } + + /* + * In the successful case (ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in extent_writepage(), ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. + * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. + */ + if (ret == 0) + locked_folio = NULL; + + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + return ret; +} + struct async_extent { u64 start; u64 ram_size; u64 compressed_size; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; int compress_type; struct list_head list; }; struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -731,8 +757,8 @@ struct async_cow { static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, - struct page **pages, - unsigned long nr_pages, + struct folio **folios, + unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; @@ -743,8 +769,8 @@ static noinline int add_async_extent(struct async_chunk *cow, async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; + async_extent->folios = folios; + async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; @@ -766,32 +792,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. - * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: - * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. + * Only enable sector perfect compression for experimental builds. * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. + * This is a big feature change for subpage cases, and can hit + * different corner cases, so only limit this feature for + * experimental build for now. * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. - * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. - * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -809,7 +819,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) - return btrfs_compress_heuristic(&inode->vfs_inode, start, end); + return btrfs_compress_heuristic(inode, start, end); return 0; } @@ -819,7 +829,28 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); +} + +static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long end_index = end >> PAGE_SHIFT; + struct folio *folio; + int ret = 0; + + for (unsigned long index = start >> PAGE_SHIFT; + index <= end_index; index++) { + folio = filemap_get_folio(inode->i_mapping, index); + if (IS_ERR(folio)) { + if (!ret) + ret = PTR_ERR(folio); + continue; + } + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); + folio_put(folio); + } + return ret; } /* @@ -848,8 +879,8 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; @@ -863,7 +894,16 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* + * All the folios should have been locked thus no failure. + * + * And even if some folios are missing, btrfs_compress_folios() + * would handle them correctly, so here just do an ASSERT() check for + * early logic errors. + */ + ASSERT(ret == 0); /* * We need to save i_size before now because it could change in between @@ -879,9 +919,9 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - pages = NULL; - nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + folios = NULL; + nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -906,17 +946,6 @@ again: (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -930,8 +959,8 @@ again: if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. @@ -945,9 +974,9 @@ again: compress_type = inode->prop_compress; /* Compression level is applied here. */ - ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), - mapping, start, pages, &nr_pages, &total_in, - &total_compressed); + ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + mapping, start, folios, &nr_folios, &total_in, + &total_compressed); if (ret) goto mark_incompressible; @@ -957,7 +986,7 @@ again: */ poff = offset_in_page(total_compressed); if (poff) - memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. @@ -968,43 +997,16 @@ again: * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - if (total_in < actual_end) { - ret = cow_file_range_inline(inode, actual_end, 0, - BTRFS_COMPRESS_NONE, NULL, - false); - } else { - ret = cow_file_range_inline(inode, actual_end, - total_compressed, - compress_type, pages, - false); - } - if (ret <= 0) { - unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - - if (ret < 0) - mapping_set_error(mapping, -EIO); - - /* - * inline extent creation worked or returned error, - * we don't need to create any more async work items. - * Unlock and free up our temp pages. - * - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be done _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - NULL, - clear_flags, - PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - goto free_pages; - } + if (total_in < actual_end) + ret = cow_file_range_inline(inode, NULL, start, end, 0, + BTRFS_COMPRESS_NONE, NULL, false); + else + ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, + compress_type, folios[0], false); + if (ret <= 0) { + if (ret < 0) + mapping_set_error(mapping, -EIO); + goto free_pages; } /* @@ -1026,8 +1028,8 @@ again: * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ - ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, + nr_folios, compress_type); BUG_ON(ret); if (start + total_in < end) { start += total_in; @@ -1044,12 +1046,12 @@ cleanup_and_bail_uncompressed: BTRFS_COMPRESS_NONE); BUG_ON(ret); free_pages: - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - btrfs_free_compr_page(pages[i]); + if (folios) { + for (i = 0; i < nr_folios; i++) { + WARN_ON(folios[i]->mapping); + btrfs_free_compr_folio(folios[i]); } - kfree(pages); + kfree(folios); } } @@ -1057,21 +1059,21 @@ static void free_async_extent_pages(struct async_extent *async_extent) { int i; - if (!async_extent->pages) + if (!async_extent->folios) return; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - btrfs_free_compr_page(async_extent->pages[i]); + for (i = 0; i < async_extent->nr_folios; i++) { + WARN_ON(async_extent->folios[i]->mapping); + btrfs_free_compr_folio(async_extent->folios[i]); } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + kfree(async_extent->folios); + async_extent->nr_folios = 0; + async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1084,21 +1086,18 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, locked_folio, start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); - - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); - } + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1111,8 +1110,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; + struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; u64 start = async_extent->start; @@ -1122,20 +1123,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. */ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } - lock_extent(io_tree, start, end, NULL); if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1150,34 +1151,29 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } + lock_extent(io_tree, start, end, &cached); + /* Here we're doing allocation and writeback of the compressed pages */ - em = create_io_em(inode, start, - async_extent->ram_size, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.ram_bytes = async_extent->ram_size; + file_extent.num_bytes = async_extent->ram_size; + file_extent.offset = 0; + file_extent.compression = async_extent->compress_type; + + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - async_extent->ram_size, /* ram_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* disk_num_bytes */ - 0, /* offset */ - 1 << BTRFS_ORDERED_COMPRESSED, - async_extent->compress_type); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1 << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1187,11 +1183,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, /* Clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, - async_extent->pages, /* compressed_pages */ - async_extent->nr_pages, + async_extent->folios, /* compressed_folios */ + async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: @@ -1205,7 +1201,8 @@ out_free_reserve: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + NULL, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | @@ -1215,13 +1212,13 @@ out_free_reserve: kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - root->root_key.objectid, btrfs_ino(inode), start, + btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } -static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, - u64 num_bytes) +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1235,15 +1232,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; + if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + alloc_hint = extent_map_block_start(em); if (em) free_extent_map(em); } else { - alloc_hint = em->block_start; + alloc_hint = extent_map_block_start(em); free_extent_map(em); } } @@ -1258,21 +1255,21 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1281,16 +1278,16 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_state *cached = NULL; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1298,7 +1295,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1312,57 +1308,36 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { - u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), - end + 1); - + if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, actual_end, 0, + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); - if (ret == 0) { - /* - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be run _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + if (ret <= 0) { /* - * locked_page is locked by the caller of - * writepage_delalloc(), not locked by - * __process_pages_contig(). - * - * We can't let __process_pages_contig() to unlock it, - * as it doesn't have any subpage::writers recorded. + * We succeeded, return 1 so the caller knows we're done + * with this page and already handled the IO. * - * Here we manually unlock the page, since the caller - * can't determine if it's an inline extent or a - * compressed extent. + * If there was an error then cow_file_range_inline() has + * already done the cleanup. */ - unlock_page(locked_page); - ret = 1; + if (ret == 0) + ret = 1; goto done; - } else if (ret < 0) { - goto out_unlock; } } - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); + + /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; /* * Relocation relies on the relocated extents to have exactly the same @@ -1382,9 +1357,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1415,28 +1390,36 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - - ram_size = ins.offset; - em = create_io_em(inode, start, ins.offset, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - ram_size, /* ram_bytes */ - BTRFS_COMPRESS_NONE, /* compress_type */ - BTRFS_ORDERED_REGULAR /* type */); + + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = ins.offset; + file_extent.ram_bytes = ins.offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ + lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); + + em = btrfs_create_io_em(inode, start, &file_extent, + BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, - ram_size, ins.objectid, cur_alloc_size, - 0, 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { + unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1457,35 +1440,20 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered (Private2) bit so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1495,13 +1463,15 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1515,29 +1485,31 @@ out_unlock: * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { - if (!locked_page) + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, @@ -1548,13 +1520,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, - clear_bits, + locked_folio, &cached, clear_bits, page_ops); - start += cur_alloc_size; + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } /* @@ -1563,11 +1534,18 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); - } + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, + &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); + } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1589,10 +1567,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ u64 alloc_hint = 0; if (do_free) { - struct async_chunk *async_chunk; struct async_cow *async_cow; - async_chunk = container_of(work, struct async_chunk, work); btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1620,7 +1596,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1639,7 +1615,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, if (!ctx) return false; - unlock_extent(&inode->io_tree, start, end, NULL); set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; @@ -1661,15 +1636,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to @@ -1679,12 +1655,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_folio = locked_folio; + locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { @@ -1713,7 +1689,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1721,48 +1697,27 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, locked_folio, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } -static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes, bool nowait) -{ - struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); - struct btrfs_ordered_sum *sums; - int ret; - LIST_HEAD(list); - - ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, - &list, 0, nowait); - if (ret == 0 && list_empty(&list)) - return 0; - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); - } - if (ret < 0) - return ret; - return 1; -} - -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_state *cached_state = NULL; u64 range_start = start; u64 count; int ret; @@ -1799,6 +1754,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ + lock_extent(io_tree, start, end, &cached_state); count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { @@ -1810,20 +1766,22 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, NULL); } + unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -1836,20 +1794,17 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. */ bool free_path; - /* Output fields. Only set when can_nocow_file_extent() returns 1. */ - - u64 disk_bytenr; - u64 disk_num_bytes; - u64 extent_offset; - /* Number of bytes that can be written to in NOCOW mode. */ - u64 num_bytes; + /* + * Output fields. Only set when can_nocow_file_extent() returns 1. + * The expected file extent for the NOCOW write. + */ + struct btrfs_file_extent file_extent; }; /* @@ -1870,6 +1825,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; + struct btrfs_root *csum_root; + u64 io_start; u64 extent_end; u8 extent_type; int can_nocow = 0; @@ -1882,11 +1839,6 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; - /* Can't access these fields unless we know it's not an inline extent. */ - args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - args->extent_offset = btrfs_file_extent_offset(leaf, fi); - if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; @@ -1896,13 +1848,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; /* An explicit hole, must COW. */ - if (args->disk_bytenr == 0) + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ @@ -1913,6 +1864,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, extent_end = btrfs_file_extent_end(path); + args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); + args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); + /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid @@ -1920,9 +1877,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->extent_offset, - args->disk_bytenr, args->strict, path); + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1930,7 +1886,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (args->free_path) { /* * We don't need the path anymore, plus through the - * csum_exist_in_range() call below we will end up allocating + * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. */ @@ -1943,16 +1899,19 @@ static int can_nocow_file_extent(struct btrfs_path *path, atomic_read(&root->snapshot_force_cow)) goto out; - args->disk_bytenr += args->extent_offset; - args->disk_bytenr += args->start - key->offset; - args->num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.offset += args->start - key->offset; + io_start = args->file_extent.disk_bytenr + args->file_extent.offset; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ - ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, - nowait); + + csum_root = btrfs_csum_root(root->fs_info, io_start); + ret = btrfs_lookup_csums_list(csum_root, io_start, + io_start + args->file_extent.num_bytes - 1, + NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1966,6 +1925,53 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -1973,13 +1979,18 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. + */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2002,14 +2013,14 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.end = end; nocow_args.writeback_path = true; - while (1) { + while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct extent_state *cached_state = NULL; u64 extent_end; - u64 ram_bytes; u64 nocow_end; int extent_type; bool is_prealloc; @@ -2088,7 +2099,6 @@ next_slot: ret = -EUCLEAN; goto error; } - ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = btrfs_file_extent_end(path); /* @@ -2108,7 +2118,9 @@ next_slot: goto must_cow; ret = 0; - nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + nocow_bg = btrfs_inc_nocow_writers(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset); if (!nocow_bg) { must_cow: /* @@ -2135,29 +2147,29 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1); + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } } - nocow_end = cur_offset + nocow_args.num_bytes - 1; + nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; + lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); + is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; if (is_prealloc) { - u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, nocow_args.num_bytes, - orig_start, - nocow_args.disk_bytenr, /* block_start */ - nocow_args.num_bytes, /* block_len */ - nocow_args.disk_num_bytes, /* orig_block_len */ - ram_bytes, BTRFS_COMPRESS_NONE, - BTRFS_ORDERED_PREALLOC); + em = btrfs_create_io_em(inode, cur_offset, + &nocow_args.file_extent, + BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, cur_offset, + nocow_end, &cached_state); btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; @@ -2166,18 +2178,18 @@ must_cow: } ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, nocow_args.num_bytes, - nocow_args.disk_bytenr, nocow_args.num_bytes, 0, + &nocow_args.file_extent, is_prealloc ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW), - BTRFS_COMPRESS_NONE); + : (1 << BTRFS_ORDERED_NOCOW)); btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); } + unlock_extent(&inode->io_tree, cur_offset, + nocow_end, &cached_state); ret = PTR_ERR(ordered); goto error; } @@ -2192,8 +2204,8 @@ must_cow: btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | + locked_folio, &cached_state, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2206,8 +2218,6 @@ must_cow: */ if (ret) goto error; - if (cur_offset > end) - break; } btrfs_release_path(path); @@ -2215,11 +2225,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); + ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } } btrfs_free_path(path); @@ -2227,20 +2238,64 @@ must_cow: error: /* + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Only need to clear the dirty flag as they will never be submitted. + * Ordered extent and extent maps are handled by + * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * + * 2) Failed with error from fallback_to_cow() + * start cur_offset cow_end end + * |/////////////|-----------| | + * + * For range [start, cur_offset) it's the same as case 1). + * But for range [cur_offset, cow_end), the folios have dirty flag + * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * + * Thus we should not call extent_clear_unlock_delalloc() on range + * [cur_offset, cow_end), as the folios are already unlocked. + * + * So clear the folio dirty flags for [start, cur_offset) first. + */ + if (cur_offset > start) + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + + /* * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; - if (cur_offset < end) + if (cow_end) + cur_offset = cow_end + 1; + + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. + */ + if (cur_offset < end) { + struct extent_state *cached = NULL; + + lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DEFRAG | + locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2259,40 +2314,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); + ret = run_delalloc_nocow(inode, locked_folio, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2575,44 +2629,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) -{ - u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_ordered_extent *new; - int ret; - - /* Must always be called for the beginning of an ordered extent. */ - if (WARN_ON_ONCE(start != ordered->disk_bytenr)) - return -EINVAL; - - /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) { - refcount_inc(&ordered->refs); - bbio->ordered = ordered; - return 0; - } - - /* - * Don't split the extent_map for NOCOW extents, as we're writing into - * a pre-existing one. - */ - if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); - if (ret) - return ret; - } - - new = btrfs_split_ordered_extent(ordered, len); - if (IS_ERR(new)) - return PTR_ERR(new); - bbio->ordered = new; - return 0; -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. @@ -2655,7 +2671,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start != EXTENT_MAP_HOLE) + if (em->disk_bytenr != EXTENT_MAP_HOLE) goto next; em_len = em->len; @@ -2705,7 +2721,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2717,50 +2733,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2768,7 +2785,7 @@ again: } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) @@ -2777,14 +2794,14 @@ again: lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2802,7 +2819,7 @@ again: * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); @@ -2816,14 +2833,14 @@ out_page: * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); - } - btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); + } + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* @@ -2836,33 +2853,34 @@ out_page: /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. */ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. * - * The extent_io writepage code will redirty the page if we send back + * The extent_io writepage code will redirty the foio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2872,14 +2890,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * page->mapping outside of the folio lock. */ ihold(inode); - btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); - get_page(page); + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = page; + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -2943,7 +2961,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -3011,10 +3028,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) num_bytes = oe->truncated_len; - ram_bytes = num_bytes; - } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3030,7 +3045,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); - return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + return insert_reserved_file_extent(trans, oe->inode, oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } @@ -3042,7 +3057,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { - struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; @@ -3087,29 +3102,19 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - goto out; + /* + * If it's a COW write we need to lock the extent range as we will be + * inserting/replacing file extent items and unpinning an extent map. + * This must be taken before joining a transaction, as it's a higher + * level lock (like the inode's VFS lock), otherwise we can run into an + * ABBA deadlock with other tasks (transactions work like a lock, + * depending on their current state). + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); } - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3123,8 +3128,28 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } + goto out; + } if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -3181,7 +3206,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_abort_transaction(trans, ret); goto out; } - ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, &cached_state); @@ -3200,9 +3224,8 @@ out: * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ - if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, - &ordered_extent->flags)) - mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (ret) + btrfs_mark_ordered_extent_error(ordered_extent); if (truncated) unwritten_start += logical_len; @@ -3256,7 +3279,7 @@ out: * Actually free the qgroup rsv which was released when * the ordered extent was created. */ - btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } @@ -3278,7 +3301,7 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) && + if (btrfs_is_zoned(ordered->inode->root->fs_info) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); @@ -3572,7 +3595,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, last_objectid, root); + inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; @@ -3757,14 +3780,69 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 1; } +static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (WARN_ON_ONCE(inode->file_extent_tree)) + return 0; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + if (!S_ISREG(inode->vfs_inode.i_mode)) + return 0; + if (btrfs_is_free_space_inode(inode)) + return 0; + + inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!inode->file_extent_tree) + return -ENOMEM; + + extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. */ + lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); + + return 0; +} + +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3776,23 +3854,27 @@ static int btrfs_read_locked_inode(struct inode *inode, bool filled = false; int first_xattr_slot; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + goto out; + ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } + ASSERT(path); - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. + */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3832,7 +3914,9 @@ static int btrfs_read_locked_inode(struct inode *inode, inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); @@ -3923,10 +4007,8 @@ cache_acl: btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -3953,7 +4035,15 @@ cache_acl: } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -4014,13 +4104,15 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; struct btrfs_path *path; struct extent_buffer *leaf; + struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); + btrfs_get_inode_key(inode, &key); + ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -4032,7 +4124,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -4182,6 +4273,7 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode(trans, dir); @@ -4282,9 +4374,9 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { - objectid = inode->root->root_key.objectid; + objectid = btrfs_root_id(inode->root); } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - objectid = inode->location.objectid; + objectid = inode->ref_root_id; } else { WARN_ON(1); fscrypt_free_filename(&fname); @@ -4325,11 +4417,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); - if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + if (IS_ERR(di)) { + ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } @@ -4340,7 +4429,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, - root->root_key.objectid, dir_ino, + btrfs_root_id(root), dir_ino, &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4390,7 +4479,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.objectid == root->root_key.objectid) { + if (key.objectid == btrfs_root_id(root)) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", @@ -4400,7 +4489,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_release_path(path); } - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; @@ -4420,8 +4509,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) + if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } out: @@ -4433,64 +4521,26 @@ out: static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; + struct btrfs_inode *inode; + u64 min_ino = 0; if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(entry) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from the inode - * cache when its usage count hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; + inode = btrfs_find_first_inode(root, min_ino); + while (inode) { + if (atomic_read(&inode->vfs_inode.i_count) > 1) + d_prune_aliases(&inode->vfs_inode); - node = rb_next(node); + min_ino = btrfs_ino(inode) + 1; + /* + * btrfs_drop_inode() will have it removed from the inode + * cache when its usage count hits zero. + */ + iput(&inode->vfs_inode); + cond_resched(); + inode = btrfs_find_first_inode(root, min_ino); } - spin_unlock(&root->inode_lock); } int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) @@ -4517,7 +4567,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", - dest->root_key.objectid); + btrfs_root_id(dest)); ret = -EPERM; goto out_up_write; } @@ -4525,7 +4575,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_up_write; } @@ -4554,11 +4604,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = PTR_ERR(trans); goto out_release; } - ret = btrfs_record_root_in_trans(trans, root); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_end_trans; - } btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->block_rsv = &block_rsv; @@ -4586,7 +4631,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4594,8 +4639,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - dest->root_key.objectid); + BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4604,7 +4648,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4645,7 +4689,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - int err = 0; + int ret = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; @@ -4661,33 +4705,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } - err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); - if (err) - return err; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_notrans; } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } - err = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) goto out; last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!err) { + if (!ret) { btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to @@ -4709,7 +4753,7 @@ out_notrans: btrfs_btree_balance_dirty(fs_info); fscrypt_free_filename(&fname); - return err; + return ret; } /* @@ -4933,16 +4977,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err = 0; + int ret = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_block(inode, oldsize, 0, 0); - if (err) - return err; + ret = btrfs_truncate_block(inode, oldsize, 0, 0); + if (ret) + return ret; if (size <= hole_start) return 0; @@ -4953,7 +4997,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) while (1) { em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { - err = PTR_ERR(em); + ret = PTR_ERR(em); em = NULL; break; } @@ -4964,13 +5008,13 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; - err = maybe_insert_hole(inode, cur_offset, hole_size); - if (err) + ret = maybe_insert_hole(inode, cur_offset, hole_size); + if (ret) break; - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; hole_em = alloc_extent_map(); @@ -4983,20 +5027,18 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) } hole_em->start = cur_offset; hole_em->len = hole_size; - hole_em->orig_start = cur_offset; - hole_em->block_start = EXTENT_MAP_HOLE; - hole_em->block_len = 0; - hole_em->orig_block_len = 0; + hole_em->disk_bytenr = EXTENT_MAP_HOLE; + hole_em->disk_num_bytes = 0; hole_em->ram_bytes = hole_size; hole_em->generation = btrfs_get_fs_generation(fs_info); - err = btrfs_replace_extent_map_range(inode, hole_em, true); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; } next: @@ -5008,7 +5050,7 @@ next: } free_extent_map(em); unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); - return err; + return ret; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) @@ -5065,7 +5107,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { - ret = btrfs_wait_ordered_range(inode, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) @@ -5095,7 +5137,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. */ - err = btrfs_wait_ordered_range(inode, 0, (u64)-1); + err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); @@ -5284,7 +5326,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto out; @@ -5296,7 +5338,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); goto out; } @@ -5468,7 +5510,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = dir->root->root_key.objectid; + key.objectid = btrfs_root_id(dir->root); key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5509,59 +5551,24 @@ out: return err; } -static void inode_tree_add(struct btrfs_inode *inode) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *entry; - struct rb_node **p; - struct rb_node *parent; - struct rb_node *new = &inode->rb_node; - u64 ino = btrfs_ino(inode); - if (inode_unhashed(&inode->vfs_inode)) - return; - parent = NULL; - spin_lock(&root->inode_lock); - p = &root->inode_tree.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_inode, rb_node); - if (ino < btrfs_ino(entry)) - p = &parent->rb_left; - else if (ino > btrfs_ino(entry)) - p = &parent->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING))); - rb_replace_node(parent, new, &root->inode_tree); - RB_CLEAR_NODE(parent); - spin_unlock(&root->inode_lock); - return; - } - } - rb_link_node(new, parent, p); - rb_insert_color(new, &root->inode_tree); - spin_unlock(&root->inode_lock); -} - -static void inode_tree_del(struct btrfs_inode *inode) +static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; - int empty = 0; + struct btrfs_inode *entry; + bool empty = false; - spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&inode->rb_node)) { - rb_erase(&inode->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&inode->rb_node); - empty = RB_EMPTY_ROOT(&root->inode_tree); - } - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + if (entry == inode) + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty && btrfs_root_refs(&root->root_item) == 0) { - spin_lock(&root->inode_lock); - empty = RB_EMPTY_ROOT(&root->inode_tree); - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty) btrfs_add_dead_root(root); } @@ -5572,10 +5579,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->location.objectid = args->ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; + btrfs_set_inode_number(BTRFS_I(inode), args->ino); BTRFS_I(inode)->root = btrfs_grab_root(args->root); if (args->root && args->root == args->root->fs_info->tree_root && @@ -5589,12 +5593,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == BTRFS_I(inode)->location.objectid && + return args->ino == btrfs_ino(BTRFS_I(inode)) && args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, - struct btrfs_root *root) +static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5603,53 +5606,64 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, args.ino = ino; args.root = root; - inode = iget5_locked(s, hashval, btrfs_find_actor, + inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path) +struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { struct inode *inode; + int ret; - inode = btrfs_iget_locked(s, ino, root); + inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - int ret; + if (!(inode->i_state & I_NEW)) + return inode; - ret = btrfs_read_locked_inode(inode, path); - if (!ret) { - inode_tree_add(BTRFS_I(inode)); - unlock_new_inode(inode); - } else { - iget_failed(inode); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode, this means the inode item - * was not found. - */ - if (ret > 0) - ret = -ENOENT; - inode = ERR_PTR(ret); - } - } + ret = btrfs_read_locked_inode(inode, path); + if (ret) + return ERR_PTR(ret); + unlock_new_inode(inode); return inode; } -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) +/* + * Get an inode object given its inode number and corresponding root. + */ +struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(s, ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir, @@ -5663,10 +5677,11 @@ static struct inode *new_simple_dir(struct inode *dir, return ERR_PTR(-ENOMEM); BTRFS_I(inode)->root = btrfs_grab_root(root); - memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); - inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry @@ -5708,7 +5723,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; - struct btrfs_key location; + struct btrfs_key location = { 0 }; u8 di_type = 0; int ret = 0; @@ -5720,7 +5735,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, location.objectid, root); + inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) return inode; @@ -5744,7 +5759,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir, &location, root); } else { - inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); + inode = btrfs_iget(location.objectid, sub_root); btrfs_put_root(sub_root); if (IS_ERR(inode)) @@ -5964,7 +5979,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - put = btrfs_readdir_get_delayed_items(inode, private->last_index, + put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index, &ins_list, &del_list); again: @@ -6037,7 +6052,7 @@ again: * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. @@ -6054,7 +6069,7 @@ nopos: ret = 0; err: if (put) - btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); + btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6139,7 +6154,7 @@ static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; - args.ino = BTRFS_I(inode)->location.objectid; + args.ino = btrfs_ino(BTRFS_I(inode)); args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, @@ -6246,7 +6261,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; - struct btrfs_key *location; struct btrfs_path *path; u64 objectid; struct btrfs_inode_ref *ref; @@ -6255,6 +6269,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_item_batch batch; unsigned long ptr; int ret; + bool xa_reserved = false; path = btrfs_alloc_path(); if (!path) @@ -6264,10 +6279,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); root = BTRFS_I(inode)->root; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + goto out; + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; - inode->i_ino = objectid; + btrfs_set_inode_number(BTRFS_I(inode), objectid); + + ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); + if (ret) + goto out; + xa_reserved = true; if (args->orphan) { /* @@ -6282,8 +6306,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) goto out; } - /* index_cnt is ignored for everything but a dir. */ - BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -6310,11 +6336,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan) @@ -6397,7 +6418,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6413,8 +6433,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in. */ - parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - BTRFS_I(dir)->root); + parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { @@ -6427,8 +6446,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) { btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, - ret); + btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } /* @@ -6443,7 +6461,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(BTRFS_I(inode)); + ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); + if (WARN_ON(ret)) { + /* Shouldn't happen, we used xa_reserve() before. */ + btrfs_abort_transaction(trans, ret); + goto discard; + } trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6471,6 +6494,9 @@ discard: ihold(inode); discard_new_inode(inode); out: + if (xa_reserved) + xa_release(&root->inodes, objectid); + btrfs_free_path(path); return ret; } @@ -6501,7 +6527,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, index, name); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, @@ -6544,7 +6570,7 @@ fail_dir_item: u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, &local_index, name); if (err) btrfs_abort_transaction(trans, err); @@ -6642,7 +6668,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) + if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6728,7 +6754,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; @@ -6750,7 +6776,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6761,36 +6788,35 @@ static noinline int uncompress_inline(struct btrfs_path *path, */ if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + folio_zero_range(folio, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, folio, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); return 0; } @@ -6812,7 +6838,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6835,7 +6861,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->block_start == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) free_extent_map(em); else goto out; @@ -6846,9 +6872,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, goto out; } em->start = EXTENT_MAP_HOLE; - em->orig_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; em->len = (u64)-1; - em->block_len = (u64)-1; path = btrfs_alloc_path(); if (!path) { @@ -6938,9 +6963,8 @@ next: /* New extent overlaps with existing one */ em->start = start; - em->orig_start = start; em->len = found_key.offset - start; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; goto insert; } @@ -6964,19 +6988,18 @@ next: * * Other members are not utilized for inline extents. */ - ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; } not_found: em->start = start; - em->orig_start = start; em->len = len; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); @@ -6989,7 +7012,7 @@ insert: } write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); @@ -7003,84 +7026,6 @@ out: return em; } -static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - const u64 start, - const u64 len, - const u64 orig_start, - const u64 block_start, - const u64 block_len, - const u64 orig_block_len, - const u64 ram_bytes, - const int type) -{ - struct extent_map *em = NULL; - struct btrfs_ordered_extent *ordered; - - if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, block_start, - block_len, orig_block_len, ram_bytes, - BTRFS_COMPRESS_NONE, /* compress_type */ - type); - if (IS_ERR(em)) - goto out; - } - ordered = btrfs_alloc_ordered_extent(inode, start, len, len, - block_start, block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (IS_ERR(ordered)) { - if (em) { - free_extent_map(em); - btrfs_drop_extent_map_range(inode, start, - start + len - 1, false); - } - em = ERR_CAST(ordered); - } else { - ASSERT(!dio_data->ordered); - dio_data->ordered = ordered; - } - out: - - return em; -} - -static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 len) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_map *em; - struct btrfs_key ins; - u64 alloc_hint; - int ret; - - alloc_hint = get_extent_allocation_hint(inode, start, len); -again: - ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); - if (ret == -EAGAIN) { - ASSERT(btrfs_is_zoned(fs_info)); - wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - goto again; - } - if (ret) - return ERR_PTR(ret); - - em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, - ins.objectid, ins.offset, ins.offset, - ins.offset, BTRFS_ORDERED_REGULAR); - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); - - return em; -} - static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; @@ -7103,8 +7048,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write @@ -7115,8 +7058,8 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict) + struct btrfs_file_extent *file_extent, + bool nowait) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7166,12 +7109,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); - if (ram_bytes) - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); @@ -7185,14 +7125,16 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } ret = 0; - if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) + if (btrfs_extent_readonly(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset)) goto out; if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + nocow_args.num_bytes, + range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { @@ -7201,143 +7143,75 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } } - if (orig_start) - *orig_start = key.offset - nocow_args.extent_offset; - if (orig_block_len) - *orig_block_len = nocow_args.disk_num_bytes; + if (file_extent) + memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); - *len = nocow_args.num_bytes; + *len = nocow_args.file_extent.num_bytes; ret = 1; out: btrfs_free_path(path); return ret; } -static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, - unsigned int iomap_flags) -{ - const bool writing = (iomap_flags & IOMAP_WRITE); - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - int ret = 0; - - while (1) { - if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; - } else { - lock_extent(io_tree, lockstart, lockend, cached_state); - } - /* - * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure there's no ordered - * extents in this range. - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, - lockend - lockstart + 1); - - /* - * We need to make sure there are no buffered pages in this - * range either, we could have raced between the invalidate in - * generic_file_direct_write and locking the extent. The - * invalidate needs to happen so that reads after a write do not - * get stale data. - */ - if (!ordered && - (!writing || !filemap_range_has_page(inode->i_mapping, - lockstart, lockend))) - break; - - unlock_extent(io_tree, lockstart, lockend, cached_state); - - if (ordered) { - if (nowait) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - break; - } - /* - * If we are doing a DIO read and the ordered extent we - * found is for a buffered write, we can not wait for it - * to complete and retry, because if we do so we can - * deadlock with concurrent buffered writes on page - * locks. This happens only if our DIO read covers more - * than one extent map, if at this point has already - * created an ordered extent for a previous extent map - * and locked its range in the inode's io tree, and a - * concurrent write against that previous extent map's - * range and this range started (we unlock the ranges - * in the io tree only when the bios complete and - * buffered writes always lock pages before attempting - * to lock range in the io tree). - */ - if (writing || - test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered); - else - ret = nowait ? -EAGAIN : -ENOTBLK; - btrfs_put_ordered_extent(ordered); - } else { - /* - * We could trigger writeback for this range (and wait - * for it to complete) and then invalidate the pages for - * this range (through invalidate_inode_pages2_range()), - * but that can lead us to a deadlock with a concurrent - * call to readahead (a buffered read or a defrag call - * triggered a readahead) on a page lock due to an - * ordered dio extent we created before but did not have - * yet a corresponding bio submitted (whence it can not - * complete), which makes readahead wait for that - * ordered extent to complete while holding a lock on - * that page. - */ - ret = nowait ? -EAGAIN : -ENOTBLK; - } - - if (ret) - break; - - cond_resched(); - } - - return ret; -} - /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type) +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, + const struct btrfs_file_extent *file_extent, + int type) { struct extent_map *em; int ret; + /* + * Note the missing NOCOW type. + * + * For pure NOCOW writes, we should not create an io extent map, but + * just reusing the existing one. + * Only PREALLOC writes (NOCOW write into preallocated range) can + * create an io extent map. + */ ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || - type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); + switch (type) { + case BTRFS_ORDERED_PREALLOC: + /* We're only referring part of a larger preallocated extent. */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + case BTRFS_ORDERED_REGULAR: + /* COW results a new extent matching our file extent size. */ + ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); + ASSERT(file_extent->ram_bytes == file_extent->num_bytes); + + /* Since it's a new extent, we should not have any offset. */ + ASSERT(file_extent->offset == 0); + break; + case BTRFS_ORDERED_COMPRESSED: + /* Must be compressed. */ + ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE); + + /* + * Encoded write can make us to refer to part of the + * uncompressed extent. + */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + } + em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); em->start = start; - em->orig_start = orig_start; - em->len = len; - em->block_len = block_len; - em->block_start = block_start; - em->orig_block_len = orig_block_len; - em->ram_bytes = ram_bytes; + em->len = file_extent->num_bytes; + em->disk_bytenr = file_extent->disk_bytenr; + em->disk_num_bytes = file_extent->disk_num_bytes; + em->ram_bytes = file_extent->ram_bytes; em->generation = -1; + em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; - if (type == BTRFS_ORDERED_PREALLOC) - em->flags |= EXTENT_FLAG_FILLING; - else if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, compress_type); + if (type == BTRFS_ORDERED_COMPRESSED) + extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { @@ -7349,591 +7223,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, return em; } - -static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 *lenp, - unsigned int iomap_flags) -{ - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_map *em = *map; - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - struct btrfs_block_group *bg; - bool can_nocow = false; - bool space_reserved = false; - u64 len = *lenp; - u64 prev_len; - int ret = 0; - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. - * - */ - if ((em->flags & EXTENT_FLAG_PREALLOC) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - if (em->flags & EXTENT_FLAG_PREALLOC) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false, false) == 1) { - bg = btrfs_inc_nocow_writers(fs_info, block_start); - if (bg) - can_nocow = true; - } - } - - prev_len = len; - if (can_nocow) { - struct extent_map *em2; - - /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - nowait); - if (ret < 0) { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - btrfs_dec_nocow_writers(bg); - if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) - ret = -EAGAIN; - goto out; - } - space_reserved = true; - - em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); - btrfs_dec_nocow_writers(bg); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em2; - em = em2; - } - - if (IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - - dio_data->nocow_done = true; - } else { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - - if (nowait) { - ret = -EAGAIN; - goto out; - } - - /* - * If we could not allocate data space before locking the file - * range and we can't do a NOCOW write, then we have to fail. - */ - if (!dio_data->data_space_reserved) { - ret = -ENOSPC; - goto out; - } - - /* - * We have to COW and we have already reserved data space before, - * so now we reserve only metadata. - */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - false); - if (ret < 0) - goto out; - space_reserved = true; - - em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - *map = em; - len = min(len, em->len - (start - em->start)); - if (len < prev_len) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - prev_len - len, true); - } - - /* - * We have created our ordered extent, so we can now release our reservation - * for an outstanding extent. - */ - btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); - - /* - * Need to update the i_size under the extent lock so buffered - * readers will get the updated i_size when we unlock. - */ - if (start + len > i_size_read(inode)) - i_size_write(inode, start + len); -out: - if (ret && space_reserved) { - btrfs_delalloc_release_extents(BTRFS_I(inode), len); - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } - *lenp = len; - return ret; -} - -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - loff_t length, unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_map *em; - struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = iter->private; - u64 lockstart, lockend; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - u64 len = length; - const u64 data_alloc_len = length; - bool unlock_extents = false; - - /* - * We could potentially fault if we have a buffer > PAGE_SIZE, and if - * we're NOWAIT we may submit a bio for a partial range and return - * EIOCBQUEUED, which would result in an errant short read. - * - * The best way to handle this would be to allow for partial completions - * of iocb's, so we could submit the partial bio, return and fault in - * the rest of the pages, and then submit the io for the rest of the - * range. However we don't have that currently, so simply return - * -EAGAIN at this point so that the normal path is used. - */ - if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) - return -EAGAIN; - - /* - * Cap the size of reads to that usually seen in buffered I/O as we need - * to allocate a contiguous array for the checksums. - */ - if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); - - lockstart = start; - lockend = start + len - 1; - - /* - * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't - * enough if we've written compressed pages to this area, so we need to - * flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk - the first flush only starts - * compression on the data, while keeping the pages locked, so by the - * time the second flush returns we know bios for the compressed pages - * were submitted and finished, and the pages no longer under writeback. - * - * If we have a NOWAIT request and we have any pages in the range that - * are locked, likely due to compression still in progress, we don't want - * to block on page locks. We also don't want to block on pages marked as - * dirty or under writeback (same as for the non-compression case). - * iomap_dio_rw() did the same check, but after that and before we got - * here, mmap'ed writes may have happened or buffered reads started - * (readpage() and readahead(), which lock pages), as we haven't locked - * the file range yet. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - if (flags & IOMAP_NOWAIT) { - if (filemap_range_needs_writeback(inode->i_mapping, - lockstart, lockend)) - return -EAGAIN; - } else { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; - } - } - - memset(dio_data, 0, sizeof(*dio_data)); - - /* - * We always try to allocate data space and must do it before locking - * the file range, to avoid deadlocks with concurrent writes to the same - * range if the range has several extents and the writes don't expand the - * current i_size (the inode lock is taken in shared mode). If we fail to - * allocate data space here we continue and later, after locking the - * file range, we fail with ENOSPC only if we figure out we can not do a - * NOCOW write. - */ - if (write && !(flags & IOMAP_NOWAIT)) { - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, data_alloc_len, false); - if (!ret) - dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) - goto err; - } - - /* - * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered IO, or we are doing a - * NOWAIT read/write and we need to block. - */ - ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); - if (ret < 0) - goto err; - - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } - - /* - * Ok for INLINE and COMPRESSED extents we need to fallback on buffered - * io. INLINE is special, and we could probably kludge it in here, but - * it's still buffered so for safety lets just fall back to the generic - * buffered path. - * - * For COMPRESSED we _have_ to read the entire extent in so we can - * decompress it, so there will be buffering required no matter what we - * do, so go ahead and fallback to buffered. - * - * We return -ENOTBLK because that's what makes DIO go ahead and go back - * to buffered IO. Don't blame me, this is the price we pay for using - * the generic code. - */ - if (extent_map_is_compressed(em) || - em->block_start == EXTENT_MAP_INLINE) { - free_extent_map(em); - /* - * If we are in a NOWAIT context, return -EAGAIN in order to - * fallback to buffered IO. This is not only because we can - * block with buffered IO (no support for NOWAIT semantics at - * the moment) but also to avoid returning short reads to user - * space - this happens if we were able to read some data from - * previous non-compressed extents and then when we fallback to - * buffered IO, at btrfs_file_read_iter() by calling - * filemap_read(), we fail to fault in pages for the read buffer, - * in which case filemap_read() returns a short read (the number - * of bytes previously read is > 0, so it does not return -EFAULT). - */ - ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; - goto unlock_err; - } - - len = min(len, em->len - (start - em->start)); - - /* - * If we have a NOWAIT request and the range contains multiple extents - * (or a mix of extents and holes), then we return -EAGAIN to make the - * caller fallback to a context where it can do a blocking (without - * NOWAIT) request. This way we avoid doing partial IO and returning - * success to the caller, which is not optimal for writes and for reads - * it can result in unexpected behaviour for an application. - * - * When doing a read, because we use IOMAP_DIO_PARTIAL when calling - * iomap_dio_rw(), we can end up returning less data then what the caller - * asked for, resulting in an unexpected, and incorrect, short read. - * That is, the caller asked to read N bytes and we return less than that, - * which is wrong unless we are crossing EOF. This happens if we get a - * page fault error when trying to fault in pages for the buffer that is - * associated to the struct iov_iter passed to iomap_dio_rw(), and we - * have previously submitted bios for other extents in the range, in - * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of - * those bios have completed by the time we get the page fault error, - * which we return back to our caller - we should only return EIOCBQUEUED - * after we have submitted bios for all the extents in the range. - */ - if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); - ret = -EAGAIN; - goto unlock_err; - } - - if (write) { - ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, &len, flags); - if (ret < 0) - goto unlock_err; - unlock_extents = true; - /* Recalc len in case the new em is smaller than requested */ - len = min(len, em->len - (start - em->start)); - if (dio_data->data_space_reserved) { - u64 release_offset; - u64 release_len = 0; - - if (dio_data->nocow_done) { - release_offset = start; - release_len = data_alloc_len; - } else if (len < data_alloc_len) { - release_offset = start + len; - release_len = data_alloc_len - len; - } - - if (release_len > 0) - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - release_offset, - release_len); - } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; - } - - if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - else - free_extent_state(cached_state); - - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does - * that, since we have locked only the parts we are performing I/O in. - */ - if ((em->block_start == EXTENT_MAP_HOLE) || - ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - } else { - iomap->addr = em->block_start + (start - em->start); - iomap->type = IOMAP_MAPPED; - } - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_dev->bdev; - iomap->length = len; - free_extent_map(em); - - return 0; - -unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); -err: - if (dio_data->data_space_reserved) { - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - start, data_alloc_len); - extent_changeset_free(dio_data->data_reserved); - } - - return ret; -} - -static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - ssize_t written, unsigned int flags, struct iomap *iomap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_dio_data *dio_data = iter->private; - size_t submitted = dio_data->submitted; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - - if (!write && (iomap->type == IOMAP_HOLE)) { - /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); - return 0; - } - - if (submitted < length) { - pos += submitted; - length -= submitted; - if (write) - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - pos, length, false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); - ret = -ENOTBLK; - } - if (write) { - btrfs_put_ordered_extent(dio_data->ordered); - dio_data->ordered = NULL; - } - - if (write) - extent_changeset_free(dio_data->data_reserved); - return ret; -} - -static void btrfs_dio_end_io(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_inode *inode = bbio->inode; - struct bio *bio = &bbio->bio; - - if (bio->bi_status) { - btrfs_warn(inode->root->fs_info, - "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", - btrfs_ino(inode), bio->bi_opf, - dip->file_offset, dip->bytes, bio->bi_status); - } - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_finish_ordered_extent(bbio->ordered, NULL, - dip->file_offset, dip->bytes, - !bio->bi_status); - } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - bbio->bio.bi_private = bbio->private; - iomap_dio_bio_end_io(bio); -} - -static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, - loff_t file_offset) -{ - struct btrfs_bio *bbio = btrfs_bio(bio); - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_dio_data *dio_data = iter->private; - - btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, - btrfs_dio_end_io, bio->bi_private); - bbio->inode = BTRFS_I(iter->inode); - bbio->file_offset = file_offset; - - dip->file_offset = file_offset; - dip->bytes = bio->bi_iter.bi_size; - - dio_data->submitted += bio->bi_iter.bi_size; - - /* - * Check if we are doing a partial write. If we are, we need to split - * the ordered extent to match the submitted bio. Hang on to the - * remaining unfinishable ordered_extent in dio_data so that it can be - * cancelled in iomap_end to avoid a deadlock wherein faulting the - * remaining pages is blocked on the outstanding ordered extent. - */ - if (iter->flags & IOMAP_WRITE) { - int ret; - - ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); - if (ret) { - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - file_offset, dip->bytes, - !ret); - bio->bi_status = errno_to_blk_status(ret); - iomap_dio_bio_end_io(bio); - return; - } - } - - btrfs_submit_bio(bbio, 0); -} - -static const struct iomap_ops btrfs_dio_iomap_ops = { - .iomap_begin = btrfs_dio_iomap_begin, - .iomap_end = btrfs_dio_iomap_end, -}; - -static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_dio_submit_io, - .bio_set = &btrfs_dio_bioset, -}; - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - int ret; - - ret = fiemap_prep(inode, fieinfo, start, &len, 0); - if (ret) - return ret; - - /* - * fiemap_prep() called filemap_write_and_wait() for the whole possible - * file range (0 to LLONG_MAX), but that is not enough if we have - * compression enabled. The first filemap_fdatawrite_range() only kicks - * in the compression of data (in an async thread) and will return - * before the compression is done and writeback is started. A second - * filemap_fdatawrite_range() is needed to wait for the compression to - * complete and writeback to start. We also need to wait for ordered - * extents to complete, because our fiemap implementation uses mainly - * file extent items to list the extents, searching for extent maps - * only for file ranges with holes or prealloc extents to figure out - * if we have delalloc in those ranges. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) - return ret; - } - - btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); - - /* - * We did an initial flush to avoid holding the inode's lock while - * triggering writeback and waiting for the completion of IO and ordered - * extents. Now after we locked the inode we do it again, because it's - * possible a new write may have happened in between those two steps. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) { - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - return ret; - } - } - - ret = extent_fiemap(btrfs_inode, fieinfo, start, len); - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - - return ret; -} - -static int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return extent_writepages(mapping, wbc); -} - -static void btrfs_readahead(struct readahead_control *rac) -{ - extent_readahead(rac); -} - /* * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. @@ -7941,13 +7230,12 @@ static void btrfs_readahead(struct readahead_control *rac) * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page->mapping)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7968,15 +7256,20 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } -static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +static int btrfs_launder_folio(struct folio *folio) { - int ret = try_release_extent_mapping(&folio->page, gfp_flags); + return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), + PAGE_SIZE, NULL); +} - if (ret == 1) { - wait_subpage_spinlock(&folio->page); - clear_page_extent_mapped(&folio->page); +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +{ + if (try_release_extent_mapping(folio, gfp_flags)) { + wait_subpage_spinlock(folio); + clear_folio_extent_mapped(folio); + return true; } - return ret; + return false; } static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -8025,7 +7318,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered - * (Private2) already cleared, so it's possible for endio and + * already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * @@ -8033,7 +7326,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like @@ -8091,7 +7384,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* - * If Ordered (Private2) is cleared, it means endio has + * If Ordered is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. @@ -8164,181 +7457,14 @@ next: } /* * We have iterated through all ordered extents of the page, the page - * should not have Ordered (Private2) anymore, or the above iteration + * should not have Ordered anymore, or the above iteration * did something wrong. */ ASSERT(!folio_test_ordered(folio)); btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); -} - -/* - * btrfs_page_mkwrite() is not allowed to change the file size as it gets - * called from a page fault handler when a page is first dirtied. Hence we must - * be careful to check for EOF conditions here. We set the page up correctly - * for a written page which means we get ENOSPC checking when writing into - * holes and correct delalloc and unwritten extent mapping on filesystems that - * support these features. - * - * We are not allowed to take the i_mutex here so we have to play games to - * protect against truncate races as the page could now be beyond EOF. Because - * truncate_setsize() writes the inode size before removing pages, once we have - * the page lock we can determine safely if the page is beyond EOF. If it is not - * beyond EOF, then the page is guaranteed safe against truncation until we - * unlock the page. - */ -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct folio *folio = page_folio(page); - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_changeset *data_reserved = NULL; - unsigned long zero_start; - loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; - u64 reserved_space; - u64 page_start; - u64 page_end; - u64 end; - - ASSERT(folio_order(folio) == 0); - - reserved_space = PAGE_SIZE; - - sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - end = page_end; - - /* - * Reserving delalloc space after obtaining the page lock can lead to - * deadlock. For example, if a dirty page is locked by this function - * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepages() function could - * end up waiting indefinitely to get a lock on the page currently - * being processed by btrfs_page_mkwrite() function. - */ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; - goto out_noreserve; - } - - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ -again: - down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); - size = i_size_read(inode); - - if ((page->mapping != inode->i_mapping) || - (page_start >= size)) { - /* page got truncated out from underneath us */ - goto out_unlock; - } - wait_on_page_writeback(page); - - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); - goto out_unlock; - } - - /* - * we can't set the delalloc bits if there are pending ordered - * extents. Drop our locks and wait for them to finish - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } - - if (page->index == ((size - 1) >> PAGE_SHIFT)) { - reserved_space = round_up(size - page_start, - fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { - end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); - } - } - - /* - * page_mkwrite gets called when the page is firstly dirtied after it's - * faulted in, but write(2) could also dirty a page and set delalloc - * bits, thus in this case for space account reason, we still need to - * clear any delalloc bits within this page range since we have to - * reserve data&meta space before lock_page() (see above comments). - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); - - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; - goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); - else - zero_start = PAGE_SIZE; - - if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); - - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); - btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); - btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); - - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - - unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); - - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - -out_unlock: - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); -out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); -out_noreserve: - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) @@ -8358,7 +7484,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(&inode->vfs_inode, + ret = btrfs_wait_ordered_range(inode, inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) @@ -8559,20 +7685,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; - struct extent_io_tree *file_extent_tree = NULL; - - /* Self tests may pass a NULL fs_info. */ - if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) { - file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); - if (!file_extent_tree) - return NULL; - } ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); - if (!ei) { - kfree(file_extent_tree); + if (!ei) return NULL; - } ei->root = NULL; ei->generation = 0; @@ -8585,8 +7701,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->disk_i_size = 0; ei->flags = 0; ei->ro_flags = 0; + /* + * ->index_cnt will be properly initialized later when creating a new + * inode (btrfs_create_new_inode()) or when reading an existing inode + * from disk (btrfs_read_locked_inode()). + */ ei->csum_bytes = 0; - ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_reflink_trans = 0; @@ -8613,20 +7733,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; - ei->file_extent_tree = file_extent_tree; - if (file_extent_tree) { - extent_io_tree_init(fs_info, ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); - /* Lockdep class is set only for the file extent tree. */ - lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class); - } + ei->file_extent_tree = NULL; + mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); - RB_CLEAR_NODE(&ei->rb_node); init_rwsem(&ei->i_mmap_lock); return inode; @@ -8662,9 +7776,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!S_ISDIR(vfs_inode->i_mode)) { WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); } - WARN_ON(inode->csum_bytes); - WARN_ON(inode->defrag_bytes); + if (!root || !btrfs_is_data_reloc_root(root)) + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8698,7 +7813,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) } } btrfs_qgroup_check_reserved_leak(inode); - inode_tree_del(inode); + btrfs_del_inode_from_root(inode); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); @@ -8732,7 +7847,6 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. */ rcu_barrier(); - bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); } @@ -8743,17 +7857,9 @@ int __init btrfs_init_cachep(void) SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) - goto fail; - - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bbio.bio), - BIOSET_NEED_BVECS)) - goto fail; + return -ENOMEM; return 0; -fail: - btrfs_destroy_cachep(); - return -ENOMEM; } static int btrfs_getattr(struct mnt_idmap *idmap, @@ -8789,6 +7895,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; + stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->result_mask |= STATX_SUBVOL; + spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; inode_bytes = inode_get_bytes(inode); @@ -8952,31 +8061,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -9212,16 +8335,23 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -9229,18 +8359,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } @@ -9580,7 +8719,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -9668,7 +8806,7 @@ free_qgroup: * or we leak qgroup data reservation. */ btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, qgroup_released, + btrfs_root_id(inode->root), qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } @@ -9743,11 +8881,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } em->start = cur_offset; - em->orig_start = cur_offset; em->len = ins.offset; - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; + em->disk_bytenr = ins.objectid; + em->offset = 0; + em->disk_num_bytes = ins.offset; em->ram_bytes = ins.offset; em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; @@ -9888,28 +9025,6 @@ out_inode: return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(page_folio(page)) == 0); - btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); - put_page(page); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { @@ -9954,12 +9069,16 @@ static ssize_t btrfs_encoded_read_inline( unsigned long ptr; void *tmp; ssize_t ret; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } + + path->nowait = nowait; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { @@ -10022,8 +9141,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; - atomic_t pending; + struct completion done; + void *uring_ctx; + refcount_t pending_refs; blk_status_t status; }; @@ -10042,26 +9162,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); + if (refcount_dec_and_test(&priv->pending_refs)) { + int err = blk_status_to_errno(READ_ONCE(priv->status)); + + if (priv->uring_ctx) { + btrfs_uring_read_extent_endio(priv->uring_ctx, err); + kfree(priv); + } else { + complete(&priv->done); + } + } bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; - init_waitqueue_head(&priv.wait); + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); + priv->status = 0; + priv->uring_ctx = uring_ctx; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -10069,11 +9203,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -10084,22 +9218,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. */ - return blk_status_to_errno(READ_ONCE(priv.status)); + if (uring_ctx) { + if (refcount_dec_and_test(&priv->pending_refs)) { + ret = blk_status_to_errno(READ_ONCE(priv->status)); + btrfs_uring_read_extent_endio(uring_ctx, ret); + kfree(priv); + return ret; + } + + return -EIOCBQUEUED; + } else { + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); + /* See btrfs_encoded_read_endio() for ordering. */ + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; + } } -static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, - struct iov_iter *iter, - u64 start, u64 lockend, - struct extent_state **cached_state, - u64 disk_bytenr, u64 disk_io_size, - size_t count, bool compressed, - bool *unlocked) +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; @@ -10113,14 +9258,14 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - ret = btrfs_alloc_page_array(nr_pages, pages, 0); + ret = btrfs_alloc_page_array(nr_pages, pages, false); if (ret) { ret = -ENOMEM; goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, - disk_io_size, pages); + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, NULL); if (ret) goto out; @@ -10160,21 +9305,26 @@ out: } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded) + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); - u64 start, lockend, disk_bytenr, disk_io_size; - struct extent_state *cached_state = NULL; + u64 start, lockend; struct extent_map *em; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); bool unlocked = false; file_accessed(iocb->ki_filp); - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = btrfs_inode_lock(inode, + BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); + if (ret) + return ret; if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -10187,21 +9337,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - for (;;) { + if (nowait) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, - lockend - start + 1); - if (ret) + if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, + start, lockend)) { + ret = -EAGAIN; goto out_unlock_inode; - lock_extent(io_tree, start, lockend, &cached_state); + } + + if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + ret = -EAGAIN; + goto out_unlock_inode; + } + ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); - if (!ordered) - break; - btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, &cached_state); - cond_resched(); + if (ordered) { + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + ret = -EAGAIN; + goto out_unlock_inode; + } + } else { + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + + lock_extent(io_tree, start, lockend, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + cond_resched(); + } } em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); @@ -10210,7 +9385,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock_extent; } - if (em->block_start == EXTENT_MAP_INLINE) { + if (em->disk_bytenr == EXTENT_MAP_INLINE) { u64 extent_start = em->start; /* @@ -10220,9 +9395,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, - &cached_state, extent_start, + cached_state, extent_start, count, encoded, &unlocked); - goto out; + goto out_unlock_extent; } /* @@ -10231,73 +9406,68 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; - if (em->block_start == EXTENT_MAP_HOLE || + if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { - disk_bytenr = EXTENT_MAP_HOLE; + *disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; } else if (extent_map_is_compressed(em)) { - disk_bytenr = em->block_start; + *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. */ - if (em->block_len > count) { + if (em->disk_num_bytes > count) { ret = -ENOBUFS; goto out_em; } - disk_io_size = em->block_len; - count = em->block_len; + *disk_io_size = em->disk_num_bytes; + count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; - encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - disk_bytenr = em->block_start + (start - em->start); + *disk_bytenr = extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. */ - disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; - count = start + disk_io_size - iocb->ki_pos; + *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + *disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; - disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; - if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, &cached_state); + if (*disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { - ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, - &cached_state, disk_bytenr, - disk_io_size, count, - encoded->compression, - &unlocked); + ret = -EIOCBQUEUED; + goto out_unlock_extent; } -out: - if (ret >= 0) - iocb->ki_pos += encoded->len; out_em: free_extent_map(em); out_unlock_extent: - if (!unlocked) - unlock_extent(io_tree, start, lockend, &cached_state); + /* Leave inode and extent locked if we need to do a read. */ + if (!unlocked && ret != -EIOCBQUEUED) + unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: - if (!unlocked) + if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -10312,12 +9482,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; int compression; size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_pages, i; - struct page **pages; + unsigned long nr_folios, i; + struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -10406,24 +9577,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); - if (!pages) + nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); + if (!folios) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; - pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pages[i]) { + folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!folios[i]) { ret = -ENOMEM; - goto out_pages; + goto out_folios; } - kaddr = kmap_local_page(pages[i]); + kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; - goto out_pages; + goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); @@ -10433,14 +9604,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, for (;;) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) - goto out_pages; + goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_pages; + goto out_folios; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -10468,10 +9639,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_qgroup_free_data; /* Try an inline extent first. */ - if (start == 0 && encoded->unencoded_len == encoded->len && - encoded->unencoded_offset == 0) { - ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages, true); + if (encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0 && + can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { + ret = __cow_file_range_inline(inode, encoded->len, + orig_count, compression, folios[0], + true); if (ret <= 0) { if (ret == 0) ret = orig_count; @@ -10485,22 +9658,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_delalloc_release; extent_reserved = true; - em = create_io_em(inode, start, num_bytes, - start - encoded->unencoded_offset, ins.objectid, - ins.offset, ins.offset, ram_bytes, compression, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = num_bytes; + file_extent.ram_bytes = ram_bytes; + file_extent.offset = encoded->unencoded_offset; + file_extent.compression = compression; + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserved; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, - ins.objectid, ins.offset, - encoded->unencoded_offset, + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED), - compression); + (1 << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -10515,7 +9688,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; @@ -10537,12 +9710,12 @@ out_free_data_space: btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); -out_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - __free_page(pages[i]); +out_folios: + for (i = 0; i < nr_folios; i++) { + if (folios[i]) + folio_put(folios[i]); } - kvfree(pages); + kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; @@ -10689,39 +9862,59 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care. */ - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -10736,7 +9929,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -10750,7 +9944,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -10766,11 +9961,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", - root->root_key.objectid); - return -EPERM; + btrfs_root_id(root)); + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -10778,24 +9975,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->block_start == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). + */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->block_start == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -10807,23 +10019,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = em->block_start + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -10858,7 +10092,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10899,20 +10132,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10925,6 +10161,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; @@ -10933,7 +10173,6 @@ out: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else @@ -10995,7 +10234,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en if (ordered) { btrfs_err(root->fs_info, "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", - start, end, btrfs_ino(inode), root->root_key.objectid, + start, end, btrfs_ino(inode), btrfs_root_id(root), ordered->file_offset, ordered->file_offset + ordered->num_bytes - 1); btrfs_put_ordered_extent(ordered); @@ -11004,6 +10243,36 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en ASSERT(ordered == NULL); } +/* + * Find the first inode with a minimum number. + * + * @root: The root to search for. + * @min_ino: The minimum inode number. + * + * Find the first inode in the @root with a number >= @min_ino and return it. + * Returns NULL if no such inode found. + */ +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + if (igrab(&inode->vfs_inode)) + break; + + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -11056,6 +10325,7 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, + .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, |