summaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c4436
1 files changed, 1921 insertions, 2515 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7fed887e700c..c0c778243bf1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,7 +32,7 @@
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
@@ -70,31 +70,13 @@
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"
+#include "fiemap.h"
struct btrfs_iget_args {
u64 ino;
struct btrfs_root *root;
};
-struct btrfs_dio_data {
- ssize_t submitted;
- struct extent_changeset *data_reserved;
- struct btrfs_ordered_extent *ordered;
- bool data_space_reserved;
- bool nocow_done;
-};
-
-struct btrfs_dio_private {
- /* Range of I/O */
- u64 file_offset;
- u32 bytes;
-
- /* This must be last */
- struct btrfs_bio bbio;
-};
-
-static struct bio_set btrfs_dio_bioset;
-
struct btrfs_rename_ctx {
/* Output field. Stores the index number of the old directory entry. */
u64 index;
@@ -134,14 +116,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty);
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type);
static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *warn_ctx)
@@ -254,7 +231,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid, btrfs_ino(inode), file_off,
+ btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
@@ -264,7 +241,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
logical += file_off;
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
btrfs_ino(inode), file_off, logical,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -331,15 +308,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* Output without objectid, which is more meaningful */
- if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
+ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -347,7 +324,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
} else {
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -416,37 +393,16 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
* extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
- struct page *locked_page,
u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = 0, page_end = 0;
- struct page *page;
-
- if (locked_page) {
- page_start = page_offset(locked_page);
- page_end = page_start + PAGE_SIZE - 1;
- }
+ struct folio *folio;
while (index <= end_index) {
- /*
- * For locked page, we will call btrfs_mark_ordered_io_finished
- * through btrfs_mark_ordered_io_finished() on it
- * in run_delalloc_range() for the error handling, which will
- * clear page Ordered and run the ordered extent accounting.
- *
- * Here we can't just clear the Ordered bit, or
- * btrfs_mark_ordered_io_finished() would skip the accounting
- * for the page range, and the ordered extent will never finish.
- */
- if (locked_page && index == (page_start >> PAGE_SHIFT)) {
- index++;
- continue;
- }
- page = find_get_page(inode->vfs_inode.i_mapping, index);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
index++;
- if (!page)
+ if (IS_ERR(folio))
continue;
/*
@@ -454,25 +410,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
- page_folio(page), offset, bytes);
- put_page(page);
- }
-
- if (locked_page) {
- /* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_start + PAGE_SIZE)
- return;
- /*
- * In case this page belongs to the delalloc range being
- * instantiated then skip it, since the first page of a range is
- * going to be properly cleaned up by the caller of
- * run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
- offset = page_offset(locked_page) + PAGE_SIZE;
- }
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
+ offset, bytes);
+ folio_put(folio);
}
return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
@@ -512,12 +452,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages,
+ struct folio *compressed_folio,
bool update_i_size)
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
- struct page *page = NULL;
+ const u32 sectorsize = trans->fs_info->sectorsize;
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
@@ -525,10 +465,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
u64 i_size;
- ASSERT((compressed_size > 0 && compressed_pages) ||
- (compressed_size == 0 && !compressed_pages));
+ /*
+ * The decompressed size must still be no larger than a sector. Under
+ * heavy race, we can have size == 0 passed in, but that shouldn't be a
+ * big deal and we can continue the insertion.
+ */
+ ASSERT(size <= sectorsize);
+
+ /*
+ * The compressed size also needs to be no larger than a sector.
+ * That's also why we only need one page as the parameter.
+ */
+ if (compressed_folio)
+ ASSERT(compressed_size <= sectorsize);
+ else
+ ASSERT(compressed_size == 0);
- if (compressed_size && compressed_pages)
+ if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) {
@@ -536,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t datasize;
key.objectid = btrfs_ino(inode);
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -556,32 +509,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
ptr = btrfs_file_extent_inline_start(ei);
if (compress_type != BTRFS_COMPRESS_NONE) {
- struct page *cpage;
- int i = 0;
- while (compressed_size > 0) {
- cpage = compressed_pages[i];
- cur_size = min_t(unsigned long, compressed_size,
- PAGE_SIZE);
-
- kaddr = kmap_local_page(cpage);
- write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_local(kaddr);
+ kaddr = kmap_local_folio(compressed_folio, 0);
+ write_extent_buffer(leaf, kaddr, ptr, compressed_size);
+ kunmap_local(kaddr);
- i++;
- ptr += cur_size;
- compressed_size -= cur_size;
- }
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->vfs_inode.i_mapping, 0);
+ struct folio *folio;
+
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
+ ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_local(kaddr);
- put_page(page);
+ folio_put(folio);
}
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -611,17 +555,53 @@ fail:
return ret;
}
+static bool can_cow_file_range_inline(struct btrfs_inode *inode,
+ u64 offset, u64 size,
+ size_t compressed_size)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 data_len = (compressed_size ?: size);
+
+ /* Inline extents must start at offset 0. */
+ if (offset != 0)
+ return false;
+
+ /* Inline extents are limited to sectorsize. */
+ if (size > fs_info->sectorsize)
+ return false;
+
+ /* We do not allow a non-compressed extent to be as large as block size. */
+ if (data_len >= fs_info->sectorsize)
+ return false;
+
+ /* We cannot exceed the maximum inline data size. */
+ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ return false;
+
+ /* We cannot exceed the user specified max_inline size. */
+ if (data_len > fs_info->max_inline)
+ return false;
+
+ /* Inline extents must be the entirety of the file. */
+ if (size < i_size_read(&inode->vfs_inode))
+ return false;
+
+ return true;
+}
/*
* conditionally insert an inline extent into the file. This
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
+ *
+ * If being used directly, you must have already checked we're allowed to cow
+ * the range by getting true from can_cow_file_range_inline().
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
- size_t compressed_size,
- int compress_type,
- struct page **compressed_pages,
- bool update_i_size)
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
+ u64 size, size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
@@ -631,18 +611,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
int ret;
struct btrfs_path *path;
- /*
- * We can create an inline extent if it ends at or beyond the current
- * i_size, is no larger than a sector (decompressed), and the (possibly
- * compressed) data fits in a leaf and the configured maximum inline
- * size.
- */
- if (size < i_size_read(&inode->vfs_inode) ||
- size > fs_info->sectorsize ||
- data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- data_len > fs_info->max_inline)
- return 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -668,7 +636,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
- compressed_pages, update_i_size);
+ compressed_folio, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -695,25 +663,74 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
+ btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
}
+static noinline int cow_file_range_inline(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 offset, u64 end,
+ size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
+{
+ struct extent_state *cached = NULL;
+ unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
+ u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
+ int ret;
+
+ if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
+ return 1;
+
+ btrfs_lock_extent(&inode->io_tree, offset, end, &cached);
+ ret = __cow_file_range_inline(inode, size, compressed_size,
+ compress_type, compressed_folio,
+ update_i_size);
+ if (ret > 0) {
+ btrfs_unlock_extent(&inode->io_tree, offset, end, &cached);
+ return ret;
+ }
+
+ /*
+ * In the successful case (ret == 0 here), cow_file_range will return 1.
+ *
+ * Quite a bit further up the callstack in extent_writepage(), ret == 1
+ * is treated as a short circuited success and does not unlock the folio,
+ * so we must do it here.
+ *
+ * In the failure case, the locked_folio does get unlocked by
+ * btrfs_folio_end_all_writers, which asserts that it is still locked
+ * at that point, so we must *not* unlock it here.
+ *
+ * The other two callsites in compress_file_range do not have a
+ * locked_folio, so they are not relevant to this logic.
+ */
+ if (ret == 0)
+ locked_folio = NULL;
+
+ extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
+ clear_flags, PAGE_UNLOCK |
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ return ret;
+}
+
struct async_extent {
u64 start;
u64 ram_size;
u64 compressed_size;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
int compress_type;
struct list_head list;
};
struct async_chunk {
struct btrfs_inode *inode;
- struct page *locked_page;
+ struct folio *locked_folio;
u64 start;
u64 end;
blk_opf_t write_flags;
@@ -731,8 +748,8 @@ struct async_cow {
static noinline int add_async_extent(struct async_chunk *cow,
u64 start, u64 ram_size,
u64 compressed_size,
- struct page **pages,
- unsigned long nr_pages,
+ struct folio **folios,
+ unsigned long nr_folios,
int compress_type)
{
struct async_extent *async_extent;
@@ -743,8 +760,8 @@ static noinline int add_async_extent(struct async_chunk *cow,
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
- async_extent->pages = pages;
- async_extent->nr_pages = nr_pages;
+ async_extent->folios = folios;
+ async_extent->nr_folios = nr_folios;
async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
@@ -760,42 +777,9 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!btrfs_inode_can_compress(inode)) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(inode));
+ DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
return 0;
}
- /*
- * Special check for subpage.
- *
- * We lock the full page then run each delalloc range in the page, thus
- * for the following case, we will hit some subpage specific corner case:
- *
- * 0 32K 64K
- * | |///////| |///////|
- * \- A \- B
- *
- * In above case, both range A and range B will try to unlock the full
- * page [0, 64K), causing the one finished later will have page
- * unlocked already, triggering various page lock requirement BUG_ON()s.
- *
- * So here we add an artificial limit that subpage compression can only
- * if the range is fully page aligned.
- *
- * In theory we only need to ensure the first page is fully covered, but
- * the tailing partial page will be locked until the full compression
- * finishes, delaying the write of other range.
- *
- * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
- * first to prevent any submitted async extent to unlock the full page.
- * By this, we can ensure for subpage case that only the last async_cow
- * will unlock the full page.
- */
- if (fs_info->sectorsize < PAGE_SIZE) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(end + 1))
- return 0;
- }
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
@@ -809,7 +793,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
if (btrfs_test_opt(fs_info, COMPRESS) ||
inode->flags & BTRFS_INODE_COMPRESS ||
inode->prop_compress)
- return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
+ return btrfs_compress_heuristic(inode, start, end);
return 0;
}
@@ -819,7 +803,28 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode, small_write);
+ btrfs_add_inode_defrag(inode, small_write);
+}
+
+static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct folio *folio;
+ int ret = 0;
+
+ for (unsigned long index = start >> PAGE_SHIFT;
+ index <= end_index; index++) {
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
+ if (IS_ERR(folio)) {
+ if (!ret)
+ ret = PTR_ERR(folio);
+ continue;
+ }
+ btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
+ end + 1 - start);
+ folio_put(folio);
+ }
+ return ret;
}
/*
@@ -848,13 +853,14 @@ static void compress_file_range(struct btrfs_work *work)
u64 actual_end;
u64 i_size;
int ret = 0;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
unsigned int poff;
int i;
int compress_type = fs_info->compress_type;
+ int compress_level = fs_info->compress_level;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
@@ -863,7 +869,16 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(inode, start, end);
+
+ /*
+ * All the folios should have been locked thus no failure.
+ *
+ * And even if some folios are missing, btrfs_compress_folios()
+ * would handle them correctly, so here just do an ASSERT() check for
+ * early logic errors.
+ */
+ ASSERT(ret == 0);
/*
* We need to save i_size before now because it could change in between
@@ -879,9 +894,9 @@ static void compress_file_range(struct btrfs_work *work)
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
- pages = NULL;
- nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
+ folios = NULL;
+ nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/*
* we don't want to send crud past the end of i_size through
@@ -906,17 +921,6 @@ again:
(start > 0 || end + 1 < inode->disk_i_size))
goto cleanup_and_bail_uncompressed;
- /*
- * For subpage case, we require full page alignment for the sector
- * aligned range.
- * Thus we must also check against @actual_end, not just @end.
- */
- if (blocksize < PAGE_SIZE) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(round_up(actual_end, blocksize)))
- goto cleanup_and_bail_uncompressed;
- }
-
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -930,8 +934,8 @@ again:
if (!inode_need_compress(inode, start, end))
goto cleanup_and_bail_uncompressed;
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages) {
+ folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
+ if (!folios) {
/*
* Memory allocation failure is not a fatal error, we can fall
* back to uncompressed code.
@@ -939,15 +943,17 @@ again:
goto cleanup_and_bail_uncompressed;
}
- if (inode->defrag_compress)
+ if (inode->defrag_compress) {
compress_type = inode->defrag_compress;
- else if (inode->prop_compress)
+ compress_level = inode->defrag_compress_level;
+ } else if (inode->prop_compress) {
compress_type = inode->prop_compress;
+ }
/* Compression level is applied here. */
- ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
- mapping, start, pages, &nr_pages, &total_in,
- &total_compressed);
+ ret = btrfs_compress_folios(compress_type, compress_level,
+ mapping, start, folios, &nr_folios, &total_in,
+ &total_compressed);
if (ret)
goto mark_incompressible;
@@ -957,7 +963,7 @@ again:
*/
poff = offset_in_page(total_compressed);
if (poff)
- memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
+ folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/*
* Try to create an inline extent.
@@ -968,43 +974,16 @@ again:
* Check cow_file_range() for why we don't even try to create inline
* extent for the subpage case.
*/
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
- if (total_in < actual_end) {
- ret = cow_file_range_inline(inode, actual_end, 0,
- BTRFS_COMPRESS_NONE, NULL,
- false);
- } else {
- ret = cow_file_range_inline(inode, actual_end,
- total_compressed,
- compress_type, pages,
- false);
- }
- if (ret <= 0) {
- unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING;
-
- if (ret < 0)
- mapping_set_error(mapping, -EIO);
-
- /*
- * inline extent creation worked or returned error,
- * we don't need to create any more async work items.
- * Unlock and free up our temp pages.
- *
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be done _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- NULL,
- clear_flags,
- PAGE_UNLOCK |
- PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK);
- goto free_pages;
- }
+ if (total_in < actual_end)
+ ret = cow_file_range_inline(inode, NULL, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
+ else
+ ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
+ compress_type, folios[0], false);
+ if (ret <= 0) {
+ if (ret < 0)
+ mapping_set_error(mapping, -EIO);
+ goto free_pages;
}
/*
@@ -1026,8 +1005,8 @@ again:
* The async work queues will take care of doing actual allocation on
* disk for these compressed pages, and will submit the bios.
*/
- ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages,
- nr_pages, compress_type);
+ ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
+ nr_folios, compress_type);
BUG_ON(ret);
if (start + total_in < end) {
start += total_in;
@@ -1044,12 +1023,12 @@ cleanup_and_bail_uncompressed:
BTRFS_COMPRESS_NONE);
BUG_ON(ret);
free_pages:
- if (pages) {
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- btrfs_free_compr_page(pages[i]);
+ if (folios) {
+ for (i = 0; i < nr_folios; i++) {
+ WARN_ON(folios[i]->mapping);
+ btrfs_free_compr_folio(folios[i]);
}
- kfree(pages);
+ kfree(folios);
}
}
@@ -1057,21 +1036,21 @@ static void free_async_extent_pages(struct async_extent *async_extent)
{
int i;
- if (!async_extent->pages)
+ if (!async_extent->folios)
return;
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- btrfs_free_compr_page(async_extent->pages[i]);
+ for (i = 0; i < async_extent->nr_folios; i++) {
+ WARN_ON(async_extent->folios[i]->mapping);
+ btrfs_free_compr_folio(async_extent->folios[i]);
}
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ kfree(async_extent->folios);
+ async_extent->nr_folios = 0;
+ async_extent->folios = NULL;
}
static void submit_uncompressed_range(struct btrfs_inode *inode,
struct async_extent *async_extent,
- struct page *locked_page)
+ struct folio *locked_folio)
{
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1084,21 +1063,17 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
};
wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
- ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
+ ret = run_delalloc_cow(inode, locked_folio, start, end,
+ &wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
- btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
- if (locked_page) {
- const u64 page_start = page_offset(locked_page);
-
- set_page_writeback(locked_page);
- end_page_writeback(locked_page);
- btrfs_mark_ordered_io_finished(inode, locked_page,
- page_start, PAGE_SIZE,
- !ret);
- mapping_set_error(locked_page->mapping, ret);
- unlock_page(locked_page);
- }
+ if (locked_folio)
+ btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
+ start, async_extent->ram_size);
+ btrfs_err_rl(inode->root->fs_info,
+ "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start, async_extent->ram_size, ret);
}
}
@@ -1111,10 +1086,13 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
struct btrfs_key ins;
- struct page *locked_page = NULL;
+ struct folio *locked_folio = NULL;
+ struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
+ bool free_pages = false;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1122,20 +1100,23 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
kthread_associate_blkcg(async_chunk->blkcg_css);
/*
- * If async_chunk->locked_page is in the async_extent range, we need to
+ * If async_chunk->locked_folio is in the async_extent range, we need to
* handle it.
*/
- if (async_chunk->locked_page) {
- u64 locked_page_start = page_offset(async_chunk->locked_page);
- u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
+ if (async_chunk->locked_folio) {
+ u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
+ u64 locked_folio_end = locked_folio_start +
+ folio_size(async_chunk->locked_folio) - 1;
- if (!(start >= locked_page_end || end <= locked_page_start))
- locked_page = async_chunk->locked_page;
+ if (!(start >= locked_folio_end || end <= locked_folio_start))
+ locked_folio = async_chunk->locked_folio;
}
- lock_extent(io_tree, start, end, NULL);
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
- submit_uncompressed_range(inode, async_extent, locked_page);
+ ASSERT(!async_extent->folios);
+ ASSERT(async_extent->nr_folios == 0);
+ submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
@@ -1150,34 +1131,30 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* non-contiguous space for the uncompressed size instead. So
* fall back to uncompressed.
*/
- submit_uncompressed_range(inode, async_extent, locked_page);
+ submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
+ btrfs_lock_extent(io_tree, start, end, &cached);
+
/* Here we're doing allocation and writeback of the compressed pages */
- em = create_io_em(inode, start,
- async_extent->ram_size, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.ram_bytes = async_extent->ram_size;
+ file_extent.num_bytes = async_extent->ram_size;
+ file_extent.offset = 0;
+ file_extent.compression = async_extent->compress_type;
+
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */
- async_extent->ram_size, /* num_bytes */
- async_extent->ram_size, /* ram_bytes */
- ins.objectid, /* disk_bytenr */
- ins.offset, /* disk_num_bytes */
- 0, /* offset */
- 1 << BTRFS_ORDERED_COMPRESSED,
- async_extent->compress_type);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1U << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1187,25 +1164,28 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
/* Clear dirty, set writeback and unlock the pages. */
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
btrfs_submit_compressed_write(ordered,
- async_extent->pages, /* compressed_pages */
- async_extent->nr_pages,
+ async_extent->folios, /* compressed_folios */
+ async_extent->nr_folios,
async_chunk->write_flags, true);
*alloc_hint = ins.objectid + ins.offset;
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
+ if (free_pages)
+ free_async_extent_pages(async_extent);
kfree(async_extent);
return;
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ NULL, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
@@ -1215,36 +1195,36 @@ out_free_reserve:
kthread_associate_blkcg(NULL);
btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
- root->root_key.objectid, btrfs_ino(inode), start,
+ btrfs_root_id(root), btrfs_ino(inode), start,
async_extent->ram_size, ret);
kfree(async_extent);
}
-static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
- u64 num_bytes)
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 alloc_hint = 0;
read_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, start, num_bytes);
+ em = btrfs_search_extent_mapping(em_tree, start, num_bytes);
if (em) {
/*
* if block start isn't an actual block number then find the
* first block in this inode and use that as a hint. If that
* block is also bogus then just don't worry about it.
*/
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- em = search_extent_mapping(em_tree, 0, 0);
- if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ btrfs_free_extent_map(em);
+ em = btrfs_search_extent_mapping(em_tree, 0, 0);
+ if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = btrfs_extent_map_block_start(em);
if (em)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
} else {
- alloc_hint = em->block_start;
- free_extent_map(em);
+ alloc_hint = btrfs_extent_map_block_start(em);
+ btrfs_free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
@@ -1258,39 +1238,36 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* allocate extents on disk for the range, and create ordered data structs
* in ram to track those extents.
*
- * locked_page is the page that writepage had locked already. We use
+ * locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * When this function fails, it unlocks all pages except @locked_page.
+ * When this function fails, it unlocks all pages except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
- * unlocks all pages including locked_page and starts I/O on them.
- * (In reality inline extents are limited to a single page, so locked_page is
+ * unlocks all pages including locked_folio and starts I/O on them.
+ * (In reality inline extents are limited to a single page, so locked_folio is
* the only page handled anyway).
*
* When this function succeed and creates a normal extent, the page locking
* status depends on the passed in flags:
*
* - If @keep_locked is set, all pages are kept locked.
- * - Else all pages except for @locked_page are unlocked.
+ * - Else all pages except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
- * while-loop, the ordered extents created in previous iterations are kept
- * intact. So, the caller must clean them up by calling
- * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
- * example.
+ * while-loop, the ordered extents created in previous iterations are cleaned up.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
- struct page *locked_page, u64 start, u64 end,
- u64 *done_offset,
+ struct folio *locked_folio, u64 start,
+ u64 end, u64 *done_offset,
bool keep_locked, bool no_inline)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
- unsigned long ram_size;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
@@ -1298,7 +1275,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct extent_map *em;
unsigned clear_bits;
unsigned long page_ops;
- bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
@@ -1312,57 +1288,36 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
- u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
- end + 1);
-
+ if (!no_inline) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, actual_end, 0,
+ ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
- if (ret == 0) {
- /*
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be run _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ if (ret <= 0) {
/*
- * locked_page is locked by the caller of
- * writepage_delalloc(), not locked by
- * __process_pages_contig().
- *
- * We can't let __process_pages_contig() to unlock it,
- * as it doesn't have any subpage::writers recorded.
+ * We succeeded, return 1 so the caller knows we're done
+ * with this page and already handled the IO.
*
- * Here we manually unlock the page, since the caller
- * can't determine if it's an inline extent or a
- * compressed extent.
+ * If there was an error then cow_file_range_inline() has
+ * already done the cleanup.
*/
- unlock_page(locked_page);
- ret = 1;
+ if (ret == 0)
+ ret = 1;
goto done;
- } else if (ret < 0) {
- goto out_unlock;
}
}
- alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+ alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
+
+ /*
+ * We're not doing compressed IO, don't unlock the first page (which
+ * the caller expects to stay locked), don't clear any dirty bits and
+ * don't set any writeback bits.
+ *
+ * Do set the Ordered (Private2) bit so we know this page was properly
+ * setup for writepage.
+ */
+ page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops |= PAGE_SET_ORDERED;
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1382,9 +1337,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
while (num_bytes > 0) {
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
- cur_alloc_size = num_bytes;
- ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
+ ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret == -EAGAIN) {
@@ -1407,36 +1362,49 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
continue;
}
if (done_offset) {
- *done_offset = start - 1;
- return 0;
+ /*
+ * Move @end to the end of the processed range,
+ * and exit the loop to unlock the processed extents.
+ */
+ end = start - 1;
+ ret = 0;
+ break;
}
ret = -ENOSPC;
}
if (ret < 0)
goto out_unlock;
cur_alloc_size = ins.offset;
- extent_reserved = true;
-
- ram_size = ins.offset;
- em = create_io_em(inode, start, ins.offset, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- ram_size, /* ram_bytes */
- BTRFS_COMPRESS_NONE, /* compress_type */
- BTRFS_ORDERED_REGULAR /* type */);
+
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = ins.offset;
+ file_extent.ram_bytes = ins.offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
+
+ /*
+ * Locked range will be released either during error clean up or
+ * after the whole range is finished.
+ */
+ btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
+ &cached);
+
+ em = btrfs_create_io_em(inode, start, &file_extent,
+ BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
- ram_size, ins.objectid, cur_alloc_size,
- 0, 1 << BTRFS_ORDERED_REGULAR,
- BTRFS_COMPRESS_NONE);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1U << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1457,35 +1425,20 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*/
if (ret)
btrfs_drop_extent_map_range(inode, start,
- start + ram_size - 1,
+ start + cur_alloc_size - 1,
false);
}
btrfs_put_ordered_extent(ordered);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /*
- * We're not doing compressed IO, don't unlock the first page
- * (which the caller expects to stay locked), don't clear any
- * dirty bits and don't set any writeback bits
- *
- * Do set the Ordered (Private2) bit so we know this page was
- * properly setup for writepage.
- */
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
- page_ops |= PAGE_SET_ORDERED;
-
- extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- page_ops);
if (num_bytes < cur_alloc_size)
num_bytes = 0;
else
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
- extent_reserved = false;
+ cur_alloc_size = 0;
/*
* btrfs_reloc_clone_csums() error, since start is increased
@@ -1495,16 +1448,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
+ extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done:
if (done_offset)
*done_offset = end;
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
+ btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock:
/*
* Now, we have three regions to clean up:
@@ -1515,29 +1470,31 @@ out_unlock:
* We process each region below.
*/
- clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
-
/*
* For the range (1). We have already instantiated the ordered extents
- * for this region. They are cleaned up by
- * btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
- * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
- * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
- * function.
+ * for this region, thus we need to cleanup those ordered extents.
+ * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
+ * are also handled by the ordered extents cleanup.
*
- * However, in case of @keep_locked, we still need to unlock the pages
- * (except @locked_page) to ensure all the pages are unlocked.
+ * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
+ * finish the writeback of the involved folios, which will be never submitted.
*/
- if (keep_locked && orig_start < start) {
- if (!locked_page)
+ if (orig_start < start) {
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
+ if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
+
+ btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_page, 0, page_ops);
+ locked_folio, NULL, clear_bits, page_ops);
}
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
/*
* For the range (2). If we reserved an extent for our delalloc range
* (or a subrange) and failed to create the respective ordered extent,
@@ -1548,13 +1505,12 @@ out_unlock:
* to decrement again the data space_info's bytes_may_use counter,
* therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
- if (extent_reserved) {
+ if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
- locked_page,
- clear_bits,
+ locked_folio, &cached, clear_bits,
page_ops);
- start += cur_alloc_size;
+ btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
}
/*
@@ -1563,11 +1519,18 @@ out_unlock:
* space_info's bytes_may_use counter, reserved in
* btrfs_check_data_free_space().
*/
- if (start < end) {
+ if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
- extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits, page_ops);
- }
+ extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
+ end, locked_folio,
+ &cached, clear_bits, page_ops);
+ btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
+ end - start - cur_alloc_size + 1, NULL);
+ }
+ btrfs_err_rl(fs_info,
+ "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
return ret;
}
@@ -1589,10 +1552,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
u64 alloc_hint = 0;
if (do_free) {
- struct async_chunk *async_chunk;
struct async_cow *async_cow;
- async_chunk = container_of(work, struct async_chunk, work);
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
@@ -1607,8 +1568,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
PAGE_SHIFT;
while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
+ async_extent = list_first_entry(&async_chunk->extents,
+ struct async_extent, list);
list_del(&async_extent->list);
submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
}
@@ -1620,7 +1581,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
}
static bool run_delalloc_compressed(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1639,7 +1600,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
if (!ctx)
return false;
- unlock_extent(&inode->io_tree, start, end, NULL);
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
async_chunk = ctx->chunks;
@@ -1661,15 +1621,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
INIT_LIST_HEAD(&async_chunk[i].extents);
/*
- * The locked_page comes all the way from writepage and its
- * the original page we were actually given. As we spread
+ * The locked_folio comes all the way from writepage and its
+ * the original folio we were actually given. As we spread
* this large delalloc region across multiple async_chunk
- * structs, only the first struct needs a pointer to locked_page
+ * structs, only the first struct needs a pointer to
+ * locked_folio.
*
* This way we don't need racey decisions about who is supposed
* to unlock it.
*/
- if (locked_page) {
+ if (locked_folio) {
/*
* Depending on the compressibility, the pages might or
* might not go through async. We want all of them to
@@ -1679,12 +1640,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
- wbc_account_cgroup_owner(wbc, locked_page,
+ wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
- async_chunk[i].locked_page = locked_page;
- locked_page = NULL;
+ async_chunk[i].locked_folio = locked_folio;
+ locked_folio = NULL;
} else {
- async_chunk[i].locked_page = NULL;
+ async_chunk[i].locked_folio = NULL;
}
if (blkcg_css != blkcg_root_css) {
@@ -1713,7 +1674,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* covered by the range.
*/
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
@@ -1721,48 +1682,27 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
int ret;
while (start <= end) {
- ret = cow_file_range(inode, locked_page, start, end, &done_offset,
- true, false);
+ ret = cow_file_range(inode, locked_folio, start, end,
+ &done_offset, true, false);
if (ret)
return ret;
- extent_write_locked_range(&inode->vfs_inode, locked_page, start,
- done_offset, wbc, pages_dirty);
+ extent_write_locked_range(&inode->vfs_inode, locked_folio,
+ start, done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
return 1;
}
-static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes, bool nowait)
-{
- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
- struct btrfs_ordered_sum *sums;
- int ret;
- LIST_HEAD(list);
-
- ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
- &list, 0, nowait);
- if (ret == 0 && list_empty(&list))
- return 0;
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret < 0)
- return ret;
- return 1;
-}
-
-static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
- const u64 start, const u64 end)
+static int fallback_to_cow(struct btrfs_inode *inode,
+ struct folio *locked_folio, const u64 start,
+ const u64 end)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_state *cached_state = NULL;
u64 range_start = start;
u64 count;
int ret;
@@ -1799,8 +1739,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
- count = count_range_bits(io_tree, &range_start, end, range_bytes,
- EXTENT_NORESERVE, 0, NULL);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
+ count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1810,20 +1751,21 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
bytes = range_bytes;
spin_lock(&sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ btrfs_space_info_update_bytes_may_use(sinfo, bytes);
spin_unlock(&sinfo->lock);
if (count > 0)
- clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
- NULL);
+ btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE);
}
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
*/
- ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
+ true);
ASSERT(ret != 1);
return ret;
}
@@ -1836,20 +1778,17 @@ struct can_nocow_file_extent_args {
/* End file offset (inclusive) of the range we want to NOCOW. */
u64 end;
bool writeback_path;
- bool strict;
/*
* Free the path passed to can_nocow_file_extent() once it's not needed
* anymore.
*/
bool free_path;
- /* Output fields. Only set when can_nocow_file_extent() returns 1. */
-
- u64 disk_bytenr;
- u64 disk_num_bytes;
- u64 extent_offset;
- /* Number of bytes that can be written to in NOCOW mode. */
- u64 num_bytes;
+ /*
+ * Output fields. Only set when can_nocow_file_extent() returns 1.
+ * The expected file extent for the NOCOW write.
+ */
+ struct btrfs_file_extent file_extent;
};
/*
@@ -1870,6 +1809,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
+ struct btrfs_root *csum_root;
+ u64 io_start;
u64 extent_end;
u8 extent_type;
int can_nocow = 0;
@@ -1882,11 +1823,6 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
goto out;
- /* Can't access these fields unless we know it's not an inline extent. */
- args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- args->extent_offset = btrfs_file_extent_offset(leaf, fi);
-
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
extent_type == BTRFS_FILE_EXTENT_REG)
goto out;
@@ -1896,13 +1832,12 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* for its subvolume was created, then this implies the extent is shared,
* hence we must COW.
*/
- if (!args->strict &&
- btrfs_file_extent_generation(leaf, fi) <=
+ if (btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out;
/* An explicit hole, must COW. */
- if (args->disk_bytenr == 0)
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
goto out;
/* Compressed/encrypted/encoded extents must be COWed. */
@@ -1913,6 +1848,12 @@ static int can_nocow_file_extent(struct btrfs_path *path,
extent_end = btrfs_file_extent_end(path);
+ args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
+ args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
+
/*
* The following checks can be expensive, as they need to take other
* locks and do btree or rbtree searches, so release the path to avoid
@@ -1920,9 +1861,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
*/
btrfs_release_path(path);
- ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
- key->offset - args->extent_offset,
- args->disk_bytenr, args->strict, path);
+ ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
+ args->file_extent.disk_bytenr, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1930,7 +1870,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (args->free_path) {
/*
* We don't need the path anymore, plus through the
- * csum_exist_in_range() call below we will end up allocating
+ * btrfs_lookup_csums_list() call below we will end up allocating
* another path. So free the path to avoid unnecessary extra
* memory usage.
*/
@@ -1943,16 +1883,19 @@ static int can_nocow_file_extent(struct btrfs_path *path,
atomic_read(&root->snapshot_force_cow))
goto out;
- args->disk_bytenr += args->extent_offset;
- args->disk_bytenr += args->start - key->offset;
- args->num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.offset += args->start - key->offset;
+ io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
/*
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
- nowait);
+
+ csum_root = btrfs_csum_root(root->fs_info, io_start);
+ ret = btrfs_lookup_csums_list(csum_root, io_start,
+ io_start + args->file_extent.num_bytes - 1,
+ NULL, nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1966,6 +1909,112 @@ static int can_nocow_file_extent(struct btrfs_path *path,
}
/*
+ * Cleanup the dirty folios which will never be submitted due to error.
+ *
+ * When running a delalloc range, we may need to split the ranges (due to
+ * fragmentation or NOCOW). If we hit an error in the later part, we will error
+ * out and previously successfully executed range will never be submitted, thus
+ * we have to cleanup those folios by clearing their dirty flag, starting and
+ * finishing the writeback.
+ */
+static void cleanup_dirty_folios(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 start, u64 end, int error)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ u32 len;
+
+ ASSERT(end + 1 - start < U32_MAX);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
+ len = end + 1 - start;
+
+ /*
+ * Handle the locked folio first.
+ * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
+ */
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+
+ for (pgoff_t index = start_index; index <= end_index; index++) {
+ struct folio *folio;
+
+ /* Already handled at the beginning. */
+ if (index == locked_folio->index)
+ continue;
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
+ /* Cache already dropped, no need to do any cleanup. */
+ if (IS_ERR(folio))
+ continue;
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ mapping_set_error(mapping, error);
+}
+
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 len = nocow_args->file_extent.num_bytes;
+ u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(em);
+ }
+ btrfs_free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1U << BTRFS_ORDERED_PREALLOC)
+ : (1U << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(ordered);
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler from freeing
+ * metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+ /*
+ * On error, we need to cleanup the ordered extents we created.
+ *
+ * We do not clear the folio Dirty flags because they are set and
+ * cleaered by the caller.
+ */
+ if (ret < 0)
+ btrfs_cleanup_ordered_extents(inode, file_pos, end);
+ return ret;
+}
+
+/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
@@ -1973,13 +2022,18 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* blocks on disk
*/
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
- struct page *locked_page,
+ struct folio *locked_folio,
const u64 start, const u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
+ /*
+ * If not 0, represents the inclusive end of the last fallback_to_cow()
+ * range. Only for error handling.
+ */
+ u64 cow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
@@ -2002,17 +2056,14 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
nocow_args.end = end;
nocow_args.writeback_path = true;
- while (1) {
+ while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
+ struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 ram_bytes;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2067,12 +2118,13 @@ next_slot:
/*
* If the found extent starts after requested offset, then
- * adjust extent_end to be right before this extent begins
+ * adjust cur_offset to be right before this extent begins.
*/
if (found_key.offset > cur_offset) {
- extent_end = found_key.offset;
- extent_type = 0;
- goto must_cow;
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = found_key.offset;
+ goto next_slot;
}
/*
@@ -2088,7 +2140,6 @@ next_slot:
ret = -EUCLEAN;
goto error;
}
- ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
extent_end = btrfs_file_extent_end(path);
/*
@@ -2108,7 +2159,9 @@ next_slot:
goto must_cow;
ret = 0;
- nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ nocow_bg = btrfs_inc_nocow_writers(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset);
if (!nocow_bg) {
must_cow:
/*
@@ -2135,79 +2188,23 @@ must_cow:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = fallback_to_cow(inode, locked_page,
- cow_start, found_key.offset - 1);
- cow_start = (u64)-1;
+ ret = fallback_to_cow(inode, locked_folio, cow_start,
+ found_key.offset - 1);
if (ret) {
+ cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
+ cow_start = (u64)-1;
}
- nocow_end = cur_offset + nocow_args.num_bytes - 1;
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- u64 orig_start = found_key.offset - nocow_args.extent_offset;
- struct extent_map *em;
-
- em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
- orig_start,
- nocow_args.disk_bytenr, /* block_start */
- nocow_args.num_bytes, /* block_len */
- nocow_args.disk_num_bytes, /* orig_block_len */
- ram_bytes, BTRFS_COMPRESS_NONE,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- nocow_args.num_bytes, nocow_args.num_bytes,
- nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW),
- BTRFS_COMPRESS_NONE);
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- ret = PTR_ERR(ordered);
+ if (ret < 0)
goto error;
- }
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
- if (cur_offset > end)
- break;
}
btrfs_release_path(path);
@@ -2215,11 +2212,12 @@ must_cow:
cow_start = cur_offset;
if (cow_start != (u64)-1) {
- cur_offset = end;
- ret = fallback_to_cow(inode, locked_page, cow_start, end);
- cow_start = (u64)-1;
- if (ret)
+ ret = fallback_to_cow(inode, locked_folio, cow_start, end);
+ if (ret) {
+ cow_end = end;
goto error;
+ }
+ cow_start = (u64)-1;
}
btrfs_free_path(path);
@@ -2227,20 +2225,81 @@ must_cow:
error:
/*
- * If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to cow_start to ensure the COW region is unlocked
- * as well.
+ * There are several error cases:
+ *
+ * 1) Failed without falling back to COW
+ * start cur_offset end
+ * |/////////////| |
+ *
+ * In this case, cow_start should be (u64)-1.
+ *
+ * For range [start, cur_offset) the folios are already unlocked (except
+ * @locked_folio), EXTENT_DELALLOC already removed.
+ * Need to clear the dirty flags and finish the ordered extents.
+ *
+ * 2) Failed with error before calling fallback_to_cow()
+ *
+ * start cow_start end
+ * |/////////////| |
+ *
+ * In this case, only @cow_start is set, @cur_offset is between
+ * [cow_start, end)
+ *
+ * It's mostly the same as case 1), just replace @cur_offset with
+ * @cow_start.
+ *
+ * 3) Failed with error from fallback_to_cow()
+ *
+ * start cow_start cow_end end
+ * |/////////////|-----------| |
+ *
+ * In this case, both @cow_start and @cow_end is set.
+ *
+ * For range [start, cow_start) it's the same as case 1).
+ * But for range [cow_start, cow_end), all the cleanup is handled by
+ * cow_file_range(), we should not touch anything in that range.
+ *
+ * So for all above cases, if @cow_start is set, cleanup ordered extents
+ * for range [start, @cow_start), other wise cleanup range [start, @cur_offset).
*/
if (cow_start != (u64)-1)
cur_offset = cow_start;
- if (cur_offset < end)
+
+ if (cur_offset > start) {
+ btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
+ cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+ }
+
+ /*
+ * If an error happened while a COW region is outstanding, cur_offset
+ * needs to be reset to @cow_end + 1 to skip the COW range, as
+ * cow_file_range() will do the proper cleanup at error.
+ */
+ if (cow_end)
+ cur_offset = cow_end + 1;
+
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ if (cur_offset < end) {
+ struct extent_state *cached = NULL;
+
+ btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DEFRAG |
+ locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
+ btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
+ }
btrfs_free_path(path);
+ btrfs_err_rl(fs_info,
+ "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start, end + 1 - start, ret);
return ret;
}
@@ -2248,7 +2307,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
+ btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2259,40 +2318,35 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc)
{
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
int ret;
/*
- * The range must cover part of the @locked_page, or a return of 1
+ * The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
- ASSERT(!(end <= page_offset(locked_page) ||
- start >= page_offset(locked_page) + PAGE_SIZE));
+ ASSERT(!(end <= folio_pos(locked_folio) ||
+ start >= folio_pos(locked_folio) + folio_size(locked_folio)));
if (should_nocow(inode, start, end)) {
- ret = run_delalloc_nocow(inode, locked_page, start, end);
- goto out;
+ ret = run_delalloc_nocow(inode, locked_folio, start, end);
+ return ret;
}
if (btrfs_inode_can_compress(inode) &&
inode_need_compress(inode, start, end) &&
- run_delalloc_compressed(inode, locked_page, start, end, wbc))
+ run_delalloc_compressed(inode, locked_folio, start, end, wbc))
return 1;
if (zoned)
- ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
+ ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
- ret = cow_file_range(inode, locked_page, start, end, NULL,
+ ret = cow_file_range(inode, locked_folio, start, end, NULL,
false, false);
-
-out:
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, locked_page, start,
- end - start + 1);
return ret;
}
@@ -2542,7 +2596,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
!btrfs_is_free_space_inode(inode) &&
!(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2575,44 +2629,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}
}
-static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
- struct btrfs_ordered_extent *ordered)
-{
- u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
- u64 len = bbio->bio.bi_iter.bi_size;
- struct btrfs_ordered_extent *new;
- int ret;
-
- /* Must always be called for the beginning of an ordered extent. */
- if (WARN_ON_ONCE(start != ordered->disk_bytenr))
- return -EINVAL;
-
- /* No need to split if the ordered extent covers the entire bio. */
- if (ordered->disk_num_bytes == len) {
- refcount_inc(&ordered->refs);
- bbio->ordered = ordered;
- return 0;
- }
-
- /*
- * Don't split the extent_map for NOCOW extents, as we're writing into
- * a pre-existing one.
- */
- if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = split_extent_map(bbio->inode, bbio->file_offset,
- ordered->num_bytes, len,
- ordered->disk_bytenr);
- if (ret)
- return ret;
- }
-
- new = btrfs_split_ordered_extent(ordered, len);
- if (IS_ERR(new))
- return PTR_ERR(new);
- bbio->ordered = new;
- return 0;
-}
-
/*
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
@@ -2655,7 +2671,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
if (IS_ERR(em))
return PTR_ERR(em);
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->disk_bytenr != EXTENT_MAP_HOLE)
goto next;
em_len = em->len;
@@ -2664,12 +2680,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
if (em_len > search_len)
em_len = search_len;
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, cached_state);
+ ret = btrfs_set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW, cached_state);
next:
- search_start = extent_map_end(em);
- free_extent_map(em);
+ search_start = btrfs_extent_map_end(em);
+ btrfs_free_extent_map(em);
if (ret)
return ret;
}
@@ -2699,13 +2715,13 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
return ret;
}
- return set_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC | extra_bits, cached_state);
+ return btrfs_set_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC | extra_bits, cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
- struct page *page;
+ struct folio *folio;
struct btrfs_inode *inode;
struct btrfs_work work;
};
@@ -2717,50 +2733,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- struct page *page = fixup->page;
+ struct folio *folio = fixup->folio;
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 page_start = page_offset(page);
- u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
/*
* This is similar to page_mkwrite, we need to reserve the space before
- * we take the page lock.
+ * we take the folio lock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- PAGE_SIZE);
+ folio_size(folio));
again:
- lock_page(page);
+ folio_lock(folio);
/*
- * Before we queued this fixup, we took a reference on the page.
- * page->mapping may go NULL, but it shouldn't be moved to a different
+ * Before we queued this fixup, we took a reference on the folio.
+ * folio->mapping may go NULL, but it shouldn't be moved to a different
* address space.
*/
- if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ if (!folio->mapping || !folio_test_dirty(folio) ||
+ !folio_test_checked(folio)) {
/*
* Unfortunately this is a little tricky, either
*
- * 1) We got here and our page had already been dealt with and
+ * 1) We got here and our folio had already been dealt with and
* we reserved our space, thus ret == 0, so we need to just
* drop our space reservation and bail. This can happen the
* first time we come into the fixup worker, or could happen
* while waiting for the ordered extent.
- * 2) Our page was already dealt with, but we happened to get an
+ * 2) Our folio was already dealt with, but we happened to get an
* ENOSPC above from the btrfs_delalloc_reserve_space. In
* this case we obviously don't have anything to release, but
- * because the page was already dealt with we don't want to
- * mark the page with an error, so make sure we're resetting
+ * because the folio was already dealt with we don't want to
+ * mark the folio with an error, so make sure we're resetting
* ret to 0. This is why we have this check _before_ the ret
* check, because we do not want to have a surprise ENOSPC
- * when the page was already properly dealt with.
+ * when the folio was already properly dealt with.
*/
if (!ret) {
- btrfs_delalloc_release_extents(inode, PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, folio_size(folio));
btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE,
+ page_start, folio_size(folio),
true);
}
ret = 0;
@@ -2768,23 +2785,23 @@ again:
}
/*
- * We can't mess with the page state unless it is locked, so now that
+ * We can't mess with the folio state unless it is locked, so now that
* it is locked bail if we failed to make our space reservation.
*/
if (ret)
goto out_page;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PageOrdered(page))
+ if (folio_test_ordered(folio))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
- unlock_page(page);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -2802,28 +2819,28 @@ again:
*
* The page was dirty when we started, nothing should have cleaned it.
*/
- BUG_ON(!PageDirty(page));
+ BUG_ON(!folio_test_dirty(folio));
free_delalloc_space = false;
out_reserved:
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
* We hit ENOSPC or other errors. Update the mapping and page
* to reflect the errors and clean the page.
*/
- mapping_set_error(page->mapping, ret);
- btrfs_mark_ordered_io_finished(inode, page, page_start,
- PAGE_SIZE, !ret);
- clear_page_dirty_for_io(page);
- }
- btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
- unlock_page(page);
- put_page(page);
+ mapping_set_error(folio->mapping, ret);
+ btrfs_mark_ordered_io_finished(inode, folio, page_start,
+ folio_size(folio), !ret);
+ folio_clear_dirty_for_io(folio);
+ }
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ folio_unlock(folio);
+ folio_put(folio);
kfree(fixup);
extent_changeset_free(data_reserved);
/*
@@ -2836,33 +2853,49 @@ out_page:
/*
* There are a few paths in the higher layers of the kernel that directly
- * set the page dirty bit without asking the filesystem if it is a
+ * set the folio dirty bit without asking the filesystem if it is a
* good idea. This causes problems because we want to make sure COW
* properly happens and the data=ordered rules are followed.
*
* In our case any range that doesn't have the ORDERED bit set
* hasn't been properly setup for IO. We kick off an async process
* to fix it up. The async helper will wait for ordered extents, set
- * the delalloc bit and make it safe to write the page.
+ * the delalloc bit and make it safe to write the folio.
*/
-int btrfs_writepage_cow_fixup(struct page *page)
+int btrfs_writepage_cow_fixup(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_writepage_fixup *fixup;
- /* This page has ordered extent covering it already */
- if (PageOrdered(page))
+ /* This folio has ordered extent covering it already */
+ if (folio_test_ordered(folio))
return 0;
/*
- * PageChecked is set below when we create a fixup worker for this page,
- * don't try to create another one if we're already PageChecked()
+ * For experimental build, we error out instead of EAGAIN.
*
- * The extent_io writepage code will redirty the page if we send back
+ * We should not hit such out-of-band dirty folios anymore.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
+ DEBUG_WARN();
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)),
+ folio_pos(folio));
+ return -EUCLEAN;
+ }
+
+ /*
+ * folio_checked is set below when we create a fixup worker for this
+ * folio, don't try to create another one if we're already
+ * folio_test_checked.
+ *
+ * The extent_io writepage code will redirty the foio if we send back
* EAGAIN.
*/
- if (PageChecked(page))
+ if (folio_test_checked(folio))
return -EAGAIN;
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
@@ -2872,14 +2905,14 @@ int btrfs_writepage_cow_fixup(struct page *page)
/*
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
- * takes place outside of the page lock, and we can't trust
- * page->mapping outside of the page lock.
+ * takes place outside of the folio lock, and we can't trust
+ * folio->mapping outside of the folio lock.
*/
ihold(inode);
- btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
- get_page(page);
+ btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
+ folio_get(folio);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
- fixup->page = page;
+ fixup->folio = folio;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -2894,7 +2927,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 sectorsize = root->fs_info->sectorsize;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
@@ -2929,8 +2962,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
- ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
+ ins.offset = file_pos;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
@@ -2943,7 +2976,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(struct btrfs_file_extent_item));
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -2966,8 +2998,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
- ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = disk_num_bytes;
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
@@ -2977,8 +3009,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
file_pos - offset,
qgroup_reserved, &ins);
out:
- btrfs_free_path(path);
-
return ret;
}
@@ -3011,10 +3041,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
num_bytes = oe->truncated_len;
- ram_bytes = num_bytes;
- }
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3030,7 +3058,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
- return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ return insert_reserved_file_extent(trans, oe->inode,
oe->file_offset, &stack_fi,
update_inode_bytes, oe->qgroup_rsv);
}
@@ -3042,7 +3070,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
*/
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
- struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_inode *inode = ordered_extent->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
@@ -3087,29 +3115,21 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-
- btrfs_inode_safe_disk_i_size_write(inode, 0);
- if (freespace_inode)
- trans = btrfs_join_transaction_spacecache(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
- trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, ret);
- goto out;
+ /*
+ * If it's a COW write we need to lock the extent range as we will be
+ * inserting/replacing file extent items and unpinning an extent map.
+ * This must be taken before joining a transaction, as it's a higher
+ * level lock (like the inode's VFS lock), otherwise we can run into an
+ * ABBA deadlock with other tasks (transactions work like a lock,
+ * depending on their current state).
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
+ btrfs_lock_extent_bits(io_tree, start, end,
+ EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
+ &cached_state);
}
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
-
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -3123,8 +3143,28 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ /* Logic error */
+ ASSERT(list_empty(&ordered_extent->list));
+ if (!list_empty(&ordered_extent->list)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ ret = btrfs_update_inode_fallback(trans, inode);
+ if (ret) {
+ /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, ret);
+ }
goto out;
+ }
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
@@ -3151,8 +3191,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- ret = unpin_extent_cache(inode, ordered_extent->file_offset,
- ordered_extent->num_bytes, trans->transid);
+ ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3171,9 +3211,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
*/
if ((clear_bits & EXTENT_DELALLOC_NEW) &&
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
- clear_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
@@ -3181,17 +3221,14 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = 0;
out:
- clear_extent_bit(&inode->io_tree, start, end, clear_bits,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
+ &cached_state);
if (trans)
btrfs_end_transaction(trans);
if (ret || truncated) {
- u64 unwritten_start = start;
-
/*
* If we failed to finish this ordered extent for any reason we
* need to make sure BTRFS_ORDERED_IOERR is set on the ordered
@@ -3200,13 +3237,8 @@ out:
* set the mapping error, so we need to set it if we're the ones
* marking this ordered extent as failed.
*/
- if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
- &ordered_extent->flags))
- mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
-
- if (truncated)
- unwritten_start += logical_len;
- clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
+ if (ret)
+ btrfs_mark_ordered_extent_error(ordered_extent);
/*
* Drop extent maps for the part of the extent we didn't write.
@@ -3222,9 +3254,15 @@ out:
* we don't mess with the extent map tree in the NOCOW case, but
* for now simply skip this if we are the free space inode.
*/
- if (!btrfs_is_free_space_inode(inode))
+ if (!btrfs_is_free_space_inode(inode)) {
+ u64 unwritten_start = start;
+
+ if (truncated)
+ unwritten_start += logical_len;
+
btrfs_drop_extent_map_range(inode, unwritten_start,
end, false);
+ }
/*
* If the ordered extent had an IOERR or something else went
@@ -3251,12 +3289,12 @@ out:
NULL);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes, 1);
+ ordered_extent->disk_num_bytes, true);
/*
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
*/
- btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
ordered_extent->qgroup_rsv,
BTRFS_QGROUP_RSV_DATA);
}
@@ -3278,7 +3316,7 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
- if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) &&
+ if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
@@ -3288,20 +3326,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
/*
* Verify the checksum for a single sector without any extra action that depend
* on the type of I/O.
+ *
+ * @kaddr must be a properly kmapped address.
*/
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected)
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
+ const u8 * const csum_expected)
{
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
-
- ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
shash->tfm = fs_info->csum_shash;
-
- kaddr = kmap_local_page(page) + pgoff;
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- kunmap_local(kaddr);
if (memcmp(csum, csum_expected, fs_info->csum_size))
return -EIO;
@@ -3330,6 +3364,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u64 end = file_offset + bv->bv_len - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
+ void *kaddr;
ASSERT(bv->bv_len == fs_info->sectorsize);
@@ -3337,19 +3372,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
return true;
if (btrfs_is_data_reloc_root(inode->root) &&
- test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- NULL)) {
+ btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
+ NULL)) {
/* Skip the range without csum for data reloc inode */
- clear_extent_bits(&inode->io_tree, file_offset, end,
- EXTENT_NODATASUM);
+ btrfs_clear_extent_bits(&inode->io_tree, file_offset, end,
+ EXTENT_NODATASUM);
return true;
}
csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
fs_info->csum_size;
- if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
- csum_expected))
+ kaddr = bvec_kmap_local(bv);
+ if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) {
+ kunmap_local(kaddr);
goto zeroit;
+ }
+ kunmap_local(kaddr);
return true;
zeroit:
@@ -3379,6 +3417,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode)
if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
+ WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
atomic_inc(&fs_info->nr_delayed_iputs);
/*
* Need to be irq safe here because we can be called from either an irq
@@ -3495,11 +3534,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
- struct inode *inode;
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
@@ -3518,6 +3556,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -3572,7 +3612,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, last_objectid, root);
+ inode = btrfs_iget(last_objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL;
@@ -3641,10 +3681,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
- if (!inode || inode->i_nlink) {
+ if (!inode || inode->vfs_inode.i_nlink) {
if (inode) {
- ret = btrfs_drop_verity_items(BTRFS_I(inode));
- iput(inode);
+ ret = btrfs_drop_verity_items(inode);
+ iput(&inode->vfs_inode);
inode = NULL;
if (ret)
goto out;
@@ -3667,7 +3707,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
nr_unlink++;
/* this will do delete_inode and everything for us */
- iput(inode);
+ iput(&inode->vfs_inode);
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3684,19 +3724,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
out:
if (ret)
btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
- btrfs_free_path(path);
return ret;
}
/*
- * very simple check to peek ahead in the leaf looking for xattrs. If we
- * don't find any xattrs, we know there can't be any acls.
+ * Look ahead in the leaf for xattrs. If we don't find any then we know there
+ * can't be any ACLs.
*
- * slot is the slot the inode is in, objectid is the objectid of the inode
+ * @leaf: the eb leaf where to search
+ * @slot: the slot the inode is in
+ * @objectid: the objectid of the inode
+ *
+ * Return true if there is xattr/ACL, false otherwise.
*/
-static noinline int acls_after_inode_item(struct extent_buffer *leaf,
- int slot, u64 objectid,
- int *first_xattr_slot)
+static noinline bool acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid,
+ int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
@@ -3716,58 +3759,120 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- /* we found a different objectid, there must not be acls */
+ /* We found a different objectid, there must be no ACLs. */
if (found_key.objectid != objectid)
- return 0;
+ return false;
- /* we found an xattr, assume we've got an acl */
+ /* We found an xattr, assume we've got an ACL. */
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
if (found_key.offset == xattr_access ||
found_key.offset == xattr_default)
- return 1;
+ return true;
}
/*
- * we found a key greater than an xattr key, there can't
- * be any acls later on
+ * We found a key greater than an xattr key, there can't be any
+ * ACLs later on.
*/
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
- return 0;
+ return false;
slot++;
scanned++;
/*
- * it goes inode, inode backrefs, xattrs, extents,
- * so if there are a ton of hard links to an inode there can
- * be a lot of backrefs. Don't waste time searching too hard,
- * this is just an optimization
+ * The item order goes like:
+ * - inode
+ * - inode backrefs
+ * - xattrs
+ * - extents,
+ *
+ * so if there are lots of hard links to an inode there can be
+ * a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization.
*/
if (scanned >= 8)
break;
}
- /* we hit the end of the leaf before we found an xattr or
- * something larger than an xattr. We have to assume the inode
- * has acls
+ /*
+ * We hit the end of the leaf before we found an xattr or something
+ * larger than an xattr. We have to assume the inode has ACLs.
*/
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
- return 1;
+ return true;
+}
+
+static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (WARN_ON_ONCE(inode->file_extent_tree))
+ return 0;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
+ return 0;
+ if (!S_ISREG(inode->vfs_inode.i_mode))
+ return 0;
+ if (btrfs_is_free_space_inode(inode))
+ return 0;
+
+ inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!inode->file_extent_tree)
+ return -ENOMEM;
+
+ btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
+
+ return 0;
+}
+
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
+
+ if (inode_unhashed(&inode->vfs_inode))
+ return 0;
+
+ if (prealloc) {
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ }
+
+ return 0;
}
/*
- * read an inode from the btree into the in-memory inode
+ * Read a locked inode from the btree into the in-memory inode and add it to
+ * its root list/tree.
+ *
+ * On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode,
- struct btrfs_path *in_path)
+static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = in_path;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode *vfs_inode = &inode->vfs_inode;
struct btrfs_key location;
unsigned long ptr;
int maybe_acls;
@@ -3776,23 +3881,27 @@ static int btrfs_read_locked_inode(struct inode *inode,
bool filled = false;
int first_xattr_slot;
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
- if (!path) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- }
+ ASSERT(path);
- memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+ btrfs_get_inode_key(inode, &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
- if (path != in_path)
- btrfs_free_path(path);
- return ret;
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_lookup_inode(), this means the inode was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
}
leaf = path->nodes[0];
@@ -3802,39 +3911,41 @@ static int btrfs_read_locked_inode(struct inode *inode,
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
- i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
- i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
-
- inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
+
+ inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
btrfs_timespec_nsec(leaf, &inode_item->mtime));
- inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
+ inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
- inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
- BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+ inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
+ inode->generation = btrfs_inode_generation(leaf, inode_item);
+ inode->last_trans = btrfs_inode_transid(leaf, inode_item);
- inode_set_iversion_queried(inode,
- btrfs_inode_sequence(leaf, inode_item));
- inode->i_generation = BTRFS_I(inode)->generation;
- inode->i_rdev = 0;
+ inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
+ vfs_inode->i_generation = inode->generation;
+ vfs_inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
+
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
+ btrfs_update_inode_mapping_flags(inode);
cache_index:
/*
@@ -3846,9 +3957,8 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in the delayed_nodes xarray.
*/
- if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ if (inode->last_trans == btrfs_get_fs_generation(fs_info))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We don't persist the id of the transaction where an unlink operation
@@ -3877,7 +3987,7 @@ cache_index:
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
- BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_unlink_trans = inode->last_trans;
/*
* Same logic as for last_unlink_trans. We don't persist the generation
@@ -3885,15 +3995,15 @@ cache_index:
* operation, so after eviction and reloading the inode we must be
* pessimistic and assume the last transaction that modified the inode.
*/
- BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_reflink_trans = inode->last_trans;
path->slots[0]++;
- if (inode->i_nlink != 1 ||
+ if (vfs_inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
goto cache_acl;
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
- if (location.objectid != btrfs_ino(BTRFS_I(inode)))
+ if (location.objectid != btrfs_ino(inode))
goto cache_acl;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3901,13 +4011,12 @@ cache_index:
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ inode->dir_index = btrfs_inode_ref_index(leaf, ref);
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
- extref);
+ inode->dir_index = btrfs_inode_extref_index(leaf, extref);
}
cache_acl:
/*
@@ -3915,45 +4024,50 @@ cache_acl:
* any xattrs or acls
*/
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
- btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
+ btrfs_ino(inode), &first_xattr_slot);
if (first_xattr_slot != -1) {
path->slots[0] = first_xattr_slot;
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_ino(inode), btrfs_root_id(root), ret);
}
- if (path != in_path)
- btrfs_free_path(path);
if (!maybe_acls)
- cache_no_acl(inode);
+ cache_no_acl(vfs_inode);
- switch (inode->i_mode & S_IFMT) {
+ switch (vfs_inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_fop = &btrfs_file_operations;
+ vfs_inode->i_op = &btrfs_file_inode_operations;
break;
case S_IFDIR:
- inode->i_fop = &btrfs_dir_file_operations;
- inode->i_op = &btrfs_dir_inode_operations;
+ vfs_inode->i_fop = &btrfs_dir_file_operations;
+ vfs_inode->i_op = &btrfs_dir_inode_operations;
break;
case S_IFLNK:
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(vfs_inode);
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
+ vfs_inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
break;
}
btrfs_sync_inode_flags_to_i_flags(inode);
+
+ ret = btrfs_add_inode_to_root(inode, true);
+ if (ret)
+ goto out;
+
return 0;
+out:
+ iget_failed(vfs_inode);
+ return ret;
}
/*
@@ -4012,19 +4126,21 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
+ struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
+ btrfs_get_inode_key(inode, &key);
+ ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto failed;
+ return ret;
}
leaf = path->nodes[0];
@@ -4032,12 +4148,8 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_set_inode_last_trans(trans, inode);
- ret = 0;
-failed:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -4182,6 +4294,7 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
+ inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
ret = btrfs_update_inode(trans, dir);
@@ -4282,9 +4395,9 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
/* This needs to handle no-key deletions later on */
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
- objectid = inode->root->root_key.objectid;
+ objectid = btrfs_root_id(inode->root);
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
- objectid = inode->location.objectid;
+ objectid = inode->ref_root_id;
} else {
WARN_ON(1);
fscrypt_free_filename(&fname);
@@ -4325,11 +4438,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
*/
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
- if (IS_ERR_OR_NULL(di)) {
- if (!di)
- ret = -ENOENT;
- else
- ret = PTR_ERR(di);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4340,7 +4450,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
} else {
ret = btrfs_del_root_ref(trans, objectid,
- root->root_key.objectid, dir_ino,
+ btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4373,7 +4483,7 @@ out:
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct btrfs_key key;
struct fscrypt_str name = FSTR_INIT("default", 7);
@@ -4390,42 +4500,39 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
dir_id, &name, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.objectid == root->root_key.objectid) {
+ if (key.objectid == btrfs_root_id(root)) {
ret = -EPERM;
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
key.objectid);
- goto out;
+ return ret;
}
btrfs_release_path(path);
}
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = 0;
if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == root->root_key.objectid &&
- key.type == BTRFS_ROOT_REF_KEY)
+ if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -4433,64 +4540,26 @@ out:
static void btrfs_prune_dentries(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
- u64 objectid = 0;
+ struct btrfs_inode *inode;
+ u64 min_ino = 0;
if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ d_prune_aliases(&inode->vfs_inode);
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = btrfs_ino(entry) + 1;
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- if (atomic_read(&inode->i_count) > 1)
- d_prune_aliases(inode);
- /*
- * btrfs_drop_inode will have it removed from the inode
- * cache when its usage count hits zero.
- */
- iput(inode);
- cond_resched();
- spin_lock(&root->inode_lock);
- goto again;
- }
-
- if (cond_resched_lock(&root->inode_lock))
- goto again;
-
- node = rb_next(node);
+ min_ino = btrfs_ino(inode) + 1;
+ /*
+ * btrfs_drop_inode() will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(&inode->vfs_inode);
+ cond_resched();
+ inode = btrfs_find_first_inode(root, min_ino);
}
- spin_unlock(&root->inode_lock);
}
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
@@ -4517,7 +4586,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
+ btrfs_root_id(dest));
ret = -EPERM;
goto out_up_write;
}
@@ -4525,7 +4594,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EPERM;
goto out_up_write;
}
@@ -4554,11 +4623,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = PTR_ERR(trans);
goto out_release;
}
- ret = btrfs_record_root_in_trans(trans, root);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_end_trans;
- }
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
@@ -4586,7 +4650,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4594,8 +4658,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
}
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- dest->root_key.objectid);
+ BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4604,7 +4667,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans,
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4645,7 +4708,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- int err = 0;
+ int ret = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
struct fscrypt_name fname;
@@ -4661,33 +4724,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
}
- err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
- if (err)
- return err;
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
/* This needs to handle no-key deletions later on */
trans = __unlink_start_trans(BTRFS_I(dir));
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_notrans;
}
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
}
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret)
goto out;
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!err) {
+ if (!ret) {
btrfs_i_size_write(BTRFS_I(inode), 0);
/*
* Propagate the last_unlink_trans value of the deleted dir to
@@ -4709,23 +4772,83 @@ out_notrans:
btrfs_btree_balance_dirty(fs_info);
fscrypt_free_filename(&fname);
- return err;
+ return ret;
+}
+
+static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize)
+{
+ ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u",
+ blockstart, blocksize);
+
+ if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1)
+ return true;
+ return false;
+}
+
+static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start)
+{
+ const pgoff_t index = (start >> PAGE_SHIFT);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct folio *folio;
+ u64 zero_start;
+ u64 zero_end;
+ int ret = 0;
+
+again:
+ folio = filemap_lock_folio(mapping, index);
+ /* No folio present. */
+ if (IS_ERR(folio))
+ return 0;
+
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
+ if (!folio_test_uptodate(folio)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ folio_wait_writeback(folio);
+
+ /*
+ * We do not need to lock extents nor wait for OE, as it's already
+ * beyond EOF.
+ */
+
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = folio_pos(folio) + folio_size(folio) - 1;
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start + 1);
+
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
}
/*
- * Read, zero a chunk and write a block.
+ * Handle the truncation of a fs block.
*
- * @inode - inode that we're zeroing
- * @from - the offset to start zeroing
- * @len - the length to zero, 0 to zero the entire range respective to the
- * offset
- * @front - zero up to the offset instead of from the offset on
+ * @inode - inode that we're zeroing
+ * @offset - the file offset of the block to truncate
+ * The value must be inside [@start, @end], and the function will do
+ * extra checks if the block that covers @offset needs to be zeroed.
+ * @start - the start file offset of the range we want to zero
+ * @end - the end (inclusive) file offset of the range we want to zero.
*
- * This will find the block for the "from" offset and cow the block and zero the
- * part we want to zero. This is used with truncate and hole punching.
+ * If the range is not block aligned, read out the folio that covers @offset,
+ * and if needed zero blocks that are inside the folio and covered by [@start, @end).
+ * If @start or @end + 1 lands inside a block, that block will be marked dirty
+ * for writeback.
+ *
+ * This is utilized by hole punch, zero range, file expansion.
*/
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
@@ -4735,20 +4858,56 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
struct extent_changeset *data_reserved = NULL;
bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (blocksize - 1);
+ pgoff_t index = (offset >> PAGE_SHIFT);
struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
size_t write_bytes = blocksize;
int ret = 0;
+ const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
+ blocksize);
+ const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
+ blocksize);
+ bool need_truncate_head = false;
+ bool need_truncate_tail = false;
+ u64 zero_start;
+ u64 zero_end;
u64 block_start;
u64 block_end;
- if (IS_ALIGNED(offset, blocksize) &&
- (!len || IS_ALIGNED(len, blocksize)))
+ /* @offset should be inside the range. */
+ ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
+ offset, start, end);
+
+ /* The range is aligned at both ends. */
+ if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) {
+ /*
+ * For block size < page size case, we may have polluted blocks
+ * beyond EOF. So we also need to zero them out.
+ */
+ if (end == (u64)-1 && blocksize < PAGE_SIZE)
+ ret = truncate_block_zero_beyond_eof(inode, start);
goto out;
+ }
- block_start = round_down(from, blocksize);
+ /*
+ * @offset may not be inside the head nor tail block. In that case we
+ * don't need to do anything.
+ */
+ if (!in_head_block && !in_tail_block)
+ goto out;
+
+ /*
+ * Skip the truncatioin if the range in the target block is already aligned.
+ * The seemingly complex check will also handle the same block case.
+ */
+ if (in_head_block && !IS_ALIGNED(start, blocksize))
+ need_truncate_head = true;
+ if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
+ need_truncate_tail = true;
+ if (!need_truncate_head && !need_truncate_tail)
+ goto out;
+
+ block_start = round_down(offset, blocksize);
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
@@ -4772,10 +4931,13 @@ again:
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio)) {
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
- ret = -ENOMEM;
+ ret = PTR_ERR(folio);
goto out;
}
@@ -4805,11 +4967,11 @@ again:
folio_wait_writeback(folio);
- lock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
folio_unlock(folio);
folio_put(folio);
btrfs_start_ordered_extent(ordered);
@@ -4817,37 +4979,46 @@ again:
goto again;
}
- clear_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
- if (offset != blocksize) {
- if (!len)
- len = blocksize - offset;
- if (front)
- folio_zero_range(folio, block_start - folio_pos(folio),
- offset);
- else
- folio_zero_range(folio,
- (block_start - folio_pos(folio)) + offset,
- len);
+ if (end == (u64)-1) {
+ /*
+ * We're truncating beyond EOF, the remaining blocks normally are
+ * already holes thus no need to zero again, but it's possible for
+ * fs block size < page size cases to have memory mapped writes
+ * to pollute ranges beyond EOF.
+ *
+ * In that case although such polluted blocks beyond EOF will
+ * not reach disk, it still affects our page caches.
+ */
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1,
+ end);
+ } else {
+ zero_start = max_t(u64, block_start, start);
+ zero_end = min_t(u64, block_end, end);
}
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start + 1);
+
btrfs_folio_clear_checked(fs_info, folio, block_start,
block_end + 1 - block_start);
btrfs_folio_set_dirty(fs_info, folio, block_start,
block_end + 1 - block_start);
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
- set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, NULL);
+ btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, NULL);
out_unlock:
if (ret) {
@@ -4933,16 +5104,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err = 0;
+ int ret = 0;
/*
* If our size started in the middle of a block we need to zero out the
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- err = btrfs_truncate_block(inode, oldsize, 0, 0);
- if (err)
- return err;
+ ret = btrfs_truncate_block(inode, oldsize, oldsize, -1);
+ if (ret)
+ return ret;
if (size <= hole_start)
return 0;
@@ -4953,27 +5124,27 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
while (1) {
em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
if (IS_ERR(em)) {
- err = PTR_ERR(em);
+ ret = PTR_ERR(em);
em = NULL;
break;
}
- last_byte = min(extent_map_end(em), block_end);
+ last_byte = min(btrfs_extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(inode, cur_offset, hole_size);
- if (err)
+ ret = maybe_insert_hole(inode, cur_offset, hole_size);
+ if (ret)
break;
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
- hole_em = alloc_extent_map();
+ hole_em = btrfs_alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_map_range(inode, cur_offset,
cur_offset + hole_size - 1,
@@ -4983,32 +5154,30 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
}
hole_em->start = cur_offset;
hole_em->len = hole_size;
- hole_em->orig_start = cur_offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
hole_em->ram_bytes = hole_size;
hole_em->generation = btrfs_get_fs_generation(fs_info);
- err = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
+ btrfs_free_extent_map(hole_em);
} else {
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
}
next:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
- free_extent_map(em);
- unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
- return err;
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
+ return ret;
}
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
@@ -5065,7 +5234,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
if (btrfs_is_zoned(fs_info)) {
- ret = btrfs_wait_ordered_range(inode,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode),
ALIGN(newsize, fs_info->sectorsize),
(u64)-1);
if (ret)
@@ -5095,7 +5264,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (err)
return err;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
@@ -5191,7 +5360,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent(io_tree, start, end, &cached_state);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5205,9 +5374,9 @@ static void evict_inode_truncate_pages(struct inode *inode)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
end - start + 1, NULL);
- clear_extent_bit(io_tree, start, end,
- EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
- &cached_state);
+ btrfs_clear_extent_bit(io_tree, start, end,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
+ &cached_state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -5284,7 +5453,7 @@ void btrfs_evict_inode(struct inode *inode)
if (inode->i_nlink &&
((btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto out;
@@ -5296,7 +5465,7 @@ void btrfs_evict_inode(struct inode *inode)
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
goto out;
}
@@ -5393,7 +5562,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
struct btrfs_key *location, u8 *type)
{
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = dir->root;
int ret = 0;
struct fscrypt_name fname;
@@ -5404,7 +5573,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
if (ret < 0)
- goto out;
+ return ret;
/*
* fscrypt_setup_filename() should never return a positive value, but
* gcc on sparc/parisc thinks it can, so assert that doesn't happen.
@@ -5433,7 +5602,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
*type = btrfs_dir_ftype(path->nodes[0], di);
out:
fscrypt_free_filename(&fname);
- btrfs_free_path(path);
return ret;
}
@@ -5448,7 +5616,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
struct btrfs_root **sub_root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
@@ -5468,7 +5636,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
}
err = -ENOENT;
- key.objectid = dir->root->root_key.objectid;
+ key.objectid = btrfs_root_id(dir->root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
@@ -5504,64 +5672,28 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
location->offset = 0;
err = 0;
out:
- btrfs_free_path(path);
fscrypt_free_filename(&fname);
return err;
}
-static void inode_tree_add(struct btrfs_inode *inode)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_inode *entry;
- struct rb_node **p;
- struct rb_node *parent;
- struct rb_node *new = &inode->rb_node;
- u64 ino = btrfs_ino(inode);
-
- if (inode_unhashed(&inode->vfs_inode))
- return;
- parent = NULL;
- spin_lock(&root->inode_lock);
- p = &root->inode_tree.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_inode, rb_node);
- if (ino < btrfs_ino(entry))
- p = &parent->rb_left;
- else if (ino > btrfs_ino(entry))
- p = &parent->rb_right;
- else {
- WARN_ON(!(entry->vfs_inode.i_state &
- (I_WILL_FREE | I_FREEING)));
- rb_replace_node(parent, new, &root->inode_tree);
- RB_CLEAR_NODE(parent);
- spin_unlock(&root->inode_lock);
- return;
- }
- }
- rb_link_node(new, parent, p);
- rb_insert_color(new, &root->inode_tree);
- spin_unlock(&root->inode_lock);
-}
-static void inode_tree_del(struct btrfs_inode *inode)
+static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
- int empty = 0;
+ struct btrfs_inode *entry;
+ bool empty = false;
- spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&inode->rb_node)) {
- rb_erase(&inode->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&inode->rb_node);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- }
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ if (entry == inode)
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- spin_lock(&root->inode_lock);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty)
btrfs_add_dead_root(root);
}
@@ -5572,10 +5704,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->ino;
- BTRFS_I(inode)->location.objectid = args->ino;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
+ btrfs_set_inode_number(BTRFS_I(inode), args->ino);
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
if (args->root && args->root == args->root->fs_info->tree_root &&
@@ -5589,12 +5718,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == BTRFS_I(inode)->location.objectid &&
+ return args->ino == btrfs_ino(BTRFS_I(inode)) &&
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
- struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5603,87 +5731,106 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
- return inode;
+ if (!inode)
+ return NULL;
+ return BTRFS_I(inode);
}
/*
- * Get an inode object given its inode number and corresponding root.
- * Path can be preallocated to prevent recursing back to iget through
- * allocator. NULL is also valid but may require an additional allocation
- * later.
+ * Get an inode object given its inode number and corresponding root. Path is
+ * preallocated to prevent recursing back to iget through allocator.
*/
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path)
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
+ int ret;
- inode = btrfs_iget_locked(s, ino, root);
+ inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- int ret;
+ if (!(inode->vfs_inode.i_state & I_NEW))
+ return inode;
- ret = btrfs_read_locked_inode(inode, path);
- if (!ret) {
- inode_tree_add(BTRFS_I(inode));
- unlock_new_inode(inode);
- } else {
- iget_failed(inode);
- /*
- * ret > 0 can come from btrfs_search_slot called by
- * btrfs_read_locked_inode, this means the inode item
- * was not found.
- */
- if (ret > 0)
- ret = -ENOENT;
- inode = ERR_PTR(ret);
- }
- }
+ ret = btrfs_read_locked_inode(inode, path);
+ if (ret)
+ return ERR_PTR(ret);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
+/*
+ * Get an inode object given its inode number and corresponding root.
+ */
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(s, ino, root, NULL);
+ struct btrfs_inode *inode;
+ struct btrfs_path *path;
+ int ret;
+
+ inode = btrfs_iget_locked(ino, root);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!(inode->vfs_inode.i_state & I_NEW))
+ return inode;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ iget_failed(&inode->vfs_inode);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ret = btrfs_read_locked_inode(inode, path);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ unlock_new_inode(&inode->vfs_inode);
+ return inode;
}
-static struct inode *new_simple_dir(struct inode *dir,
- struct btrfs_key *key,
- struct btrfs_root *root)
+static struct btrfs_inode *new_simple_dir(struct inode *dir,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
{
struct timespec64 ts;
- struct inode *inode = new_inode(dir->i_sb);
+ struct inode *vfs_inode;
+ struct btrfs_inode *inode;
- if (!inode)
+ vfs_inode = new_inode(dir->i_sb);
+ if (!vfs_inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = btrfs_grab_root(root);
- memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
- set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+ inode = BTRFS_I(vfs_inode);
+ inode->root = btrfs_grab_root(root);
+ inode->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
+ set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
- inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
*/
- inode->i_op = &simple_dir_inode_operations;
- inode->i_opflags &= ~IOP_XATTR;
- inode->i_fop = &simple_dir_operations;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ vfs_inode->i_op = &simple_dir_inode_operations;
+ vfs_inode->i_opflags &= ~IOP_XATTR;
+ vfs_inode->i_fop = &simple_dir_operations;
+ vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- ts = inode_set_ctime_current(inode);
- inode_set_mtime_to_ts(inode, ts);
- inode_set_atime_to_ts(inode, inode_get_atime(dir));
- BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
- BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+ ts = inode_set_ctime_current(vfs_inode);
+ inode_set_mtime_to_ts(vfs_inode, ts);
+ inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
+ inode->i_otime_sec = ts.tv_sec;
+ inode->i_otime_nsec = ts.tv_nsec;
- inode->i_uid = dir->i_uid;
- inode->i_gid = dir->i_gid;
+ vfs_inode->i_uid = dir->i_uid;
+ vfs_inode->i_gid = dir->i_gid;
return inode;
}
@@ -5697,18 +5844,18 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
-static inline u8 btrfs_inode_type(struct inode *inode)
+static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
{
- return fs_umode_to_ftype(inode->i_mode);
+ return fs_umode_to_ftype(inode->vfs_inode.i_mode);
}
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
- struct btrfs_key location;
+ struct btrfs_key location = { 0 };
u8 di_type = 0;
int ret = 0;
@@ -5720,20 +5867,20 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, location.objectid, root);
+ inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
if (btrfs_inode_type(inode) != di_type) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
- inode->i_mode, btrfs_inode_type(inode),
+ inode->vfs_inode.i_mode, btrfs_inode_type(inode),
di_type);
- iput(inode);
+ iput(&inode->vfs_inode);
return ERR_PTR(-EUCLEAN);
}
- return inode;
+ return &inode->vfs_inode;
}
ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
@@ -5744,23 +5891,26 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir, &location, root);
} else {
- inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
+ inode = btrfs_iget(location.objectid, sub_root);
btrfs_put_root(sub_root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
down_read(&fs_info->cleanup_work_sem);
- if (!sb_rdonly(inode->i_sb))
+ if (!sb_rdonly(inode->vfs_inode.i_sb))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&fs_info->cleanup_work_sem);
if (ret) {
- iput(inode);
+ iput(&inode->vfs_inode);
inode = ERR_PTR(ret);
}
}
- return inode;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return &inode->vfs_inode;
}
static int btrfs_dentry_delete(const struct dentry *dentry)
@@ -5800,7 +5950,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key, found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
@@ -5814,15 +5964,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
/* FIXME: we should be able to handle this */
if (ret == 0)
- goto out;
- ret = 0;
+ return ret;
if (path->slots[0] == 0) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
path->slots[0]--;
@@ -5833,13 +5982,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
inode->index_cnt = found_key.offset + 1;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
@@ -5942,7 +6090,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
void *addr;
LIST_HEAD(ins_list);
LIST_HEAD(del_list);
@@ -5964,7 +6112,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
addr = private->filldir_buf;
path->reada = READA_FORWARD;
- put = btrfs_readdir_get_delayed_items(inode, private->last_index,
+ put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
&ins_list, &del_list);
again:
@@ -6037,7 +6185,7 @@ again:
* offset. This means that new entries created during readdir
* are *guaranteed* to be seen in the future by that readdir.
* This has broken buggy programs which operate on names as
- * they're returned by readdir. Until we re-use freed offsets
+ * they're returned by readdir. Until we reuse freed offsets
* we have this hack to stop new entries from being returned
* under the assumption that they'll never reach this huge
* offset.
@@ -6054,8 +6202,7 @@ nopos:
ret = 0;
err:
if (put)
- btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
- btrfs_free_path(path);
+ btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
return ret;
}
@@ -6139,7 +6286,7 @@ static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
- args.ino = BTRFS_I(inode)->location.objectid;
+ args.ino = btrfs_ino(BTRFS_I(inode));
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
@@ -6233,7 +6380,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
inode->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
}
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
@@ -6246,7 +6393,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
- struct btrfs_key *location;
struct btrfs_path *path;
u64 objectid;
struct btrfs_inode_ref *ref;
@@ -6255,6 +6401,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_item_batch batch;
unsigned long ptr;
int ret;
+ bool xa_reserved = false;
path = btrfs_alloc_path();
if (!path)
@@ -6264,10 +6411,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
root = BTRFS_I(inode)->root;
+ ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ if (ret)
+ goto out;
+
ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
- inode->i_ino = objectid;
+ btrfs_set_inode_number(BTRFS_I(inode), objectid);
+
+ ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
+ if (ret)
+ goto out;
+ xa_reserved = true;
if (args->orphan) {
/*
@@ -6282,8 +6438,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- /* index_cnt is ignored for everything but a dir. */
- BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
@@ -6308,13 +6466,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (btrfs_test_opt(fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
+ btrfs_update_inode_mapping_flags(BTRFS_I(inode));
}
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
ret = btrfs_insert_inode_locked(inode);
if (ret < 0) {
if (!args->orphan)
@@ -6397,7 +6551,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
/*
* We don't need the path anymore, plus inheriting properties, adding
* ACLs, security xattrs, orphan item or adding the link, will result in
@@ -6407,28 +6560,28 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
path = NULL;
if (args->subvol) {
- struct inode *parent;
+ struct btrfs_inode *parent;
/*
* Subvolumes inherit properties from their parent subvolume,
* not the directory they were created in.
*/
- parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
- BTRFS_I(dir)->root);
+ parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
- ret = btrfs_inode_inherit_props(trans, inode, parent);
- iput(parent);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ parent);
+ iput(&parent->vfs_inode);
}
} else {
- ret = btrfs_inode_inherit_props(trans, inode, dir);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ BTRFS_I(dir));
}
if (ret) {
btrfs_err(fs_info,
"error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
- ret);
+ btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
}
/*
@@ -6443,7 +6596,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- inode_tree_add(BTRFS_I(inode));
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
+ if (WARN_ON(ret)) {
+ /* Shouldn't happen, we used xa_reserve() before. */
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
@@ -6471,6 +6629,9 @@ discard:
ihold(inode);
discard_new_inode(inode);
out:
+ if (xa_reserved)
+ xa_release(&root->inodes, objectid);
+
btrfs_free_path(path);
return ret;
}
@@ -6501,7 +6662,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
index, name);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root, name,
@@ -6513,7 +6674,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
- btrfs_inode_type(&inode->vfs_inode), index);
+ btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
@@ -6544,7 +6705,7 @@ fail_dir_item:
u64 local_index;
int err;
err = btrfs_del_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
&local_index, name);
if (err)
btrfs_abort_transaction(trans, err);
@@ -6642,7 +6803,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
+ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6713,26 +6874,27 @@ fail:
return err;
}
-static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = new_inode(dir->i_sb);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- return btrfs_create_common(dir, dentry, inode);
+ return ERR_PTR(btrfs_create_common(dir, dentry, inode));
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct page *page,
+ struct folio *folio,
struct btrfs_file_extent_item *item)
{
int ret;
struct extent_buffer *leaf = path->nodes[0];
+ const u32 blocksize = leaf->fs_info->sectorsize;
char *tmp;
size_t max_size;
unsigned long inline_size;
@@ -6749,8 +6911,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
- max_size = min_t(unsigned long, PAGE_SIZE, max_size);
- ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
+ max_size = min_t(unsigned long, blocksize, max_size);
+ ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
+ max_size);
/*
* decompression code contains a memset to fill in any space between the end
@@ -6760,37 +6923,37 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size < PAGE_SIZE)
- memzero_page(page, max_size, PAGE_SIZE - max_size);
+ if (max_size < blocksize)
+ folio_zero_range(folio, max_size, blocksize - max_size);
kfree(tmp);
return ret;
}
-static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
- struct page *page)
+static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
+ const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
- if (!page || PageUptodate(page))
+ if (!folio || folio_test_uptodate(folio))
return 0;
- ASSERT(page_offset(page) == 0);
+ ASSERT(folio_pos(folio) == 0);
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
- return uncompress_inline(path, page, fi);
+ return uncompress_inline(path, folio, fi);
- copy_size = min_t(u64, PAGE_SIZE,
+ copy_size = min_t(u64, blocksize,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
- if (copy_size < PAGE_SIZE)
- memzero_page(page, copy_size, PAGE_SIZE - copy_size);
+ if (copy_size < blocksize)
+ folio_zero_range(folio, copy_size, blocksize - copy_size);
return 0;
}
@@ -6812,7 +6975,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, u64 start, u64 len)
+ struct folio *folio, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
@@ -6829,26 +6992,25 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct extent_map_tree *em_tree = &inode->extent_tree;
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
- free_extent_map(em);
- else if (em->block_start == EXTENT_MAP_INLINE && page)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
+ else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
+ btrfs_free_extent_map(em);
else
goto out;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
- em->orig_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
em->len = (u64)-1;
- em->block_len = (u64)-1;
path = btrfs_alloc_path();
if (!path) {
@@ -6938,9 +7100,8 @@ next:
/* New extent overlaps with existing one */
em->start = start;
- em->orig_start = start;
em->len = found_key.offset - start;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
goto insert;
}
@@ -6964,23 +7125,22 @@ next:
*
* Other members are not utilized for inline extents.
*/
- ASSERT(em->block_start == EXTENT_MAP_INLINE);
+ ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
- ret = read_inline_extent(inode, path, page);
+ ret = read_inline_extent(path, folio);
if (ret < 0)
goto out;
goto insert;
}
not_found:
em->start = start;
- em->orig_start = start;
em->len = len;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
insert:
ret = 0;
btrfs_release_path(path);
- if (em->start > start || extent_map_end(em) <= start) {
+ if (em->start > start || btrfs_extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
@@ -6989,7 +7149,7 @@ insert:
}
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
@@ -6997,90 +7157,12 @@ out:
trace_btrfs_get_extent(root, inode, em);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- const u64 start,
- const u64 len,
- const u64 orig_start,
- const u64 block_start,
- const u64 block_len,
- const u64 orig_block_len,
- const u64 ram_bytes,
- const int type)
-{
- struct extent_map *em = NULL;
- struct btrfs_ordered_extent *ordered;
-
- if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start, block_start,
- block_len, orig_block_len, ram_bytes,
- BTRFS_COMPRESS_NONE, /* compress_type */
- type);
- if (IS_ERR(em))
- goto out;
- }
- ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
- block_start, block_len, 0,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT),
- BTRFS_COMPRESS_NONE);
- if (IS_ERR(ordered)) {
- if (em) {
- free_extent_map(em);
- btrfs_drop_extent_map_range(inode, start,
- start + len - 1, false);
- }
- em = ERR_CAST(ordered);
- } else {
- ASSERT(!dio_data->ordered);
- dio_data->ordered = ordered;
- }
- out:
-
- return em;
-}
-
-static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_map *em;
- struct btrfs_key ins;
- u64 alloc_hint;
- int ret;
-
- alloc_hint = get_extent_allocation_hint(inode, start, len);
-again:
- ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
- 0, alloc_hint, &ins, 1, 1);
- if (ret == -EAGAIN) {
- ASSERT(btrfs_is_zoned(fs_info));
- wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
- TASK_UNINTERRUPTIBLE);
- goto again;
- }
- if (ret)
- return ERR_PTR(ret);
-
- em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
- ins.objectid, ins.offset, ins.offset,
- ins.offset, BTRFS_ORDERED_REGULAR);
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
- 1);
-
- return em;
-}
-
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *block_group;
@@ -7103,8 +7185,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
- * @strict: if true, omit optimizations that might force us into unnecessary
- * cow. e.g., don't trust generation number.
*
* Return:
* >0 and update @len if we can do nocow write
@@ -7114,17 +7194,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* NOTE: This only checks the file extents, caller is responsible to wait for
* any ordered extents.
*/
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict)
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
+ struct btrfs_file_extent *file_extent,
+ bool nowait)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct can_nocow_file_extent_args nocow_args = { 0 };
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *leaf;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
int found_type;
@@ -7134,806 +7214,146 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
return -ENOMEM;
path->nowait = nowait;
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), offset, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ offset, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 1) {
if (path->slots[0] == 0) {
- /* can't find the item, must cow */
- ret = 0;
- goto out;
+ /* Can't find the item, must COW. */
+ return 0;
}
path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
- /* not our file or wrong item type, must cow */
- goto out;
+ /* Not our file or wrong item type, must COW. */
+ return 0;
}
if (key.offset > offset) {
- /* Wrong offset, must cow */
- goto out;
+ /* Wrong offset, must COW. */
+ return 0;
}
if (btrfs_file_extent_end(path) <= offset)
- goto out;
+ return 0;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
- if (ram_bytes)
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
nocow_args.start = offset;
nocow_args.end = offset + *len - 1;
- nocow_args.strict = strict;
nocow_args.free_path = true;
- ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
/* can_nocow_file_extent() has freed the path. */
path = NULL;
if (ret != 1) {
/* Treat errors as not being able to NOCOW. */
- ret = 0;
- goto out;
+ return 0;
}
- ret = 0;
- if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
- goto out;
+ if (btrfs_extent_readonly(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset))
+ return 0;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + nocow_args.num_bytes,
+ range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
- if (ret) {
- ret = -EAGAIN;
- goto out;
- }
+ ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
+ EXTENT_DELALLOC);
+ if (ret)
+ return -EAGAIN;
}
- if (orig_start)
- *orig_start = key.offset - nocow_args.extent_offset;
- if (orig_block_len)
- *orig_block_len = nocow_args.disk_num_bytes;
-
- *len = nocow_args.num_bytes;
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state,
- unsigned int iomap_flags)
-{
- const bool writing = (iomap_flags & IOMAP_WRITE);
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- int ret = 0;
-
- while (1) {
- if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state))
- return -EAGAIN;
- } else {
- lock_extent(io_tree, lockstart, lockend, cached_state);
- }
- /*
- * We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure there's no ordered
- * extents in this range.
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
- lockend - lockstart + 1);
-
- /*
- * We need to make sure there are no buffered pages in this
- * range either, we could have raced between the invalidate in
- * generic_file_direct_write and locking the extent. The
- * invalidate needs to happen so that reads after a write do not
- * get stale data.
- */
- if (!ordered &&
- (!writing || !filemap_range_has_page(inode->i_mapping,
- lockstart, lockend)))
- break;
+ if (file_extent)
+ memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
- unlock_extent(io_tree, lockstart, lockend, cached_state);
+ *len = nocow_args.file_extent.num_bytes;
- if (ordered) {
- if (nowait) {
- btrfs_put_ordered_extent(ordered);
- ret = -EAGAIN;
- break;
- }
- /*
- * If we are doing a DIO read and the ordered extent we
- * found is for a buffered write, we can not wait for it
- * to complete and retry, because if we do so we can
- * deadlock with concurrent buffered writes on page
- * locks. This happens only if our DIO read covers more
- * than one extent map, if at this point has already
- * created an ordered extent for a previous extent map
- * and locked its range in the inode's io tree, and a
- * concurrent write against that previous extent map's
- * range and this range started (we unlock the ranges
- * in the io tree only when the bios complete and
- * buffered writes always lock pages before attempting
- * to lock range in the io tree).
- */
- if (writing ||
- test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(ordered);
- else
- ret = nowait ? -EAGAIN : -ENOTBLK;
- btrfs_put_ordered_extent(ordered);
- } else {
- /*
- * We could trigger writeback for this range (and wait
- * for it to complete) and then invalidate the pages for
- * this range (through invalidate_inode_pages2_range()),
- * but that can lead us to a deadlock with a concurrent
- * call to readahead (a buffered read or a defrag call
- * triggered a readahead) on a page lock due to an
- * ordered dio extent we created before but did not have
- * yet a corresponding bio submitted (whence it can not
- * complete), which makes readahead wait for that
- * ordered extent to complete while holding a lock on
- * that page.
- */
- ret = nowait ? -EAGAIN : -ENOTBLK;
- }
-
- if (ret)
- break;
-
- cond_resched();
- }
-
- return ret;
+ return 1;
}
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type)
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+ const struct btrfs_file_extent *file_extent,
+ int type)
{
struct extent_map *em;
int ret;
+ /*
+ * Note the missing NOCOW type.
+ *
+ * For pure NOCOW writes, we should not create an io extent map, but
+ * just reusing the existing one.
+ * Only PREALLOC writes (NOCOW write into preallocated range) can
+ * create an io extent map.
+ */
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED ||
- type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em = alloc_extent_map();
+ switch (type) {
+ case BTRFS_ORDERED_PREALLOC:
+ /* We're only referring part of a larger preallocated extent. */
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
+ break;
+ case BTRFS_ORDERED_REGULAR:
+ /* COW results a new extent matching our file extent size. */
+ ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
+ ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
+
+ /* Since it's a new extent, we should not have any offset. */
+ ASSERT(file_extent->offset == 0);
+ break;
+ case BTRFS_ORDERED_COMPRESSED:
+ /* Must be compressed. */
+ ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
+
+ /*
+ * Encoded write can make us to refer to part of the
+ * uncompressed extent.
+ */
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
+ break;
+ }
+
+ em = btrfs_alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
em->start = start;
- em->orig_start = orig_start;
- em->len = len;
- em->block_len = block_len;
- em->block_start = block_start;
- em->orig_block_len = orig_block_len;
- em->ram_bytes = ram_bytes;
+ em->len = file_extent->num_bytes;
+ em->disk_bytenr = file_extent->disk_bytenr;
+ em->disk_num_bytes = file_extent->disk_num_bytes;
+ em->ram_bytes = file_extent->ram_bytes;
em->generation = -1;
+ em->offset = file_extent->offset;
em->flags |= EXTENT_FLAG_PINNED;
- if (type == BTRFS_ORDERED_PREALLOC)
- em->flags |= EXTENT_FLAG_FILLING;
- else if (type == BTRFS_ORDERED_COMPRESSED)
- extent_map_set_compression(em, compress_type);
+ if (type == BTRFS_ORDERED_COMPRESSED)
+ btrfs_extent_map_set_compression(em, file_extent->compression);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
- /* em got 2 refs now, callers needs to do free_extent_map once. */
+ /* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */
return em;
}
-
-static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 *lenp,
- unsigned int iomap_flags)
-{
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_map *em = *map;
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
- struct btrfs_block_group *bg;
- bool can_nocow = false;
- bool space_reserved = false;
- u64 len = *lenp;
- u64 prev_len;
- int ret = 0;
-
- /*
- * We don't allocate a new extent in the following cases
- *
- * 1) The inode is marked as NODATACOW. In this case we'll just use the
- * existing extent.
- * 2) The extent is marked as PREALLOC. We're good to go here and can
- * just use the extent.
- *
- */
- if ((em->flags & EXTENT_FLAG_PREALLOC) ||
- ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- em->block_start != EXTENT_MAP_HOLE)) {
- if (em->flags & EXTENT_FLAG_PREALLOC)
- type = BTRFS_ORDERED_PREALLOC;
- else
- type = BTRFS_ORDERED_NOCOW;
- len = min(len, em->len - (start - em->start));
- block_start = em->block_start + (start - em->start);
-
- if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false, false) == 1) {
- bg = btrfs_inc_nocow_writers(fs_info, block_start);
- if (bg)
- can_nocow = true;
- }
- }
-
- prev_len = len;
- if (can_nocow) {
- struct extent_map *em2;
-
- /* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- nowait);
- if (ret < 0) {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
- btrfs_dec_nocow_writers(bg);
- if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
- ret = -EAGAIN;
- goto out;
- }
- space_reserved = true;
-
- em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
- btrfs_dec_nocow_writers(bg);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- *map = em2;
- em = em2;
- }
-
- if (IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto out;
- }
-
- dio_data->nocow_done = true;
- } else {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
-
- if (nowait) {
- ret = -EAGAIN;
- goto out;
- }
-
- /*
- * If we could not allocate data space before locking the file
- * range and we can't do a NOCOW write, then we have to fail.
- */
- if (!dio_data->data_space_reserved) {
- ret = -ENOSPC;
- goto out;
- }
-
- /*
- * We have to COW and we have already reserved data space before,
- * so now we reserve only metadata.
- */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- false);
- if (ret < 0)
- goto out;
- space_reserved = true;
-
- em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
- }
- *map = em;
- len = min(len, em->len - (start - em->start));
- if (len < prev_len)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- prev_len - len, true);
- }
-
- /*
- * We have created our ordered extent, so we can now release our reservation
- * for an outstanding extent.
- */
- btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
-
- /*
- * Need to update the i_size under the extent lock so buffered
- * readers will get the updated i_size when we unlock.
- */
- if (start + len > i_size_read(inode))
- i_size_write(inode, start + len);
-out:
- if (ret && space_reserved) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), len);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
- }
- *lenp = len;
- return ret;
-}
-
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
- loff_t length, unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = iter->private;
- u64 lockstart, lockend;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
- u64 len = length;
- const u64 data_alloc_len = length;
- bool unlock_extents = false;
-
- /*
- * We could potentially fault if we have a buffer > PAGE_SIZE, and if
- * we're NOWAIT we may submit a bio for a partial range and return
- * EIOCBQUEUED, which would result in an errant short read.
- *
- * The best way to handle this would be to allow for partial completions
- * of iocb's, so we could submit the partial bio, return and fault in
- * the rest of the pages, and then submit the io for the rest of the
- * range. However we don't have that currently, so simply return
- * -EAGAIN at this point so that the normal path is used.
- */
- if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
- return -EAGAIN;
-
- /*
- * Cap the size of reads to that usually seen in buffered I/O as we need
- * to allocate a contiguous array for the checksums.
- */
- if (!write)
- len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
-
- lockstart = start;
- lockend = start + len - 1;
-
- /*
- * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
- * enough if we've written compressed pages to this area, so we need to
- * flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk - the first flush only starts
- * compression on the data, while keeping the pages locked, so by the
- * time the second flush returns we know bios for the compressed pages
- * were submitted and finished, and the pages no longer under writeback.
- *
- * If we have a NOWAIT request and we have any pages in the range that
- * are locked, likely due to compression still in progress, we don't want
- * to block on page locks. We also don't want to block on pages marked as
- * dirty or under writeback (same as for the non-compression case).
- * iomap_dio_rw() did the same check, but after that and before we got
- * here, mmap'ed writes may have happened or buffered reads started
- * (readpage() and readahead(), which lock pages), as we haven't locked
- * the file range yet.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags)) {
- if (flags & IOMAP_NOWAIT) {
- if (filemap_range_needs_writeback(inode->i_mapping,
- lockstart, lockend))
- return -EAGAIN;
- } else {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
- if (ret)
- return ret;
- }
- }
-
- memset(dio_data, 0, sizeof(*dio_data));
-
- /*
- * We always try to allocate data space and must do it before locking
- * the file range, to avoid deadlocks with concurrent writes to the same
- * range if the range has several extents and the writes don't expand the
- * current i_size (the inode lock is taken in shared mode). If we fail to
- * allocate data space here we continue and later, after locking the
- * file range, we fail with ENOSPC only if we figure out we can not do a
- * NOCOW write.
- */
- if (write && !(flags & IOMAP_NOWAIT)) {
- ret = btrfs_check_data_free_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, data_alloc_len, false);
- if (!ret)
- dio_data->data_space_reserved = true;
- else if (ret && !(BTRFS_I(inode)->flags &
- (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
- goto err;
- }
-
- /*
- * If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered IO, or we are doing a
- * NOWAIT read/write and we need to block.
- */
- ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
- if (ret < 0)
- goto err;
-
- em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
-
- /*
- * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
- * io. INLINE is special, and we could probably kludge it in here, but
- * it's still buffered so for safety lets just fall back to the generic
- * buffered path.
- *
- * For COMPRESSED we _have_ to read the entire extent in so we can
- * decompress it, so there will be buffering required no matter what we
- * do, so go ahead and fallback to buffered.
- *
- * We return -ENOTBLK because that's what makes DIO go ahead and go back
- * to buffered IO. Don't blame me, this is the price we pay for using
- * the generic code.
- */
- if (extent_map_is_compressed(em) ||
- em->block_start == EXTENT_MAP_INLINE) {
- free_extent_map(em);
- /*
- * If we are in a NOWAIT context, return -EAGAIN in order to
- * fallback to buffered IO. This is not only because we can
- * block with buffered IO (no support for NOWAIT semantics at
- * the moment) but also to avoid returning short reads to user
- * space - this happens if we were able to read some data from
- * previous non-compressed extents and then when we fallback to
- * buffered IO, at btrfs_file_read_iter() by calling
- * filemap_read(), we fail to fault in pages for the read buffer,
- * in which case filemap_read() returns a short read (the number
- * of bytes previously read is > 0, so it does not return -EFAULT).
- */
- ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
- goto unlock_err;
- }
-
- len = min(len, em->len - (start - em->start));
-
- /*
- * If we have a NOWAIT request and the range contains multiple extents
- * (or a mix of extents and holes), then we return -EAGAIN to make the
- * caller fallback to a context where it can do a blocking (without
- * NOWAIT) request. This way we avoid doing partial IO and returning
- * success to the caller, which is not optimal for writes and for reads
- * it can result in unexpected behaviour for an application.
- *
- * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
- * iomap_dio_rw(), we can end up returning less data then what the caller
- * asked for, resulting in an unexpected, and incorrect, short read.
- * That is, the caller asked to read N bytes and we return less than that,
- * which is wrong unless we are crossing EOF. This happens if we get a
- * page fault error when trying to fault in pages for the buffer that is
- * associated to the struct iov_iter passed to iomap_dio_rw(), and we
- * have previously submitted bios for other extents in the range, in
- * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
- * those bios have completed by the time we get the page fault error,
- * which we return back to our caller - we should only return EIOCBQUEUED
- * after we have submitted bios for all the extents in the range.
- */
- if ((flags & IOMAP_NOWAIT) && len < length) {
- free_extent_map(em);
- ret = -EAGAIN;
- goto unlock_err;
- }
-
- if (write) {
- ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, &len, flags);
- if (ret < 0)
- goto unlock_err;
- unlock_extents = true;
- /* Recalc len in case the new em is smaller than requested */
- len = min(len, em->len - (start - em->start));
- if (dio_data->data_space_reserved) {
- u64 release_offset;
- u64 release_len = 0;
-
- if (dio_data->nocow_done) {
- release_offset = start;
- release_len = data_alloc_len;
- } else if (len < data_alloc_len) {
- release_offset = start + len;
- release_len = data_alloc_len - len;
- }
-
- if (release_len > 0)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- release_offset,
- release_len);
- }
- } else {
- /*
- * We need to unlock only the end area that we aren't using.
- * The rest is going to be unlocked by the endio routine.
- */
- lockstart = start + len;
- if (lockstart < lockend)
- unlock_extents = true;
- }
-
- if (unlock_extents)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- else
- free_extent_state(cached_state);
-
- /*
- * Translate extent map information to iomap.
- * We trim the extents (and move the addr) even though iomap code does
- * that, since we have locked only the parts we are performing I/O in.
- */
- if ((em->block_start == EXTENT_MAP_HOLE) ||
- ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
- } else {
- iomap->addr = em->block_start + (start - em->start);
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
- iomap->length = len;
- free_extent_map(em);
-
- return 0;
-
-unlock_err:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
-err:
- if (dio_data->data_space_reserved) {
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start, data_alloc_len);
- extent_changeset_free(dio_data->data_reserved);
- }
-
- return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
- ssize_t written, unsigned int flags, struct iomap *iomap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_dio_data *dio_data = iter->private;
- size_t submitted = dio_data->submitted;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
-
- if (!write && (iomap->type == IOMAP_HOLE)) {
- /* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
- NULL);
- return 0;
- }
-
- if (submitted < length) {
- pos += submitted;
- length -= submitted;
- if (write)
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- pos, length, false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
- ret = -ENOTBLK;
- }
- if (write) {
- btrfs_put_ordered_extent(dio_data->ordered);
- dio_data->ordered = NULL;
- }
-
- if (write)
- extent_changeset_free(dio_data->data_reserved);
- return ret;
-}
-
-static void btrfs_dio_end_io(struct btrfs_bio *bbio)
-{
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_inode *inode = bbio->inode;
- struct bio *bio = &bbio->bio;
-
- if (bio->bi_status) {
- btrfs_warn(inode->root->fs_info,
- "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
- btrfs_ino(inode), bio->bi_opf,
- dip->file_offset, dip->bytes, bio->bi_status);
- }
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- btrfs_finish_ordered_extent(bbio->ordered, NULL,
- dip->file_offset, dip->bytes,
- !bio->bi_status);
- } else {
- unlock_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
- }
-
- bbio->bio.bi_private = bbio->private;
- iomap_dio_bio_end_io(bio);
-}
-
-static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
- loff_t file_offset)
-{
- struct btrfs_bio *bbio = btrfs_bio(bio);
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_dio_data *dio_data = iter->private;
-
- btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
- btrfs_dio_end_io, bio->bi_private);
- bbio->inode = BTRFS_I(iter->inode);
- bbio->file_offset = file_offset;
-
- dip->file_offset = file_offset;
- dip->bytes = bio->bi_iter.bi_size;
-
- dio_data->submitted += bio->bi_iter.bi_size;
-
- /*
- * Check if we are doing a partial write. If we are, we need to split
- * the ordered extent to match the submitted bio. Hang on to the
- * remaining unfinishable ordered_extent in dio_data so that it can be
- * cancelled in iomap_end to avoid a deadlock wherein faulting the
- * remaining pages is blocked on the outstanding ordered extent.
- */
- if (iter->flags & IOMAP_WRITE) {
- int ret;
-
- ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
- if (ret) {
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- file_offset, dip->bytes,
- !ret);
- bio->bi_status = errno_to_blk_status(ret);
- iomap_dio_bio_end_io(bio);
- return;
- }
- }
-
- btrfs_submit_bio(bbio, 0);
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
- .iomap_begin = btrfs_dio_iomap_begin,
- .iomap_end = btrfs_dio_iomap_end,
-};
-
-static const struct iomap_dio_ops btrfs_dio_ops = {
- .submit_io = btrfs_dio_submit_io,
- .bio_set = &btrfs_dio_bioset,
-};
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
- int ret;
-
- ret = fiemap_prep(inode, fieinfo, start, &len, 0);
- if (ret)
- return ret;
-
- /*
- * fiemap_prep() called filemap_write_and_wait() for the whole possible
- * file range (0 to LLONG_MAX), but that is not enough if we have
- * compression enabled. The first filemap_fdatawrite_range() only kicks
- * in the compression of data (in an async thread) and will return
- * before the compression is done and writeback is started. A second
- * filemap_fdatawrite_range() is needed to wait for the compression to
- * complete and writeback to start. We also need to wait for ordered
- * extents to complete, because our fiemap implementation uses mainly
- * file extent items to list the extents, searching for extent maps
- * only for file ranges with holes or prealloc extents to figure out
- * if we have delalloc in those ranges.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret)
- return ret;
- }
-
- btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- /*
- * We did an initial flush to avoid holding the inode's lock while
- * triggering writeback and waiting for the completion of IO and ordered
- * extents. Now after we locked the inode we do it again, because it's
- * possible a new write may have happened in between those two steps.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret) {
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
- return ret;
- }
- }
-
- ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- return ret;
-}
-
-static int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return extent_writepages(mapping, wbc);
-}
-
-static void btrfs_readahead(struct readahead_control *rac)
-{
- extent_readahead(rac);
-}
-
/*
* For release_folio() and invalidate_folio() we have a race window where
* folio_end_writeback() is called but the subpage spinlock is not yet released.
@@ -7941,13 +7361,12 @@ static void btrfs_readahead(struct readahead_control *rac)
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
*/
-static void wait_subpage_spinlock(struct page *page)
+static void wait_subpage_spinlock(struct folio *folio)
{
- struct btrfs_fs_info *fs_info = page_to_fs_info(page);
- struct folio *folio = page_folio(page);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
@@ -7968,15 +7387,20 @@ static void wait_subpage_spinlock(struct page *page)
spin_unlock_irq(&subpage->lock);
}
-static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
+static int btrfs_launder_folio(struct folio *folio)
{
- int ret = try_release_extent_mapping(&folio->page, gfp_flags);
+ return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
+ folio_size(folio), NULL);
+}
- if (ret == 1) {
- wait_subpage_spinlock(&folio->page);
- clear_page_extent_mapped(&folio->page);
+static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
+{
+ if (try_release_extent_mapping(folio, gfp_flags)) {
+ wait_subpage_spinlock(folio);
+ clear_folio_extent_mapped(folio);
+ return true;
}
- return ret;
+ return false;
}
static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -8025,7 +7449,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*
* But already submitted bio can still be finished on this folio.
* Furthermore, endio function won't skip folio which has Ordered
- * (Private2) already cleared, so it's possible for endio and
+ * already cleared, so it's possible for endio and
* invalidate_folio to do the same ordered extent accounting twice
* on one folio.
*
@@ -8033,7 +7457,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* do double ordered extent accounting on the same folio.
*/
folio_wait_writeback(folio);
- wait_subpage_spinlock(&folio->page);
+ wait_subpage_spinlock(folio);
/*
* For subpage case, we have call sites like
@@ -8053,7 +7477,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
}
if (!inode_evicting)
- lock_extent(tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
@@ -8091,7 +7515,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
range_len = range_end + 1 - cur;
if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
- * If Ordered (Private2) is cleared, it means endio has
+ * If Ordered is cleared, it means endio has
* already been executed for the range.
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
@@ -8109,10 +7533,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, cur, range_end,
- EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
+ btrfs_clear_extent_bit(tree, cur, range_end,
+ EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
spin_lock_irq(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8154,191 +7578,23 @@ next:
* Since the IO will never happen for this page.
*/
btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
- if (!inode_evicting) {
- clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
- extra_flags, &cached_state);
- }
+ if (!inode_evicting)
+ btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG | extra_flags,
+ &cached_state);
cur = range_end + 1;
}
/*
* We have iterated through all ordered extents of the page, the page
- * should not have Ordered (Private2) anymore, or the above iteration
+ * should not have Ordered anymore, or the above iteration
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
- clear_page_extent_mapped(&folio->page);
-}
-
-/*
- * btrfs_page_mkwrite() is not allowed to change the file size as it gets
- * called from a page fault handler when a page is first dirtied. Hence we must
- * be careful to check for EOF conditions here. We set the page up correctly
- * for a written page which means we get ENOSPC checking when writing into
- * holes and correct delalloc and unwritten extent mapping on filesystems that
- * support these features.
- *
- * We are not allowed to take the i_mutex here so we have to play games to
- * protect against truncate races as the page could now be beyond EOF. Because
- * truncate_setsize() writes the inode size before removing pages, once we have
- * the page lock we can determine safely if the page is beyond EOF. If it is not
- * beyond EOF, then the page is guaranteed safe against truncation until we
- * unlock the page.
- */
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
-{
- struct page *page = vmf->page;
- struct folio *folio = page_folio(page);
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_changeset *data_reserved = NULL;
- unsigned long zero_start;
- loff_t size;
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
- u64 reserved_space;
- u64 page_start;
- u64 page_end;
- u64 end;
-
- ASSERT(folio_order(folio) == 0);
-
- reserved_space = PAGE_SIZE;
-
- sb_start_pagefault(inode->i_sb);
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- end = page_end;
-
- /*
- * Reserving delalloc space after obtaining the page lock can lead to
- * deadlock. For example, if a dirty page is locked by this function
- * and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepages() function could
- * end up waiting indefinitely to get a lock on the page currently
- * being processed by btrfs_page_mkwrite() function.
- */
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
- goto out_noreserve;
- }
-
- ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
-again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
- lock_page(page);
- size = i_size_read(inode);
-
- if ((page->mapping != inode->i_mapping) ||
- (page_start >= size)) {
- /* page got truncated out from underneath us */
- goto out_unlock;
- }
- wait_on_page_writeback(page);
-
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_page_extent_mapped(page);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- goto out_unlock;
- }
-
- /*
- * we can't set the delalloc bits if there are pending ordered
- * extents. Drop our locks and wait for them to finish
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_start_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
-
- if (page->index == ((size - 1) >> PAGE_SHIFT)) {
- reserved_space = round_up(size - page_start,
- fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
- end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
- }
- }
-
- /*
- * page_mkwrite gets called when the page is firstly dirtied after it's
- * faulted in, but write(2) could also dirty a page and set delalloc
- * bits, thus in this case for space account reason, we still need to
- * clear any delalloc bits within this page range since we have to
- * reserve data&meta space before lock_page() (see above comments).
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
- goto out_unlock;
- }
-
- /* page is wholly or partially inside EOF */
- if (page_start + PAGE_SIZE > size)
- zero_start = offset_in_page(size);
- else
- zero_start = PAGE_SIZE;
-
- if (zero_start != PAGE_SIZE)
- memzero_page(page, zero_start, PAGE_SIZE - zero_start);
-
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
- btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
- btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
-
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
-
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return VM_FAULT_LOCKED;
-
-out_unlock:
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return ret;
+ clear_folio_extent_mapped(folio);
}
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
@@ -8358,7 +7614,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+ ret = btrfs_wait_ordered_range(inode,
inode->vfs_inode.i_size & (~mask),
(u64)-1);
if (ret)
@@ -8430,7 +7686,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
control.new_size = new_size;
- lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
@@ -8445,7 +7701,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
@@ -8489,7 +7745,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
+ inode->vfs_inode.i_size, (u64)-1);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -8559,20 +7816,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
- struct extent_io_tree *file_extent_tree = NULL;
-
- /* Self tests may pass a NULL fs_info. */
- if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
- file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
- if (!file_extent_tree)
- return NULL;
- }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei) {
- kfree(file_extent_tree);
+ if (!ei)
return NULL;
- }
ei->root = NULL;
ei->generation = 0;
@@ -8585,8 +7832,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->disk_i_size = 0;
ei->flags = 0;
ei->ro_flags = 0;
+ /*
+ * ->index_cnt will be properly initialized later when creating a new
+ * inode (btrfs_create_new_inode()) or when reading an existing inode
+ * from disk (btrfs_read_locked_inode()).
+ */
ei->csum_bytes = 0;
- ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_reflink_trans = 0;
@@ -8607,26 +7858,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
- extent_map_tree_init(&ei->extent_tree);
+ btrfs_extent_map_tree_init(&ei->extent_tree);
/* This io tree sets the valid inode. */
- extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
+ btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- ei->file_extent_tree = file_extent_tree;
- if (file_extent_tree) {
- extent_io_tree_init(fs_info, ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
- /* Lockdep class is set only for the file extent tree. */
- lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
- }
+ ei->file_extent_tree = NULL;
+
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
- RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->i_mmap_lock);
return inode;
@@ -8662,9 +7907,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
if (!S_ISDIR(vfs_inode->i_mode)) {
WARN_ON(inode->delalloc_bytes);
WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
}
- WARN_ON(inode->csum_bytes);
- WARN_ON(inode->defrag_bytes);
+ if (!root || !btrfs_is_data_reloc_root(root))
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8698,7 +7944,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
}
btrfs_qgroup_check_reserved_leak(inode);
- inode_tree_del(inode);
+ btrfs_del_inode_from_root(inode);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
@@ -8732,7 +7978,6 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
- bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
}
@@ -8743,17 +7988,9 @@ int __init btrfs_init_cachep(void)
SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
- goto fail;
-
- if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_dio_private, bbio.bio),
- BIOSET_NEED_BVECS))
- goto fail;
+ return -ENOMEM;
return 0;
-fail:
- btrfs_destroy_cachep();
- return -ENOMEM;
}
static int btrfs_getattr(struct mnt_idmap *idmap,
@@ -8789,6 +8026,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->result_mask |= STATX_SUBVOL;
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
@@ -8952,31 +8192,45 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
@@ -9212,16 +8466,23 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else {
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
if (new_inode) {
@@ -9229,18 +8490,27 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
&new_fname.disk_name);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
- if (!ret && new_inode->i_nlink == 0)
+ if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry)));
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
}
@@ -9355,8 +8625,6 @@ static int start_delalloc_inodes(struct btrfs_root *root,
struct writeback_control *wbc, bool snapshot,
bool in_reclaim_context)
{
- struct btrfs_inode *binode;
- struct inode *inode;
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
@@ -9367,30 +8635,30 @@ static int start_delalloc_inodes(struct btrfs_root *root,
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- binode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ struct btrfs_inode *inode;
+ struct inode *tmp_inode;
+
+ inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes);
- list_move_tail(&binode->delalloc_inodes,
- &root->delalloc_inodes);
+ list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
if (in_reclaim_context &&
- test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
continue;
- inode = igrab(&binode->vfs_inode);
- if (!inode) {
+ tmp_inode = igrab(&inode->vfs_inode);
+ if (!tmp_inode) {
cond_resched_lock(&root->delalloc_lock);
continue;
}
spin_unlock(&root->delalloc_lock);
if (snapshot)
- set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
- &binode->runtime_flags);
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
if (full_flush) {
- work = btrfs_alloc_delalloc_work(inode);
+ work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
if (!work) {
- iput(inode);
+ iput(&inode->vfs_inode);
ret = -ENOMEM;
goto out;
}
@@ -9398,8 +8666,8 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
+ btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
}
@@ -9516,7 +8784,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct extent_buffer *leaf;
name_len = strlen(symname);
- if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ /*
+ * Symlinks utilize uncompressed inline extent data, which should not
+ * reach block size.
+ */
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
+ name_len >= fs_info->sectorsize)
return -ENAMETOOLONG;
inode = new_inode(dir->i_sb);
@@ -9555,8 +8828,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
@@ -9580,7 +8853,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
@@ -9668,7 +8940,7 @@ free_qgroup:
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid, qgroup_released,
+ btrfs_root_id(inode->root), qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
}
@@ -9730,11 +9002,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 0);
+ ins.offset, false);
break;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
cur_offset + ins.offset - 1, false);
@@ -9743,17 +9015,16 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
em->start = cur_offset;
- em->orig_start = cur_offset;
em->len = ins.offset;
- em->block_start = ins.objectid;
- em->block_len = ins.offset;
- em->orig_block_len = ins.offset;
+ em->disk_bytenr = ins.objectid;
+ em->offset = 0;
+ em->disk_num_bytes = ins.offset;
em->ram_bytes = ins.offset;
em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
@@ -9888,28 +9159,6 @@ out_inode:
return finish_open_simple(file, ret);
}
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
- u32 len;
-
- ASSERT(end + 1 - start <= U32_MAX);
- len = end + 1 - start;
- while (index <= end_index) {
- page = find_get_page(inode->vfs_inode.i_mapping, index);
- ASSERT(page); /* Pages should be in the extent_io_tree */
-
- /* This is for data, which doesn't yet support larger folio. */
- ASSERT(folio_order(page_folio(page)) == 0);
- btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
- put_page(page);
- index++;
- }
-}
-
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type)
{
@@ -9947,27 +9196,29 @@ static ssize_t btrfs_encoded_read_inline(
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_file_extent_item *item;
u64 ram_bytes;
unsigned long ptr;
void *tmp;
ssize_t ret;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
+
+ path->nowait = nowait;
+
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
if (ret > 0) {
/* The extent item disappeared? */
- ret = -EIO;
+ return -EIO;
}
- goto out;
+ return ret;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -9980,17 +9231,16 @@ static ssize_t btrfs_encoded_read_inline(
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, item));
if (ret < 0)
- goto out;
+ return ret;
encoded->compression = ret;
if (encoded->compression) {
size_t inline_size;
inline_size = btrfs_file_extent_inline_item_len(leaf,
path->slots[0]);
- if (inline_size > count) {
- ret = -ENOBUFS;
- goto out;
- }
+ if (inline_size > count)
+ return -ENOBUFS;
+
count = inline_size;
encoded->unencoded_len = ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - extent_start;
@@ -10002,13 +9252,12 @@ static ssize_t btrfs_encoded_read_inline(
}
tmp = kmalloc(count, GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!tmp)
+ return -ENOMEM;
+
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -10016,14 +9265,14 @@ static ssize_t btrfs_encoded_read_inline(
if (ret != count)
ret = -EFAULT;
kfree(tmp);
-out:
- btrfs_free_path(path);
+
return ret;
}
struct btrfs_encoded_read_private {
- wait_queue_head_t wait;
- atomic_t pending;
+ struct completion *sync_reads;
+ void *uring_ctx;
+ refcount_t pending_refs;
blk_status_t status;
};
@@ -10033,35 +9282,58 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
if (bbio->bio.bi_status) {
/*
- * The memory barrier implied by the atomic_dec_return() here
- * pairs with the memory barrier implied by the
- * atomic_dec_return() or io_wait_event() in
- * btrfs_encoded_read_regular_fill_pages() to ensure that this
- * write is observed before the load of status in
+ * The memory barrier implied by the refcount_dec_and_test() here
+ * pairs with the memory barrier implied by the refcount_dec_and_test()
+ * in btrfs_encoded_read_regular_fill_pages() to ensure that
+ * this write is observed before the load of status in
* btrfs_encoded_read_regular_fill_pages().
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (!atomic_dec_return(&priv->pending))
- wake_up(&priv->wait);
+ if (refcount_dec_and_test(&priv->pending_refs)) {
+ int err = blk_status_to_errno(READ_ONCE(priv->status));
+
+ if (priv->uring_ctx) {
+ btrfs_uring_read_extent_endio(priv->uring_ctx, err);
+ kfree(priv);
+ } else {
+ complete(priv->sync_reads);
+ }
+ }
bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size, struct page **pages)
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages, void *uring_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private priv = {
- .pending = ATOMIC_INIT(1),
- };
+ struct btrfs_encoded_read_private *priv, sync_priv;
+ struct completion sync_reads;
unsigned long i = 0;
struct btrfs_bio *bbio;
+ int ret;
- init_waitqueue_head(&priv.wait);
+ /*
+ * Fast path for synchronous reads which completes in this call, io_uring
+ * needs longer time span.
+ */
+ if (uring_ctx) {
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
+ } else {
+ priv = &sync_priv;
+ init_completion(&sync_reads);
+ priv->sync_reads = &sync_reads;
+ }
+
+ refcount_set(&priv->pending_refs, 1);
+ priv->status = 0;
+ priv->uring_ctx = uring_ctx;
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
@@ -10069,11 +9341,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
- btrfs_submit_bio(bbio, 0);
+ refcount_inc(&priv->pending_refs);
+ btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
continue;
@@ -10084,22 +9356,31 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv.pending);
- btrfs_submit_bio(bbio, 0);
+ refcount_inc(&priv->pending_refs);
+ btrfs_submit_bbio(bbio, 0);
+
+ if (uring_ctx) {
+ if (refcount_dec_and_test(&priv->pending_refs)) {
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ btrfs_uring_read_extent_endio(uring_ctx, ret);
+ kfree(priv);
+ return ret;
+ }
- if (atomic_dec_return(&priv.pending))
- io_wait_event(priv.wait, !atomic_read(&priv.pending));
- /* See btrfs_encoded_read_endio() for ordering. */
- return blk_status_to_errno(READ_ONCE(priv.status));
+ return -EIOCBQUEUED;
+ } else {
+ if (!refcount_dec_and_test(&priv->pending_refs))
+ wait_for_completion_io(&sync_reads);
+ /* See btrfs_encoded_read_endio() for ordering. */
+ return blk_status_to_errno(READ_ONCE(priv->status));
+ }
}
-static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
- struct iov_iter *iter,
- u64 start, u64 lockend,
- struct extent_state **cached_state,
- u64 disk_bytenr, u64 disk_io_size,
- size_t count, bool compressed,
- bool *unlocked)
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -10113,18 +9394,18 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ ret = btrfs_alloc_page_array(nr_pages, pages, false);
if (ret) {
ret = -ENOMEM;
goto out;
}
- ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
- disk_io_size, pages);
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+ disk_io_size, pages, NULL);
if (ret)
goto out;
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -10160,21 +9441,26 @@ out:
}
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded)
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
ssize_t ret;
size_t count = iov_iter_count(iter);
- u64 start, lockend, disk_bytenr, disk_io_size;
- struct extent_state *cached_state = NULL;
+ u64 start, lockend;
struct extent_map *em;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
bool unlocked = false;
file_accessed(iocb->ki_filp);
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ ret = btrfs_inode_lock(inode,
+ BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
+ if (ret)
+ return ret;
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -10187,21 +9473,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
- for (;;) {
+ if (nowait) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
- lockend - start + 1);
- if (ret)
+ if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
+ start, lockend)) {
+ ret = -EAGAIN;
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, &cached_state);
+ }
+
+ if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
- if (!ordered)
- break;
- btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, &cached_state);
- cond_resched();
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+ } else {
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+
+ btrfs_lock_extent(io_tree, start, lockend, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
+ cond_resched();
+ }
}
em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
@@ -10210,94 +9521,89 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock_extent;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+ if (em->disk_bytenr == EXTENT_MAP_INLINE) {
u64 extent_start = em->start;
/*
* For inline extents we get everything we need out of the
* extent item.
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
- &cached_state, extent_start,
+ cached_state, extent_start,
count, encoded, &unlocked);
- goto out;
+ goto out_unlock_extent;
}
/*
* We only want to return up to EOF even if the extent extends beyond
* that.
*/
- encoded->len = min_t(u64, extent_map_end(em),
+ encoded->len = min_t(u64, btrfs_extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
- disk_bytenr = EXTENT_MAP_HOLE;
+ *disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (extent_map_is_compressed(em)) {
- disk_bytenr = em->block_start;
+ } else if (btrfs_extent_map_is_compressed(em)) {
+ *disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
*/
- if (em->block_len > count) {
+ if (em->disk_num_bytes > count) {
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->block_len;
- count = em->block_len;
+ *disk_io_size = em->disk_num_bytes;
+ count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
- encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- extent_map_compression(em));
+ btrfs_extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = em->block_start + (start - em->start);
+ *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
* Don't read beyond what we locked. This also limits the page
* allocations that we'll do.
*/
- disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
- count = start + disk_io_size - iocb->ki_pos;
+ *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count;
encoded->unencoded_len = count;
- disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
- if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ if (*disk_bytenr == EXTENT_MAP_HOLE) {
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
ret = -EFAULT;
} else {
- ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
- &cached_state, disk_bytenr,
- disk_io_size, count,
- encoded->compression,
- &unlocked);
+ ret = -EIOCBQUEUED;
+ goto out_unlock_extent;
}
-out:
- if (ret >= 0)
- iocb->ki_pos += encoded->len;
out_em:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out_unlock_extent:
- if (!unlocked)
- unlock_extent(io_tree, start, lockend, &cached_state);
+ /* Leave inode and extent locked if we need to do a read. */
+ if (!unlocked && ret != -EIOCBQUEUED)
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
- if (!unlocked)
+ if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
@@ -10312,12 +9618,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct extent_changeset *data_reserved = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
int compression;
size_t orig_count;
u64 start, end;
u64 num_bytes, ram_bytes, disk_num_bytes;
- unsigned long nr_pages, i;
- struct page **pages;
+ unsigned long nr_folios, i;
+ struct folio **folios;
struct btrfs_key ins;
bool extent_reserved = false;
struct extent_map *em;
@@ -10406,24 +9713,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
* isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
- nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
- if (!pages)
+ nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
+ if (!folios)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_folios; i++) {
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
char *kaddr;
- pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!pages[i]) {
+ folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
+ if (!folios[i]) {
ret = -ENOMEM;
- goto out_pages;
+ goto out_folios;
}
- kaddr = kmap_local_page(pages[i]);
+ kaddr = kmap_local_folio(folios[i], 0);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
kunmap_local(kaddr);
ret = -EFAULT;
- goto out_pages;
+ goto out_folios;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
@@ -10433,22 +9740,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
for (;;) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
- goto out_pages;
+ goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ret)
- goto out_pages;
- lock_extent(io_tree, start, end, &cached_state);
+ goto out_folios;
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
@@ -10468,10 +9775,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_qgroup_free_data;
/* Try an inline extent first. */
- if (start == 0 && encoded->unencoded_len == encoded->len &&
- encoded->unencoded_offset == 0) {
- ret = cow_file_range_inline(inode, encoded->len, orig_count,
- compression, pages, true);
+ if (encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0 &&
+ can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
+ ret = __cow_file_range_inline(inode, encoded->len,
+ orig_count, compression, folios[0],
+ true);
if (ret <= 0) {
if (ret == 0)
ret = orig_count;
@@ -10485,22 +9794,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_delalloc_release;
extent_reserved = true;
- em = create_io_em(inode, start, num_bytes,
- start - encoded->unencoded_offset, ins.objectid,
- ins.offset, ins.offset, ram_bytes, compression,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = num_bytes;
+ file_extent.ram_bytes = ram_bytes;
+ file_extent.offset = encoded->unencoded_offset;
+ file_extent.compression = compression;
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserved;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
- ins.objectid, ins.offset,
- encoded->unencoded_offset,
- (1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED),
- compression);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ (1U << BTRFS_ORDERED_ENCODED) |
+ (1U << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -10511,17 +9820,17 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (start + encoded->len > inode->vfs_inode.i_size)
i_size_write(&inode->vfs_inode, start + encoded->len);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
btrfs_delalloc_release_extents(inode, num_bytes);
- btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
+ btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
ret = orig_count;
goto out;
out_free_reserved:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_delalloc_release:
btrfs_delalloc_release_extents(inode, num_bytes);
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
@@ -10534,15 +9843,15 @@ out_free_data_space:
* bytes_may_use.
*/
if (!extent_reserved)
- btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+ btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
out_unlock:
- unlock_extent(io_tree, start, end, &cached_state);
-out_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i])
- __free_page(pages[i]);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
+out_folios:
+ for (i = 0; i < nr_folios; i++) {
+ if (folios[i])
+ folio_put(folios[i]);
}
- kvfree(pages);
+ kvfree(folios);
out:
if (ret >= 0)
iocb->ki_pos += encoded->len;
@@ -10689,39 +9998,59 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- struct extent_map *em = NULL;
struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
};
+ struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
+ struct btrfs_path *path = NULL;
int ret = 0;
u64 isize;
- u64 start;
+ u64 prev_extent_end = 0;
+
+ /*
+ * Acquire the inode's mmap lock to prevent races with memory mapped
+ * writes, as they could happen after we flush delalloc below and before
+ * we lock the extent range further below. The inode was already locked
+ * up in the call chain.
+ */
+ btrfs_assert_inode_locked(BTRFS_I(inode));
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
/*
* If the swap file was just created, make sure delalloc is done. If the
* file changes again after this, the user is doing something stupid and
* we don't really care.
*/
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
- return ret;
+ goto out_unlock_mmap;
/*
* The inode is locked, so these flags won't change after we check them.
*/
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
btrfs_warn(fs_info, "swapfile must not be compressed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
btrfs_warn(fs_info, "swapfile must not be checksummed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
+ }
+
+ path = btrfs_alloc_path();
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ if (!path || !backref_ctx) {
+ ret = -ENOMEM;
+ goto out_unlock_mmap;
}
/*
@@ -10736,7 +10065,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock_mmap;
}
/*
@@ -10750,7 +10080,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because snapshot creation is in progress");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
/*
* Snapshots can create extents which require COW even if NODATACOW is
@@ -10766,36 +10097,53 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (btrfs_root_dead(root)) {
spin_unlock(&root->root_item_lock);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
- root->root_key.objectid);
- return -EPERM;
+ btrfs_root_id(root));
+ ret = -EPERM;
+ goto out_unlock_mmap;
}
atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent(io_tree, 0, isize - 1, &cached_state);
- start = 0;
- while (start < isize) {
- u64 logical_block_start, physical_block_start;
+ btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state);
+ while (prev_extent_end < isize) {
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
struct btrfs_block_group *bg;
- u64 len = isize - start;
+ u64 logical_block_start;
+ u64 physical_block_start;
+ u64 extent_gen;
+ u64 disk_bytenr;
+ u64 len;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = prev_extent_end;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
goto out;
- }
- if (em->block_start == EXTENT_MAP_HOLE) {
+ /*
+ * If key not found it means we have an implicit hole (NO_HOLES
+ * is enabled).
+ */
+ if (ret > 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
@@ -10807,23 +10155,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (extent_map_is_compressed(em)) {
+
+ if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
}
- logical_block_start = em->block_start + (start - em->start);
- len = min(len, em->len - (start - em->start));
- free_extent_map(em);
- em = NULL;
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (disk_bytenr == 0) {
+ btrfs_warn(fs_info, "swapfile must not have holes");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+ prev_extent_end = btrfs_file_extent_end(path);
+
+ if (prev_extent_end > isize)
+ len = isize - key.offset;
+ else
+ len = btrfs_file_extent_num_bytes(leaf, ei);
+
+ backref_ctx->curr_leaf_bytenr = leaf->start;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
+ /*
+ * Don't need the path anymore, release to avoid deadlocks when
+ * calling btrfs_is_data_extent_shared() because when joining a
+ * transaction it can block waiting for the current one's commit
+ * which in turn may be trying to lock the same leaf to flush
+ * delayed items for example.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
+ extent_gen, backref_ctx);
if (ret < 0) {
goto out;
- } else if (ret) {
- ret = 0;
- } else {
+ } else if (ret > 0) {
btrfs_warn(fs_info,
"swapfile must not be copy-on-write");
ret = -EINVAL;
@@ -10858,7 +10228,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
physical_block_start = (map->stripes[0].physical +
(logical_block_start - map->start));
- len = min(len, map->chunk_len - (logical_block_start - map->start));
btrfs_free_chunk_map(map);
map = NULL;
@@ -10899,24 +10268,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret)
goto out;
}
- bsi.start = start;
+ bsi.start = key.offset;
bsi.block_start = physical_block_start;
bsi.block_len = len;
}
- start += len;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ cond_resched();
}
if (bsi.block_len)
ret = btrfs_add_swap_extent(sis, &bsi);
out:
- if (!IS_ERR_OR_NULL(em))
- free_extent_map(em);
if (!IS_ERR_OR_NULL(map))
btrfs_free_chunk_map(map);
- unlock_extent(io_tree, 0, isize - 1, &cached_state);
+ btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
@@ -10925,6 +10297,10 @@ out:
btrfs_exclop_finish(fs_info);
+out_unlock_mmap:
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_free_backref_share_ctx(backref_ctx);
+ btrfs_free_path(path);
if (ret)
return ret;
@@ -10933,7 +10309,6 @@ out:
*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
sis->max = bsi.nr_pages;
sis->pages = bsi.nr_pages - 1;
- sis->highest_bit = bsi.nr_pages - 1;
return bsi.nr_extents;
}
#else
@@ -10995,7 +10370,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
if (ordered) {
btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
- start, end, btrfs_ino(inode), root->root_key.objectid,
+ start, end, btrfs_ino(inode), btrfs_root_id(root),
ordered->file_offset,
ordered->file_offset + ordered->num_bytes - 1);
btrfs_put_ordered_extent(ordered);
@@ -11004,6 +10379,36 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
ASSERT(ordered == NULL);
}
+/*
+ * Find the first inode with a minimum number.
+ *
+ * @root: The root to search for.
+ * @min_ino: The minimum inode number.
+ *
+ * Find the first inode in the @root with a number >= @min_ino and return it.
+ * Returns NULL if no such inode found.
+ */
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -11056,6 +10461,7 @@ static const struct address_space_operations btrfs_aops = {
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.invalidate_folio = btrfs_invalidate_folio,
+ .launder_folio = btrfs_launder_folio,
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,