diff options
Diffstat (limited to 'fs/btrfs')
135 files changed, 18150 insertions, 11126 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981..fa8515598341 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -78,6 +78,32 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_EXPERIMENTAL + bool "Btrfs experimental features" + depends on BTRFS_FS + default n + help + Enable experimental features. These features may not be stable enough + for end users. This is meant for btrfs developers or users who wish + to test the functionality and report problems. + + Current list: + + - extent map shrinker - performance problems with too frequent shrinks + + - send stream protocol v3 - fs-verity support + + - checksum offload mode - sysfs knob to affect when checksums are + calculated (at IO time, or in a thread) + + - raid-stripe-tree - additional mapping of extents to devices to + support RAID1* profiles on zoned devices, + RAID56 not yet supported + + - extent tree v2 - complex rework of extent tracking + + If unsure, say N. + config BTRFS_FS_REF_VERIFY bool "Btrfs with the ref verify tool compiled in" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 525af975f61c..2d5f0482678b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ - lru_cache.o raid-stripe-tree.o + lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o @@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o tests/extent-map-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o \ + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 1925a0919ca6..e3716516ca38 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -3,9 +3,10 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "messages.h" -#include "ctree.h" +#include "extent_io.h" +#include "fs.h" #include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, @@ -63,8 +64,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -94,7 +95,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ @@ -117,8 +118,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -151,7 +152,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index ed7aa32972ad..7a7e0ef69973 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -3,8 +3,17 @@ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H +#include <linux/unaligned.h> #include <linux/stddef.h> -#include <asm/unaligned.h> +#include <linux/types.h> +#include <linux/align.h> +#include <linux/build_bug.h> +#include <linux/compiler.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; struct btrfs_map_token { struct extent_buffer *eb; @@ -25,7 +34,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e static inline u8 get_unaligned_le8(const void *p) { - return *(u8 *)p; + return *(const u8 *)p; } static inline void put_unaligned_le8(u8 val, void *p) @@ -39,8 +48,8 @@ static inline void put_unaligned_le8(u8 val, void *p) offsetof(type, member), \ sizeof_field(type, member))) -#define write_eb_member(eb, ptr, type, member, result) (\ - write_extent_buffer(eb, (char *)(result), \ +#define write_eb_member(eb, ptr, type, member, source) ( \ + write_extent_buffer(eb, (const char *)(source), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof_field(type, member))) @@ -306,11 +315,8 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); -BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8); BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64); BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding, - struct btrfs_stripe_extent, encoding, 8); BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64); @@ -344,7 +350,7 @@ static inline void btrfs_tree_block_key(const struct extent_buffer *eb, static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) + const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } @@ -437,7 +443,7 @@ void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) + const struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr; @@ -503,7 +509,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb, } static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) + const struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(eb, nr); @@ -844,45 +850,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb, write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } -static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void btrfs_cpu_balance_args_to_disk( - struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7427449a04a3..e0ba00d64ea0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -12,7 +12,6 @@ #include <linux/sched/mm.h> #include <linux/slab.h> #include "ctree.h" -#include "btrfs_inode.h" #include "xattr.h" #include "acl.h" diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index a270e71ec05f..48b9ddae4a46 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,8 +3,15 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +struct posix_acl; +struct inode; +struct btrfs_trans_handle; + #ifdef CONFIG_BTRFS_FS_POSIX_ACL +struct mnt_idmap; +struct dentry; + struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); @@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, #else +#include <linux/errno.h> + +struct btrfs_trans_handle; + #define btrfs_get_acl NULL #define btrfs_set_acl NULL static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 9e261aac671e..a4c51600a408 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -11,7 +11,6 @@ #include <linux/freezer.h> #include <trace/events/btrfs.h> #include "async-thread.h" -#include "ctree.h" enum { WORK_DONE_BIT, @@ -19,7 +18,7 @@ enum { }; #define NO_THRESHOLD (-1) -#define DFT_THRESHOLD (32) +#define DEFAULT_THRESHOLD (32) struct btrfs_workqueue { struct workqueue_struct *normal_wq; @@ -95,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, ret->limit_active = limit_active; if (thresh == 0) - thresh = DFT_THRESHOLD; + thresh = DEFAULT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ - if (thresh < DFT_THRESHOLD) { + if (thresh < DEFAULT_THRESHOLD) { ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 62b8a0d57898..04c2f3175828 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -7,11 +7,14 @@ #ifndef BTRFS_ASYNC_THREAD_H #define BTRFS_ASYNC_THREAD_H +#include <linux/compiler_types.h> #include <linux/workqueue.h> +#include <linux/list.h> struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; + typedef void (*btrfs_func_t)(struct btrfs_work *arg); typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index beed7e459dab..3d3923cfc357 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache; int __init btrfs_prelim_ref_init(void) { btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", - sizeof(struct prelim_ref), - 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct prelim_ref), 0, 0, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; return 0; @@ -222,8 +219,8 @@ static void free_pref(struct prelim_ref *ref) * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 * indicates a 'higher' block. */ -static int prelim_ref_compare(struct prelim_ref *ref1, - struct prelim_ref *ref2) +static int prelim_ref_compare(const struct prelim_ref *ref1, + const struct prelim_ref *ref2) { if (ref1->level < ref2->level) return -1; @@ -253,8 +250,23 @@ static int prelim_ref_compare(struct prelim_ref *ref1, return 0; } +static int prelim_ref_rb_add_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct prelim_ref *ref_new = + rb_entry(new, struct prelim_ref, rbnode); + const struct prelim_ref *ref_exist = + rb_entry(exist, struct prelim_ref, rbnode); + + /* + * prelim_ref_compare() expects the first parameter as the existing one, + * different from the rb_find_add_cached() order. + */ + return prelim_ref_compare(ref_exist, ref_new); +} + static void update_share_count(struct share_check *sc, int oldcount, - int newcount, struct prelim_ref *newref) + int newcount, const struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; @@ -264,7 +276,7 @@ static void update_share_count(struct share_check *sc, int oldcount, else if (oldcount < 1 && newcount > 0) sc->share_count++; - if (newref->root_id == sc->root->root_key.objectid && + if (newref->root_id == btrfs_root_id(sc->root) && newref->wanted_disk_byte == sc->data_bytenr && newref->key_for_search.objectid == sc->inum) sc->self_ref_count += newref->count; @@ -281,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct share_check *sc) { struct rb_root_cached *root; - struct rb_node **p; - struct rb_node *parent = NULL; - struct prelim_ref *ref; - int result; - bool leftmost = true; + struct rb_node *exist; root = &preftree->root; - p = &root->rb_root.rb_node; + exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp); + if (exist) { + struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode); + /* Identical refs, merge them and free @newref */ + struct extent_inode_elem *eie = ref->inode_list; - while (*p) { - parent = *p; - ref = rb_entry(parent, struct prelim_ref, rbnode); - result = prelim_ref_compare(ref, newref); - if (result < 0) { - p = &(*p)->rb_left; - } else if (result > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - /* Identical refs, merge them and free @newref */ - struct extent_inode_elem *eie = ref->inode_list; - - while (eie && eie->next) - eie = eie->next; + while (eie && eie->next) + eie = eie->next; - if (!eie) - ref->inode_list = newref->inode_list; - else - eie->next = newref->inode_list; - trace_btrfs_prelim_ref_merge(fs_info, ref, newref, - preftree->count); - /* - * A delayed ref can have newref->count < 0. - * The ref->count is updated to follow any - * BTRFS_[ADD|DROP]_DELAYED_REF actions. - */ - update_share_count(sc, ref->count, - ref->count + newref->count, newref); - ref->count += newref->count; - free_pref(newref); - return; - } + if (!eie) + ref->inode_list = newref->inode_list; + else + eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); + /* + * A delayed ref can have newref->count < 0. + * The ref->count is updated to follow any + * BTRFS_[ADD|DROP]_DELAYED_REF actions. + */ + update_share_count(sc, ref->count, + ref->count + newref->count, newref); + ref->count += newref->count; + free_pref(newref); + return; } update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); - rb_link_node(&newref->rbnode, parent, p); - rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* @@ -772,7 +768,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, continue; } - if (sc && ref->root_id != sc->root->root_key.objectid) { + if (sc && ref->root_id != btrfs_root_id(sc->root)) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -922,40 +918,38 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ - struct btrfs_delayed_tree_ref *ref; struct btrfs_key *key_ptr = NULL; + /* The owner of a tree block ref is the level. */ + int level = btrfs_delayed_ref_owner(node); if (head->extent_op && head->extent_op->update_key) { btrfs_disk_key_to_cpu(&key, &head->extent_op->key); key_ptr = &key; } - ref = btrfs_delayed_node_to_tree_ref(node); - ret = add_indirect_ref(fs_info, preftrees, ref->root, - key_ptr, ref->level + 1, - node->bytenr, count, sc, - GFP_ATOMIC); + ret = add_indirect_ref(fs_info, preftrees, node->ref_root, + key_ptr, level + 1, node->bytenr, + count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_BLOCK_REF_KEY: { - /* SHARED DIRECT METADATA backref */ - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); + /* + * SHARED DIRECT METADATA backref + * + * The owner of a tree block ref is the level. + */ + int level = btrfs_delayed_ref_owner(node); - ret = add_direct_ref(fs_info, preftrees, ref->level + 1, - ref->parent, node->bytenr, count, + ret = add_direct_ref(fs_info, preftrees, level + 1, + node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_EXTENT_DATA_REF_KEY: { /* NORMAL INDIRECT DATA backref */ - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; + key.objectid = btrfs_delayed_ref_owner(node); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; + key.offset = btrfs_delayed_ref_offset(node); /* * If we have a share check context and a reference for @@ -975,18 +969,14 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, if (sc && count < 0) sc->have_delayed_delete_refs = true; - ret = add_indirect_ref(fs_info, preftrees, ref->root, + ret = add_indirect_ref(fs_info, preftrees, node->ref_root, &key, 0, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ - struct btrfs_delayed_data_ref *ref; - - ref = btrfs_delayed_node_to_data_ref(node); - - ret = add_direct_ref(fs_info, preftrees, 0, ref->parent, + ret = add_direct_ref(fs_info, preftrees, 0, node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -1036,8 +1026,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); if (ctx->check_extent_item) { @@ -1435,8 +1423,10 @@ again: if (ret < 0) goto out; if (ret == 0) { - /* This shouldn't happen, indicates a bug or fs corruption. */ - ASSERT(ret != 0); + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto out; } @@ -1451,7 +1441,8 @@ again: */ delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); + head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs, + ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -2225,6 +2216,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + return -EUCLEAN; + } ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { @@ -2247,7 +2245,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, eb = path->nodes[0]; item_size = btrfs_item_size(eb, path->slots[0]); - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); @@ -2626,7 +2623,7 @@ static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, - fs_root->root_key.objectid); + btrfs_root_id(fs_root)); ret = inode_to_path(parent, name_len, (unsigned long)(iref + 1), eb, ipath); if (ret) @@ -2773,20 +2770,14 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = kvmalloc(alloc_bytes, GFP_KERNEL); + data = kvzalloc(alloc_bytes, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); - if (total_bytes >= sizeof(*data)) { + if (total_bytes >= sizeof(*data)) data->bytes_left = total_bytes - sizeof(*data); - data->bytes_missing = 0; - } else { + else data->bytes_missing = sizeof(*data) - total_bytes; - data->bytes_left = 0; - } - - data->elem_cnt = 0; - data->elem_missed = 0; return data; } @@ -2850,6 +2841,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf return ret; } +static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) +{ + iter->bytenr = 0; + iter->item_ptr = 0; + iter->cur_ptr = 0; + iter->end_ptr = 0; + btrfs_release_path(iter->path); + memset(&iter->cur_key, 0, sizeof(iter->cur_key)); +} + int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; @@ -2868,6 +2869,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) if (ret < 0) return ret; if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto release; } @@ -2938,6 +2943,14 @@ release: return ret; } +static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || + iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) + return true; + return false; +} + /* * Go to the next backref item of current bytenr, can be either inlined or * keyed. @@ -2950,7 +2963,7 @@ release: */ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { - struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct extent_buffer *eb = iter->path->nodes[0]; struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; @@ -3008,9 +3021,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); - INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; @@ -3038,6 +3048,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( return node; } +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); + cache->nr_nodes--; + btrfs_put_root(node->root); + kfree(node); + } +} + struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache) { @@ -3049,6 +3072,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( return edge; } +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node) +{ + if (node->eb) { + btrfs_backref_unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +/* + * Drop the backref node from cache without cleaning up its children + * edges. + * + * This can only be called on node without parent edges. + * The children edges are still kept as is. + */ +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node) +{ + ASSERT(list_empty(&node->upper)); + + btrfs_backref_drop_node_buffer(node); + list_del_init(&node->list); + list_del_init(&node->lower); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + btrfs_backref_free_node(tree, node); +} + /* * Drop the backref node from cache, also cleaning up all its * upper edges and any uncached nodes in the path. @@ -3059,29 +3128,17 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { - struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; if (!node) return; - BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); - upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - - /* - * Add the node to leaf node list if no other child block - * cached. - */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } } btrfs_backref_drop_node(cache, node); @@ -3093,33 +3150,30 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; - int i; - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct btrfs_backref_node, list); + while ((node = rb_entry_safe(rb_first(&cache->rb_root), + struct btrfs_backref_node, rb_node))) btrfs_backref_cleanup_node(cache, node); - } - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct btrfs_backref_node, lower); - btrfs_backref_cleanup_node(cache, node); - } - - cache->last_trans = 0; - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) - ASSERT(list_empty(&cache->pending[i])); ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); - ASSERT(list_empty(&cache->changed)); - ASSERT(list_empty(&cache->detached)); - ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which) +{ + ASSERT(upper && lower && upper->level == lower->level + 1); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + if (link_which & LINK_LOWER) + list_add_tail(&edge->list[LOWER], &lower->upper); + if (link_which & LINK_UPPER) + list_add_tail(&edge->list[UPPER], &upper->lower); +} /* * Handle direct tree backref * @@ -3226,8 +3280,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, root = btrfs_get_fs_root(fs_info, ref_key->offset, false); if (IS_ERR(root)) return PTR_ERR(root); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - cur->cowonly = 1; + + /* We shouldn't be using backref cache for non-shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + return -EUCLEAN; + } if (btrfs_root_level(&root->root_item) == cur->level) { /* Tree root */ @@ -3270,7 +3328,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { btrfs_err(fs_info, "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", - cur->bytenr, level - 1, root->root_key.objectid, + cur->bytenr, level - 1, btrfs_root_id(root), tree_key->objectid, tree_key->type, tree_key->offset); btrfs_put_root(root); ret = -ENOENT; @@ -3313,8 +3371,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, goto out; } upper->owner = btrfs_header_owner(eb); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - upper->cowonly = 1; + + /* We shouldn't be using backref cache for non shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + btrfs_backref_free_node(cache, upper); + ret = -EUCLEAN; + goto out; + } /* * If we know the block isn't shared we can avoid @@ -3428,7 +3493,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, int type; cond_resched(); - eb = btrfs_backref_get_eb(iter); + eb = iter->path->nodes[0]; key.objectid = iter->bytenr; if (btrfs_backref_iter_is_inline_ref(iter)) { @@ -3505,15 +3570,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, ASSERT(start->checked); - /* Insert this node to cache if it's not COW-only */ - if (!start->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, - &start->rb_node); - if (rb_node) - btrfs_backref_panic(cache->fs_info, start->bytenr, - -EEXIST); - list_add_tail(&start->lower, &cache->leaves); - } + rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); /* * Use breadth first search to iterate all related edges. @@ -3552,11 +3611,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, * parents have already been linked. */ if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - list_add_tail(&edge->list[UPPER], &upper->lower); continue; } @@ -3567,23 +3621,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, return -EUCLEAN; } - /* Sanity check, COW-only node has non-COW-only parent */ - if (start->cowonly != upper->cowonly) { - ASSERT(0); + rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (unlikely(rb_node)) { + btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; } - /* Only cache non-COW-only (subvolume trees) tree blocks */ - if (!upper->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - if (rb_node) { - btrfs_backref_panic(cache->fs_info, - upper->bytenr, -EEXIST); - return -EUCLEAN; - } - } - list_add_tail(&edge->list[UPPER], &upper->lower); /* diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ab4ca0eda605..74e614031274 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -6,11 +6,23 @@ #ifndef BTRFS_BACKREF_H #define BTRFS_BACKREF_H -#include <linux/btrfs.h> +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" -#include "ulist.h" +#include "locking.h" #include "disk-io.h" #include "extent_io.h" +#include "ctree.h" + +struct extent_inode_elem; +struct ulist; +struct btrfs_extent_item; +struct btrfs_trans_handle; +struct btrfs_fs_info; /* * Used by implementations of iterate_extent_inodes_t (see definition below) to @@ -271,22 +283,6 @@ struct btrfs_backref_iter { struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); -static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) -{ - if (!iter) - return; - btrfs_free_path(iter->path); - kfree(iter); -} - -static inline struct extent_buffer *btrfs_backref_get_eb( - struct btrfs_backref_iter *iter) -{ - if (!iter) - return NULL; - return iter->path->nodes[0]; -} - /* * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header. @@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr); int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); -static inline bool btrfs_backref_iter_is_inline_ref( - struct btrfs_backref_iter *iter) -{ - if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || - iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) - return true; - return false; -} - -static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) -{ - iter->bytenr = 0; - iter->item_ptr = 0; - iter->cur_ptr = 0; - iter->end_ptr = 0; - btrfs_release_path(iter->path); - memset(&iter->cur_key, 0, sizeof(iter->cur_key)); -} - /* * Backref cache related structures * @@ -341,6 +318,12 @@ struct btrfs_backref_node { u64 bytenr; }; /* Use rb_simple_node for search/insert */ + /* + * This is a sanity check, whenever we COW a block we will update + * new_bytenr with it's current location, and we will check this in + * various places to validate that the cache makes sense, it shouldn't + * be used for anything else. + */ u64 new_bytenr; /* Objectid of tree block owner, can be not uptodate */ u64 owner; @@ -358,10 +341,6 @@ struct btrfs_backref_node { struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; - /* Is the block in a non-shareable tree */ - unsigned int cowonly:1; - /* 1 if no child node is in the cache */ - unsigned int lowest:1; /* Is the extent buffer locked */ unsigned int locked:1; /* Has the block been processed */ @@ -414,12 +393,6 @@ struct btrfs_backref_cache { * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; - /* List of backref nodes with no child node */ - struct list_head leaves; - /* List of blocks that have been COWed in current transaction */ - struct list_head changed; - /* List of detached backref node. */ - struct list_head detached; u64 last_trans; @@ -452,83 +425,22 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( #define LINK_LOWER (1 << 0) #define LINK_UPPER (1 << 1) -static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which) -{ - ASSERT(upper && lower && upper->level == lower->level + 1); - edge->node[LOWER] = lower; - edge->node[UPPER] = upper; - if (link_which & LINK_LOWER) - list_add_tail(&edge->list[LOWER], &lower->upper); - if (link_which & LINK_UPPER) - list_add_tail(&edge->list[UPPER], &upper->lower); -} - -static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, - struct btrfs_backref_node *node) -{ - if (node) { - ASSERT(list_empty(&node->list)); - ASSERT(list_empty(&node->lower)); - ASSERT(node->eb == NULL); - cache->nr_nodes--; - btrfs_put_root(node->root); - kfree(node); - } -} -static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, - struct btrfs_backref_edge *edge) -{ - if (edge) { - cache->nr_edges--; - kfree(edge); - } -} - -static inline void btrfs_backref_unlock_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } -} - -static inline void btrfs_backref_drop_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->eb) { - btrfs_backref_unlock_node_buffer(node); - free_extent_buffer(node->eb); - node->eb = NULL; - } -} - -/* - * Drop the backref node from cache without cleaning up its children - * edges. - * - * This can only be called on node without parent edges. - * The children edges are still kept as is. - */ -static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, - struct btrfs_backref_node *node) -{ - ASSERT(list_empty(&node->upper)); - - btrfs_backref_drop_node_buffer(node); - list_del_init(&node->list); - list_del_init(&node->lower); - if (!RB_EMPTY_NODE(&node->rb_node)) - rb_erase(&node->rb_node, &tree->rb_root); - btrfs_backref_free_node(tree, node); -} +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which); +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge); +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node); +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node); void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node); void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 928f512cdb4a..bc2555c44a12 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -11,7 +11,6 @@ #include "raid56.h" #include "async-thread.h" #include "dev-replace.h" -#include "rcu-string.h" #include "zoned.h" #include "file-item.h" #include "raid-stripe-tree.h" @@ -30,7 +29,7 @@ struct btrfs_failed_bio { /* Is this a data path I/O that needs storage layer checksum and repair? */ static inline bool is_data_bbio(struct btrfs_bio *bbio) { - return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); + return bbio->inode && is_data_inode(bbio->inode); } static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) @@ -50,11 +49,12 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); } /* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. @@ -74,20 +74,16 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, - u64 map_length, bool use_append) + u64 map_length) { struct btrfs_bio *bbio; struct bio *bio; - if (use_append) { - unsigned int nr_segs; + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + if (IS_ERR(bio)) + return ERR_CAST(bio); - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, - &btrfs_clone_bioset, map_length); - } else { - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, - GFP_NOFS, &btrfs_clone_bioset); - } bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -124,43 +120,26 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} - -static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. - */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - -static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) -{ if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; } - if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -180,7 +159,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -212,7 +191,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return; } @@ -281,7 +260,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return fbio; } @@ -327,7 +306,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -361,7 +340,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -379,9 +358,9 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } } @@ -395,7 +374,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -422,10 +401,10 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -436,7 +415,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } @@ -474,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. + */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -503,14 +490,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, - struct btrfs_io_stripe *smap, int mirror_num) +static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; - if (bio_op(bio) != REQ_OP_READ) - btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; if (bio_op(bio) != REQ_OP_READ) btrfs_bio(bio)->orig_physical = smap->physical; @@ -596,7 +581,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); return; } @@ -606,13 +591,25 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) * context. This changes nothing when cgroups aren't in use. */ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; - __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } static bool should_async_write(struct btrfs_bio *bbio) { + bool auto_csum_mode = true; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); + + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) + return false; + + auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); +#endif + /* Submit synchronously if the checksum implementation is fast. */ - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) return false; /* @@ -655,11 +652,29 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } +static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) +{ + unsigned int nr_segs; + int sector_offset; + + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } + return map_length; +} + static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; - struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -670,22 +685,34 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; - smap.is_scrub = !bbio->inode; + if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + smap.rst_search_commit_root = true; + else + smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); - goto fail; + btrfs_bio_counter_dec(fs_info); + goto end_bbio; } map_length = min(map_length, length); if (use_append) - map_length = min(map_length, fs_info->max_zone_append_size); + map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + struct btrfs_bio *split; + + split = btrfs_split_bio(fs_info, bbio, map_length); + if (IS_ERR(split)) { + ret = errno_to_blk_status(PTR_ERR(split)); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; + } + bbio = split; bio = &bbio->bio; } @@ -697,7 +724,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) - goto fail_put_bio; + goto fail; } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { @@ -706,8 +733,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } - if (is_data_bbio(bbio) && bioc && - btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to * the list in the I/O submission path, and list @@ -723,7 +749,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * point, so they are handled as part of the no-checksum case. */ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) @@ -731,29 +757,41 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) - goto fail_put_bio; - } else if (use_append) { + goto fail; + } else if (use_append || + (btrfs_is_zoned(fs_info) && inode && + inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) - goto fail_put_bio; + goto fail; } } - __btrfs_submit_bio(bio, bioc, &smap, mirror_num); + btrfs_submit_bio(bio, bioc, &smap, mirror_num); done: return map_length == length; -fail_put_bio: - if (map_length < length) - btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(orig_bbio, ret); + /* + * We have split the original bbio, now we have to end both the current + * @bbio and remaining one, as the remaining one will never be submitted. + */ + if (map_length < length) { + struct btrfs_bio *remaining = bbio->private; + + ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); + ASSERT(remaining); + + btrfs_bio_end_io(remaining, ret); + } +end_bbio: + btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; } -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); @@ -765,7 +803,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) /* * Submit a repair write. * - * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * @@ -854,7 +892,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_ ASSERT(smap.dev == fs_info->dev_replace.srcdev); smap.dev = fs_info->dev_replace.tgtdev; } - __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); return; fail: diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index bbaed317161a..e2fe16074ad6 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -7,12 +7,14 @@ #ifndef BTRFS_BIO_H #define BTRFS_BIO_H +#include <linux/types.h> #include <linux/bio.h> #include <linux/workqueue.h> #include "tree-checker.h" struct btrfs_bio; struct btrfs_fs_info; +struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -27,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and - * passed to btrfs_submit_bio for mapping to the physical devices. + * passed to btrfs_submit_bbio() for mapping to the physical devices. */ struct btrfs_bio { /* @@ -40,7 +42,7 @@ struct btrfs_bio { union { /* * For data reads: checksumming and original I/O information. - * (for internal use in the btrfs_submit_bio machinery only) + * (for internal use in the btrfs_submit_bbio() machinery only) */ struct { u8 *csum; @@ -77,6 +79,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info; + /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -102,7 +107,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct folio *folio, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 378d9103a207..c0a8f7d92acc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -23,7 +23,7 @@ #include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -40,9 +40,9 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) * * Should be called with balance_lock held */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; + const struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; if (!bctl) @@ -173,43 +173,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) } } +static int btrfs_bg_start_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_block_group *new_bg = + rb_entry(new, struct btrfs_block_group, cache_node); + const struct btrfs_block_group *exist_bg = + rb_entry(exist, struct btrfs_block_group, cache_node); + + if (new_bg->start < exist_bg->start) + return -1; + if (new_bg->start > exist_bg->start) + return 1; + return 0; +} + /* * This adds the block group to the fs_info rb tree for the block group cache */ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group *cache; - bool leftmost = true; + struct rb_node *exist; + int ret = 0; ASSERT(block_group->length != 0); write_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_root.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group, cache_node); - if (block_group->start < cache->start) { - p = &(*p)->rb_left; - } else if (block_group->start > cache->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color_cached(&block_group->cache_node, - &info->block_group_cache_tree, leftmost); + exist = rb_find_add_cached(&block_group->cache_node, + &info->block_group_cache_tree, btrfs_bg_start_cmp); + if (exist) + ret = -EEXIST; write_unlock(&info->block_group_cache_lock); - return 0; + return ret; } /* @@ -418,7 +416,7 @@ struct btrfs_caching_control *btrfs_get_caching_control( return ctl; } -void btrfs_put_caching_control(struct btrfs_caching_control *ctl) +static void btrfs_put_caching_control(struct btrfs_caching_control *ctl) { if (refcount_dec_and_test(&ctl->count)) kfree(ctl); @@ -1022,6 +1020,13 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) } } +static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + static int remove_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_block_group *block_group) @@ -1063,7 +1068,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, map->start); - BUG_ON(!block_group); + if (!block_group) + return -ENOENT; + BUG_ON(!block_group->ro); trace_btrfs_remove_block_group(block_group); @@ -1214,8 +1221,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - block_group->space_info->bytes_zone_unusable -= - block_group->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, + -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1387,7 +1394,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - sinfo->bytes_zone_unusable -= cache->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -1405,9 +1412,9 @@ out: } static bool clean_pinned_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *bg) + const struct btrfs_block_group *bg) { - struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; @@ -1429,7 +1436,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, * group in pinned_extents before we were able to clear the whole block * group range from pinned_extents. This means that task can lookup for * the block group after we unpinned it from pinned_extents and removed - * it, leading to a BUG_ON() at unpin_extent_range(). + * it, leading to an error at unpin_extent_range(). */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { @@ -1522,6 +1529,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. + * + * Also bail out if this is the only block group for its + * type, because otherwise we would lose profile + * information from fs_info->avail_*_alloc_bits and the + * next block group of this type would be created with a + * "single" profile (even if we're in a raid fs) because + * fs_info->avail_*_alloc_bits would be 0. */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1550,7 +1564,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. */ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used) { + if (space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the @@ -1627,8 +1642,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); + btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; block_group->pinned = 0; @@ -1738,33 +1752,30 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } -static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; } -static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed) { - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info); + u64 thresh_bytes = mult_perc(bg->length, thresh_pct); const u64 new_val = bg->used; const u64 old_val = new_val + bytes_freed; - u64 thresh; - if (reclaim_thresh == 0) + if (thresh_bytes == 0) return false; - thresh = mult_perc(bg->length, reclaim_thresh); - /* * If we were below the threshold before don't reclaim, we are likely a * brand new block group and we don't want to relocate new block groups. */ - if (old_val < thresh) + if (old_val < thresh_bytes) return false; - if (new_val >= thresh) + if (new_val >= thresh_bytes) return false; return true; } @@ -1775,6 +1786,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; + LIST_HEAD(retry_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1811,6 +1823,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; + u64 reclaimed; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1824,6 +1837,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) /* Don't race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); + spin_lock(&space_info->lock); spin_lock(&bg->lock); if (bg->reserved || bg->pinned || bg->ro) { /* @@ -1833,6 +1847,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) * this block group. */ spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } @@ -1851,6 +1866,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_mark_bg_unused(bg); spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; @@ -1867,10 +1883,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!should_reclaim_block_group(bg, bg->length)) { spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); /* * Get out fast, in case we're read-only or unmounting the @@ -1903,16 +1921,39 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(bg->used * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); + reclaimed = bg->used; ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); + reclaimed = 0; + spin_lock(&space_info->lock); + space_info->reclaim_errors++; + if (READ_ONCE(space_info->periodic_reclaim)) + space_info->periodic_reclaim_ready = false; + spin_unlock(&space_info->lock); } + spin_lock(&space_info->lock); + space_info->reclaim_count++; + space_info->reclaim_bytes += reclaimed; + spin_unlock(&space_info->lock); next: - if (ret) - btrfs_mark_bg_to_reclaim(bg); + if (ret && !READ_ONCE(space_info->periodic_reclaim)) { + /* Refcount held by the reclaim_bgs list after splice. */ + spin_lock(&fs_info->unused_bgs_lock); + /* + * This block group might be added to the unused list + * during the above process. Move it back to the + * reclaim list otherwise. + */ + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &retry_list); + } + spin_unlock(&fs_info->unused_bgs_lock); + } btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1932,12 +1973,16 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); end: + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->reclaim_bgs); + spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) { + btrfs_reclaim_sweep(fs_info); spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); @@ -1957,8 +2002,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } -static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct btrfs_path *path) +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, + const struct btrfs_path *path) { struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; @@ -2006,7 +2051,7 @@ out_free_map: static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; @@ -2591,8 +2636,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, } static int insert_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 chunk_offset, - u64 start, u64 num_bytes) + const struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -2623,7 +2668,6 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -2748,7 +2792,7 @@ next: * uncompressed data size, because the compression is only done * when writeback triggered and we don't know how much space we * are actually going to need, so we reserve the uncompressed - * size because the data may be uncompressible in the worst case. + * size because the data may be incompressible in the worst case. */ if (ret == 0) { bool used; @@ -2768,7 +2812,7 @@ next: * For extent tree v2 we use the block_group_item->chunk_offset to point at our * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. */ -static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset) { u64 div = SZ_1G; u64 index; @@ -3008,9 +3052,10 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ cache->zone_unusable = - (cache->alloc_offset - cache->used) + + (cache->alloc_offset - cache->used - cache->pinned - + cache->reserved) + (cache->length - cache->zone_capacity); - sinfo->bytes_zone_unusable += cache->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - @@ -3072,7 +3117,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); /* @@ -3636,20 +3680,25 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, old_val += num_bytes; cache->used = old_val; cache->reserved -= num_bytes; + cache->reclaim_mark = 0; space_info->bytes_reserved -= num_bytes; space_info->bytes_used += num_bytes; space_info->disk_used += num_bytes * factor; + if (READ_ONCE(space_info->periodic_reclaim)) + btrfs_space_info_update_reclaimable(space_info, -num_bytes); spin_unlock(&cache->lock); spin_unlock(&space_info->lock); } else { old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; - - reclaim = should_reclaim_block_group(cache, num_bytes); + if (READ_ONCE(space_info->periodic_reclaim)) + btrfs_space_info_update_reclaimable(space_info, num_bytes); + else + reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -3725,8 +3774,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved += num_bytes; trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; @@ -3763,6 +3811,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) space_info->bytes_readonly += num_bytes; + else if (btrfs_is_zoned(cache->fs_info)) + space_info->bytes_zone_unusable += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; @@ -3786,8 +3836,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) +static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4162,7 +4212,7 @@ out: return ret; } -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; @@ -4303,13 +4353,13 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) spin_lock(&block_group->lock); if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { - struct inode *inode = block_group->inode; + struct btrfs_inode *inode = block_group->inode; block_group->inode = NULL; spin_unlock(&block_group->lock); ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); + iput(&inode->vfs_inode); } else { spin_unlock(&block_group->lock); } @@ -4566,7 +4616,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, return 0; } -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) { if (btrfs_is_zoned(bg->fs_info)) return false; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 962b11983901..36937eeab9b8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -3,9 +3,22 @@ #ifndef BTRFS_BLOCK_GROUP_H #define BTRFS_BLOCK_GROUP_H +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/wait.h> +#include <linux/sizes.h> +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs_tree.h> #include "free-space-cache.h" struct btrfs_chunk_map; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_trans_handle; enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, @@ -102,7 +115,7 @@ struct btrfs_caching_control { struct btrfs_block_group { struct btrfs_fs_info *fs_info; - struct inode *inode; + struct btrfs_inode *inode; spinlock_t lock; u64 start; u64 length; @@ -250,9 +263,10 @@ struct btrfs_block_group { struct work_struct zone_finish_work; struct extent_buffer *last_eb; enum btrfs_block_group_size_class size_class; + u64 reclaim_mark; }; -static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } @@ -264,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); } -static inline bool btrfs_is_block_group_data_only( - struct btrfs_block_group *block_group) +static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the @@ -276,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( @@ -297,7 +310,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); -void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); int btrfs_add_new_free_space(struct btrfs_block_group *block_group, @@ -357,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || @@ -374,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 1043a8142351..3f3608299c0b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,7 +6,6 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" -#include "disk-io.h" #include "fs.h" #include "accessors.h" @@ -151,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, - space_info, - num_bytes); + btrfs_space_info_free_bytes_may_use(space_info, num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -342,9 +339,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) read_lock(&fs_info->global_root_lock); rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, rb_node) { - if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID || - root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || - root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) { num_bytes += btrfs_root_used(&root->root_item); min_items++; } @@ -384,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, num_bytes); block_rsv->reserved = block_rsv->size; } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - -num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; btrfs_try_granting_tickets(fs_info, sinfo); } @@ -407,7 +402,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - switch (root->root_key.objectid) { + switch (btrfs_root_id(root)) { case BTRFS_CSUM_TREE_OBJECTID: case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: @@ -469,8 +464,7 @@ static struct btrfs_block_rsv *get_block_rsv( if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || (root == fs_info->uuid_root) || - (trans->adding_csums && - root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID)) + (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -555,7 +549,7 @@ try_reserve: return ERR_PTR(ret); } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { u64 needed_bytes; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 43a9a6b5a79f..d12b1fac5c74 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -3,8 +3,15 @@ #ifndef BTRFS_BLOCK_RSV_H #define BTRFS_BLOCK_RSV_H +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/spinlock.h> + struct btrfs_trans_handle; struct btrfs_root; +struct btrfs_space_info; +struct btrfs_block_rsv; +struct btrfs_fs_info; enum btrfs_reserve_flush_enum; /* @@ -82,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7f7c5a92d2b8..b2fa33911c28 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,13 +8,31 @@ #include <linux/hash.h> #include <linux/refcount.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/compiler.h> #include <linux/fscrypt.h> +#include <linux/lockdep.h> +#include <uapi/linux/btrfs_tree.h> #include <trace/events/btrfs.h> +#include "block-rsv.h" #include "extent_map.h" #include "extent_io.h" +#include "extent-io-tree.h" #include "ordered-data.h" #include "delayed-inode.h" +struct extent_state; +struct posix_acl; +struct iov_iter; +struct writeback_control; +struct btrfs_root; +struct btrfs_fs_info; +struct btrfs_trans_handle; + /* * Since we search a directory based on f_pos (struct dir_context::pos) we have * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so @@ -41,7 +59,6 @@ enum { */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, - BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, /* @@ -71,6 +88,39 @@ enum { BTRFS_INODE_FREE_SPACE_INODE, /* Set when there are no capabilities in XATTs for the inode. */ BTRFS_INODE_NO_CAP_XATTR, + /* + * Set if an error happened when doing a COW write before submitting a + * bio or during writeback. Used for both buffered writes and direct IO + * writes. This is to signal a fast fsync that it has to wait for + * ordered extents to complete and therefore not log extent maps that + * point to unwritten extents (when an ordered extent completes and it + * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its + * range). + */ + BTRFS_INODE_COW_WRITE_ERROR, + /* + * Indicate this is a directory that points to a subvolume for which + * there is no root reference item. That's a case like the following: + * + * $ btrfs subvolume create /mnt/parent + * $ btrfs subvolume create /mnt/parent/child + * $ btrfs subvolume snapshot /mnt/parent /mnt/snap + * + * If subvolume "parent" is root 256, subvolume "child" is root 257 and + * snapshot "snap" is root 258, then there's no root reference item (key + * BTRFS_ROOT_REF_KEY in the root tree) for the subvolume "child" + * associated to root 258 (the snapshot) - there's only for the root + * of the "parent" subvolume (root 256). In the chunk root we have a + * (256 BTRFS_ROOT_REF_KEY 257) key but we don't have a + * (258 BTRFS_ROOT_REF_KEY 257) key - the sames goes for backrefs, we + * have a (257 BTRFS_ROOT_BACKREF_KEY 256) but we don't have a + * (257 BTRFS_ROOT_BACKREF_KEY 258) key. + * + * So when opening the "child" dentry from the snapshot's directory, + * we don't find a root ref item and we create a stub inode. This is + * done at new_simple_dir(), called from btrfs_lookup_dentry(). + */ + BTRFS_INODE_ROOT_STUB, }; /* in memory btrfs inode */ @@ -78,10 +128,14 @@ struct btrfs_inode { /* which subvolume this inode belongs to */ struct btrfs_root *root; - /* key used to find this inode on disk. This is used by the code - * to read in roots of subvolumes +#if BITS_PER_LONG == 32 + /* + * The objectid of the corresponding BTRFS_INODE_ITEM_KEY. + * On 64 bits platforms we can get it from vfs_inode.i_ino, which is an + * unsigned long and therefore 64 bits on such platforms. */ - struct btrfs_key location; + u64 objectid; +#endif /* Cached value of inode property 'compression'. */ u8 prop_compress; @@ -98,6 +152,7 @@ struct btrfs_inode { * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes, * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to * update the VFS' inode number of bytes used. + * Also protects setting struct file::private_data. */ spinlock_t lock; @@ -137,9 +192,6 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; - /* node for the red-black tree that links inodes in subvolume root */ - struct rb_node rb_node; - unsigned long runtime_flags; /* full 64 bit generation number, struct vfs_inode doesn't have a big @@ -200,11 +252,20 @@ struct btrfs_inode { u64 last_dir_index_offset; }; - /* - * Total number of bytes pending defrag, used by stat to check whether - * it needs COW. Protected by 'lock'. - */ - u64 defrag_bytes; + union { + /* + * Total number of bytes pending defrag, used by stat to check whether + * it needs COW. Protected by 'lock'. + * Used by inodes other than the data relocation inode. + */ + u64 defrag_bytes; + + /* + * Logical address of the block group being relocated. + * Used only by the data relocation inode. + */ + u64 reloc_block_group_start; + }; /* * The size of the file stored in the metadata on disk. data=ordered @@ -213,12 +274,21 @@ struct btrfs_inode { */ u64 disk_i_size; - /* - * If this is a directory then index_cnt is the counter for the index - * number for new files that are created. For an empty directory, this - * must be initialized to BTRFS_DIR_START_INDEX. - */ - u64 index_cnt; + union { + /* + * If this is a directory then index_cnt is the counter for the + * index number for new files that are created. For an empty + * directory, this must be initialized to BTRFS_DIR_START_INDEX. + */ + u64 index_cnt; + + /* + * If this is not a directory, this is the number of bytes + * outstanding that are going to need csums. This is used in + * ENOSPC accounting. Protected by 'lock'. + */ + u64 csum_bytes; + }; /* Cache the directory index number to speed the dir/file remove */ u64 dir_index; @@ -230,22 +300,25 @@ struct btrfs_inode { */ u64 last_unlink_trans; - /* - * The id/generation of the last transaction where this inode was - * either the source or the destination of a clone/dedupe operation. - * Used when logging an inode to know if there are shared extents that - * need special care when logging checksum items, to avoid duplicate - * checksum items in a log (which can lead to a corruption where we end - * up with missing checksum ranges after log replay). - * Protected by the vfs inode lock. - */ - u64 last_reflink_trans; + union { + /* + * The id/generation of the last transaction where this inode + * was either the source or the destination of a clone/dedupe + * operation. Used when logging an inode to know if there are + * shared extents that need special care when logging checksum + * items, to avoid duplicate checksum items in a log (which can + * lead to a corruption where we end up with missing checksum + * ranges after log replay). Protected by the VFS inode lock. + * Used for regular files only. + */ + u64 last_reflink_trans; - /* - * Number of bytes outstanding that are going to need csums. This is - * used in ENOSPC accounting. Protected by 'lock'. - */ - u64 csum_bytes; + /* + * In case this a root stub inode (BTRFS_INODE_ROOT_STUB flag set), + * the ID of that root. + */ + u64 ref_root_id; + }; /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; @@ -278,10 +351,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, WRITE_ONCE(inode->first_dir_index_to_log, index); } -static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) -{ - return container_of(inode, struct btrfs_inode, vfs_inode); -} +/* Type checked and const-preserving VFS inode -> btrfs inode. */ +#define BTRFS_I(_inode) \ + _Generic(_inode, \ + struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \ + const struct inode *: (const struct btrfs_inode *)container_of( \ + _inode, const struct btrfs_inode, vfs_inode)) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) @@ -303,10 +378,9 @@ static inline unsigned long btrfs_inode_hash(u64 objectid, */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { - u64 ino = inode->location.objectid; + u64 ino = inode->objectid; - /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ - if (inode->location.type == BTRFS_ROOT_ITEM_KEY) + if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags)) ino = inode->vfs_inode.i_ino; return ino; } @@ -320,20 +394,36 @@ static inline u64 btrfs_ino(const struct btrfs_inode *inode) #endif +static inline void btrfs_get_inode_key(const struct btrfs_inode *inode, + struct btrfs_key *key) +{ + key->objectid = btrfs_ino(inode); + key->type = BTRFS_INODE_ITEM_KEY; + key->offset = 0; +} + +static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino) +{ +#if BITS_PER_LONG == 32 + inode->objectid = ino; +#endif + inode->vfs_inode.i_ino = ino; +} + static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); inode->disk_i_size = size; } -static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) +static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode) { return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } -static inline bool is_data_inode(struct inode *inode) +static inline bool is_data_inode(const struct btrfs_inode *inode) { - return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; + return btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID; } static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, @@ -363,9 +453,11 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) } /* - * Should be called while holding the inode's VFS lock in exclusive mode or in a - * context where no one else can access the inode concurrently (during inode - * creation or when loading an inode from disk). + * Should be called while holding the inode's VFS lock in exclusive mode, or + * while holding the inode's mmap lock (struct btrfs_inode::i_mmap_lock) in + * either shared or exclusive mode, or in a context where no one else can access + * the inode concurrently (during inode creation or when loading an inode from + * disk). */ static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) { @@ -416,6 +508,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } +static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) +{ + /* Immediately trigger a crash if the inode is not locked. */ + ASSERT(inode_is_locked(&inode->vfs_inode)); + /* Trigger a splat in dmesg if this task is not holding the lock. */ + lockdep_assert_held(&inode->vfs_inode.i_rwsem); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes @@ -425,10 +525,10 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); + struct btrfs_file_extent *file_extent, + bool nowait); -void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -477,8 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -486,12 +584,11 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); + struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, @@ -509,24 +606,27 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); +int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size); +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); -struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino); extern const struct dentry_operations btrfs_dentry_operations; @@ -542,5 +642,10 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes); +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, + const struct btrfs_file_extent *file_extent, + int type); #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 68345f73d429..0c4d486c3048 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -25,8 +25,6 @@ #include "misc.h" #include "ctree.h" #include "fs.h" -#include "disk-io.h" -#include "transaction.h" #include "btrfs_inode.h" #include "bio.h" #include "ordered-data.h" @@ -34,8 +32,7 @@ #include "extent_io.h" #include "extent_map.h" #include "subpage.h" -#include "zoned.h" -#include "file-item.h" +#include "messages.h" #include "super.h" static struct bio_set btrfs_compressed_bioset; @@ -93,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) } static int compression_compress_pages(int type, struct list_head *ws, - struct address_space *mapping, u64 start, struct page **pages, - unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out) + struct address_space *mapping, u64 start, + struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { switch (type) { case BTRFS_COMPRESS_ZLIB: - return zlib_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zlib_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_LZO: - return lzo_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return lzo_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_ZSTD: - return zstd_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zstd_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_NONE: default: /* @@ -118,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws, * Not a big deal, just need to inform caller that we * haven't allocated any pages yet. */ - *out_pages = 0; + *out_folios = 0; return -E2BIG; } } @@ -141,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, + const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: @@ -161,11 +158,11 @@ static int compression_decompress(int type, struct list_head *ws, } } -static void btrfs_free_compressed_pages(struct compressed_bio *cb) +static void btrfs_free_compressed_folios(struct compressed_bio *cb) { - for (unsigned int i = 0; i < cb->nr_pages; i++) - btrfs_free_compr_page(cb->compressed_pages[i]); - kfree(cb->compressed_pages); + for (unsigned int i = 0; i < cb->nr_folios; i++) + btrfs_free_compr_folio(cb->compressed_folios[i]); + kfree(cb->compressed_folios); } static int btrfs_decompress_bio(struct compressed_bio *cb); @@ -226,25 +223,25 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co /* * Common wrappers for page allocation from compression wrappers */ -struct page *btrfs_alloc_compr_page(void) +struct folio *btrfs_alloc_compr_folio(void) { - struct page *page = NULL; + struct folio *folio = NULL; spin_lock(&compr_pool.lock); if (compr_pool.count > 0) { - page = list_first_entry(&compr_pool.list, struct page, lru); - list_del_init(&page->lru); + folio = list_first_entry(&compr_pool.list, struct folio, lru); + list_del_init(&folio->lru); compr_pool.count--; } spin_unlock(&compr_pool.lock); - if (page) - return page; + if (folio) + return folio; - return alloc_page(GFP_NOFS); + return folio_alloc(GFP_NOFS, 0); } -void btrfs_free_compr_page(struct page *page) +void btrfs_free_compr_folio(struct folio *folio) { bool do_free = false; @@ -252,7 +249,7 @@ void btrfs_free_compr_page(struct page *page) if (compr_pool.count > compr_pool.thresh) { do_free = true; } else { - list_add(&page->lru, &compr_pool.list); + list_add(&folio->lru, &compr_pool.list); compr_pool.count++; } spin_unlock(&compr_pool.lock); @@ -260,11 +257,11 @@ void btrfs_free_compr_page(struct page *page) if (!do_free) return; - ASSERT(page_ref_count(page) == 1); - put_page(page); + ASSERT(folio_ref_count(folio) == 1); + folio_put(folio); } -static void end_bbio_comprssed_read(struct btrfs_bio *bbio) +static void end_bbio_compressed_read(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); blk_status_t status = bbio->bio.bi_status; @@ -272,7 +269,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio) if (!status) status = errno_to_blk_status(btrfs_decompress_bio(cb)); - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); btrfs_bio_end_io(cb->orig_bbio, status); bio_put(&bbio->bio); } @@ -284,7 +281,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio) static noinline void end_compressed_writeback(const struct compressed_bio *cb) { struct inode *inode = &cb->bbio.inode->vfs_inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; @@ -326,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) end_compressed_writeback(cb); /* Note, our inode could be gone now */ - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); bio_put(&cb->bbio.bio); } @@ -337,7 +334,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) * This also calls the writeback end hooks for the file pages so that metadata * and checksums can be updated in the file. */ -static void end_bbio_comprssed_write(struct btrfs_bio *bbio) +static void end_bbio_compressed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; @@ -345,17 +342,19 @@ static void end_bbio_comprssed_write(struct btrfs_bio *bbio) queue_work(fs_info->compressed_write_workers, &cb->write_end_work); } -static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) +static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { struct bio *bio = &cb->bbio.bio; u32 offset = 0; while (offset < cb->compressed_len) { + int ret; u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); /* Maximum compressed extent is smaller than bio size limit. */ - __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], - len, 0); + ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], + len, 0); + ASSERT(ret); offset += len; } } @@ -370,12 +369,12 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) * the end io hooks. */ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, + struct folio **compressed_folios, + unsigned int nr_folios, blk_opf_t write_flags, bool writeback) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct compressed_bio *cb; @@ -384,19 +383,19 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb = alloc_compressed_bio(inode, ordered->file_offset, REQ_OP_WRITE | write_flags, - end_bbio_comprssed_write); + end_bbio_compressed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; - cb->compressed_pages = compressed_pages; + cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); - cb->nr_pages = nr_pages; + cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); } /* @@ -415,13 +414,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, struct compressed_bio *cb, int *memstall, unsigned long *pflags) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long end_index; struct bio *orig_bio = &cb->orig_bbio->bio; u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; - struct page *page; + struct folio *folio; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; @@ -441,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * This makes readahead less effective, so here disable readahead for * subpage for now, until full compressed write is supported. */ - if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) + if (fs_info->sectorsize < PAGE_SIZE) return 0; end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; @@ -454,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - page = xa_load(&mapping->i_pages, pg_index); - if (page && !xa_is_value(page)) { - sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + folio = filemap_get_folio(mapping, pg_index); + if (!IS_ERR(folio)) { + u64 folio_sz = folio_size(folio); + u64 offset = offset_in_folio(folio, cur); + + folio_put(folio); + sectors_missed += (folio_sz - offset) >> fs_info->sectorsize_bits; /* Beyond threshold, no need to continue */ @@ -467,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode, * Jump to next page start as we already have page for * current offset. */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += (folio_sz - offset); continue; } - page = __page_cache_alloc(mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, + ~__GFP_FS), 0); + if (!folio) break; - if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - put_page(page); + if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { /* There is already a page, skip to page end */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += folio_size(folio); + folio_put(folio); continue; } - if (!*memstall && PageWorkingset(page)) { + if (!*memstall && folio_test_workingset(folio)) { psi_memstall_enter(pflags); *memstall = 1; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } - page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); @@ -508,31 +511,32 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ if (!em || cur < em->start || (cur + fs_info->sectorsize > extent_map_end(em)) || - (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { + (extent_map_block_start(em) >> SECTOR_SHIFT) != + orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } + add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); - if (page->index == end_index) { - size_t zero_offset = offset_in_page(isize); + if (folio->index == end_index) { + size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { int zeros; - zeros = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, zeros); + zeros = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, zeros); } } - add_size = min(em->start + em->len, page_end + 1) - cur; - ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); - if (ret != add_size) { - unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + if (!bio_add_folio(orig_bio, folio, add_size, + offset_in_folio(folio, cur))) { + folio_unlock(folio); + folio_put(folio); break; } /* @@ -541,9 +545,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page_folio(page), - cur, add_size); - put_page(page); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); + folio_put(folio); cur += add_size; } return 0; @@ -586,12 +589,12 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) } ASSERT(extent_map_is_compressed(em)); - compressed_len = em->block_len; + compressed_len = em->disk_num_bytes; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, - end_bbio_comprssed_read); + end_bbio_compressed_read); - cb->start = em->orig_start; + cb->start = em->start - em->offset; em_len = em->len; em_start = em->start; @@ -602,14 +605,14 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) free_extent_map(em); - cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); - if (!cb->compressed_pages) { + cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_folios) { ret = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0); + ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); if (ret2) { ret = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -621,16 +624,16 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) /* include any pages we added in add_ra-bio_pages */ cb->len = bbio->bio.bi_iter.bi_size; cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); return; out_free_compressed_pages: - kfree(cb->compressed_pages); + kfree(cb->compressed_folios); out_free_bio: bio_put(&cb->bbio.bio); out: @@ -698,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(unsigned int level) +static struct list_head *alloc_heuristic_ws(void) { struct heuristic_ws *ws; @@ -740,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { static struct list_head *alloc_workspace(int type, unsigned int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); default: /* @@ -977,6 +980,29 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) return level; } +/* Wrapper around find_get_page(), with extra error message. */ +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret) +{ + struct folio *in_folio; + + /* + * The compressed write path should have the folio locked already, thus + * we only need to grab one reference. + */ + in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT); + if (IS_ERR(in_folio)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_crit(inode->root->fs_info, + "failed to get page cache, root %lld ino %llu file offset %llu", + btrfs_root_id(inode->root), btrfs_ino(inode), start); + return -ENOENT; + } + *in_folio_ret = in_folio; + return 0; +} + /* * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. @@ -997,21 +1023,22 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); + const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); - ret = compression_compress_pages(type, workspace, mapping, start, pages, - out_pages, total_in, total_out); + ret = compression_compress_pages(type, workspace, mapping, start, folios, + out_folios, total_in, total_out); + /* The total read-in bytes should be no larger than the input. */ + ASSERT(*total_in <= orig_len); put_workspace(type, workspace); return ret; } @@ -1036,10 +1063,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); struct list_head *workspace; const u32 sectorsize = fs_info->sectorsize; int ret; @@ -1052,7 +1079,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); workspace = get_workspace(type, 0); - ret = compression_decompress(type, workspace, data_in, dest_page, + ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); put_workspace(type, workspace); @@ -1479,11 +1506,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, /* * Compression heuristic. * - * For now is's a naive and optimistic 'return true', we'll extend the logic to - * quickly (compared to direct compression) detect data characteristics - * (compressible/incompressible) to avoid wasting CPU time on incompressible - * data. - * * The following types of analysis can be performed: * - detect mostly zero data * - detect data with low "byte set" size (text, etc) @@ -1491,7 +1513,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, * * Return non-zero if the compression should be done, 0 otherwise. */ -int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) +int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) { struct list_head *ws_list = get_workspace(0, 0); struct heuristic_ws *ws; @@ -1501,7 +1523,7 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) ws = list_entry(ws_list, struct heuristic_ws, list); - heuristic_collect_sample(inode, start, end, ws); + heuristic_collect_sample(&inode->vfs_inode, start, end, ws); if (sample_repeated_patterns(ws)) { ret = 1; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index afd7e50d073d..954034086d0d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -7,10 +7,18 @@ #define BTRFS_COMPRESSION_H #include <linux/sizes.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/wait.h> #include "bio.h" +struct address_space; +struct page; +struct inode; struct btrfs_inode; struct btrfs_ordered_extent; +struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -32,14 +40,12 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 -struct page; - struct compressed_bio { - /* Number of compressed pages in the array */ - unsigned int nr_pages; + /* Number of compressed folios in the array. */ + unsigned int nr_folios; - /* the pages with the compressed data on them */ - struct page **compressed_pages; + /* The folios with the compressed data on them. */ + struct folio **compressed_folios; /* starting offset in the inode for our pages */ u64 start; @@ -76,30 +82,35 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } +/* @range_end must be exclusive. */ +static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +{ + u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + + return min(range_end, page_end) - cur; +} + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out); -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out); +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, - blk_opf_t write_flags, - bool writeback); + struct folio **compressed_folios, + unsigned int nr_folios, blk_opf_t write_flags, + bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); -struct page *btrfs_alloc_compr_page(void); -void btrfs_free_compr_page(struct page *page); +struct folio *btrfs_alloc_compr_folio(void); +void btrfs_free_compr_folio(struct folio *folio); enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, @@ -141,35 +152,38 @@ extern const struct btrfs_compress_op btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); -int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end); + +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret); -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); struct list_head *zlib_get_workspace(unsigned int level); -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); +struct list_head *lzo_alloc_workspace(void); void lzo_free_workspace(struct list_head *ws); -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e65e012bac55..3dc5a35dd19b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); - -static const struct btrfs_csums { - u16 size; - const char name[10]; - const char driver[12]; -} btrfs_csums[] = { - [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, - [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, - [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", - .driver = "blake2b-256" }, -}; - /* * The leaf data grows from end-to-front in the node. this returns the address * of the start of the last item, which is the stop of the leaf data stack. @@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } -/* This exists for btrfs-progs usages. */ -u16 btrfs_csum_type_size(u16 type) -{ - return btrfs_csums[type].size; -} - -int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_type_size(t); -} - -const char *btrfs_super_csum_name(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].name; -} - -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name - */ -const char *btrfs_super_csum_driver(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver[0] ? - btrfs_csums[csum_type].driver : - btrfs_csums[csum_type].name; -} - -size_t __attribute_const__ btrfs_get_num_csums(void) -{ - return ARRAY_SIZE(btrfs_csums); -} - struct btrfs_path *btrfs_alloc_path(void) { might_sleep(); @@ -226,22 +175,6 @@ noinline void btrfs_release_path(struct btrfs_path *p) } /* - * We want the transaction abort to print stack trace only for errors where the - * cause could be a bug, eg. due to ENOSPC, and not for common errors that are - * caused by external factors. - */ -bool __cold abort_should_print_stack(int error) -{ - switch (error) { - case -EIO: - case -EROFS: - case -ENOMEM: - return false; - } - return true; -} - -/* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the @@ -291,7 +224,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ - if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, &fs_info->dirty_cowonly_roots); else @@ -321,7 +254,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - trans->transid != root->last_trans); + trans->transid != btrfs_get_root_last_trans(root)); level = btrfs_header_level(buf); if (level == 0) @@ -417,7 +350,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, u64 refs; u64 owner; u64 flags; - u64 new_flags = 0; int ret; /* @@ -454,7 +386,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } } else { refs = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else @@ -462,19 +394,26 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } owner = btrfs_header_owner(buf); - BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) { + btrfs_crit(fs_info, +"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root), refs, flags); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } if (refs > 1) { - if ((owner == root->root_key.objectid || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + if ((owner == btrfs_root_id(root) || + btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); if (ret) return ret; - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); if (ret) return ret; @@ -482,26 +421,22 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + ret = btrfs_set_disk_extent_flags(trans, buf, + BTRFS_BLOCK_FLAG_FULL_BACKREF); + if (ret) + return ret; } else { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; } - if (new_flags != 0) { - ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); - if (ret) - return ret; - } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); @@ -554,7 +489,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - trans->transid != root->last_trans); + trans->transid != btrfs_get_root_last_trans(root)); level = btrfs_header_level(buf); @@ -563,13 +498,13 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { if (parent) parent_start = parent->start; reloc_src_root = btrfs_header_owner(buf); } cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, + btrfs_root_id(root), &disk_key, level, search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -582,60 +517,56 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else - btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_set_header_owner(cow, btrfs_root_id(root)); write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } } if (buf == root->node) { WARN_ON(parent && parent != buf); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); if (ret < 0) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } atomic_inc(&cow->refs); rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, btrfs_root_id(root), buf, - parent_start, last_ref); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, BTRFS_MOD_LOG_KEY_REPLACE); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } btrfs_set_node_blockptr(parent, parent_slot, cow->start); @@ -645,21 +576,30 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } } - btrfs_free_tree_block(trans, btrfs_root_id(root), buf, - parent_start, last_ref); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; + +error_unlock_cow: + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + return ret; } static inline int should_cow_block(struct btrfs_trans_handle *trans, @@ -685,7 +625,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; @@ -705,7 +645,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -746,12 +685,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); @@ -820,7 +755,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; @@ -986,9 +921,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used_bytes(root); - btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } return 0; } if (btrfs_header_nritems(mid) > @@ -1003,7 +942,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -1021,7 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); @@ -1056,10 +995,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } root_sub_used_bytes(root); - btrfs_free_tree_block(trans, btrfs_root_id(root), right, - 0, 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), + right, 0, 1); free_extent_buffer_stale(right); right = NULL; + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); @@ -1114,9 +1057,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } root_sub_used_bytes(root); - btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; @@ -1205,7 +1152,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (IS_ERR(left)) return PTR_ERR(left); - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -1265,7 +1212,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (IS_ERR(right)) return PTR_ERR(right); - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -1491,27 +1438,27 @@ static noinline void unlock_up(struct btrfs_path *path, int level, */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, + struct extent_buffer **eb_ret, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; - u64 gen; - struct extent_buffer *tmp; - int ret; + struct extent_buffer *tmp = NULL; + int ret = 0; int parent_level; - bool unlock_up; + int err; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false; - unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); - gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); check.has_first_key = true; check.level = parent_level - 1; - check.transid = gen; - check.owner_root = root->root_key.objectid; + check.transid = btrfs_node_ptr_generation(*eb_ret, slot); + check.owner_root = btrfs_root_id(root); /* * If we need to read an extent buffer from disk and we are holding locks @@ -1523,84 +1470,117 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, - parent_level - 1, &check.first_key, gen)) { - free_extent_buffer(tmp); - return -EUCLEAN; + if (btrfs_verify_level_key(tmp, &check)) { + ret = -EUCLEAN; + goto out; } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; } if (p->nowait) { - free_extent_buffer(tmp); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) - btrfs_unlock_up_safe(p, level + 1); - - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, &check); - if (ret) { - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; - } - if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { - free_extent_buffer(tmp); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return -EUCLEAN; + ret = -EAGAIN; + path_released = true; } - if (unlock_up) - ret = -EAGAIN; + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + } + if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } goto out; } else if (p->nowait) { - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) { - btrfs_unlock_up_safe(p, level + 1); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); ret = -EAGAIN; - } else { - ret = 0; } if (p->reada != READA_NONE) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, &check); + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return PTR_ERR(tmp); + path_released = true; } + + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + } + /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) + if (!extent_buffer_uptodate(tmp)) { ret = -EIO; + goto out; + } -out: if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - } else { - free_extent_buffer(tmp); - btrfs_release_path(p); + tmp = NULL; } +out: + if (tmp) { + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); + } + if (ret && !path_released) + btrfs_release_path(p); return ret; } @@ -1998,7 +1978,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info; struct extent_buffer *b; int slot; int ret; @@ -2011,6 +1991,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + if (!root) + return -EINVAL; + + fs_info = root->fs_info; might_sleep(); lowest_level = p->lowest_level; @@ -2185,8 +2169,8 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2312,8 +2296,8 @@ again: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2322,7 +2306,7 @@ again: level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -2552,8 +2536,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * */ static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, int level) + const struct btrfs_path *path, + const struct btrfs_disk_key *key, int level) { int i; struct extent_buffer *t; @@ -2582,7 +2566,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * that the new key won't break the order */ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2648,8 +2632,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, * is correct, we only need to bother the last key of @left and the first * key of @right. */ -static bool check_sibling_keys(struct extent_buffer *left, - struct extent_buffer *right) +static bool check_sibling_keys(const struct extent_buffer *left, + const struct extent_buffer *right) { struct btrfs_key left_last; struct btrfs_key right_first; @@ -2865,7 +2849,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &lower_key, level, root->node->start, 0, 0, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) @@ -2886,7 +2870,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); if (ret < 0) { - btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + int ret2; + + ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + if (ret2 < 0) + btrfs_abort_transaction(trans, ret2); btrfs_tree_unlock(c); free_extent_buffer(c); return ret; @@ -2912,8 +2900,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static int insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { struct extent_buffer *lower; @@ -3009,7 +2997,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &disk_key, level, c->start, 0, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) @@ -3267,7 +3255,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return PTR_ERR(right); - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); free_space = btrfs_leaf_free_space(right); if (free_space < data_size) @@ -3483,7 +3471,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return PTR_ERR(left); - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { @@ -3761,7 +3749,7 @@ again: * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just * use BTRFS_NESTING_NEW_ROOT. */ - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &disk_key, 0, l->start, 0, 0, num_doubles ? BTRFS_NESTING_NEW_ROOT : BTRFS_NESTING_SPLIT); @@ -3847,6 +3835,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) @@ -4003,7 +3992,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * the front. */ void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end) + const struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4095,7 +4084,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * make the item pointed to by the path bigger, data_size is the added size. */ void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size) + const struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; @@ -4280,6 +4269,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, /* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. + * + * Returns: 0 on success + * -EEXIST if the first key already exists + * < 0 on other errors */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4451,9 +4444,12 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used_bytes(root); atomic_inc(&leaf->refs); - btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); - return 0; + if (ret < 0) + btrfs_abort_transaction(trans, ret); + + return ret; } /* * delete the item at the leaf level in path. If that empties @@ -4907,8 +4903,7 @@ again: } next = c; - ret = read_block_for_search(root, path, &next, level, - slot, &key); + ret = read_block_for_search(root, path, &next, slot, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -4951,8 +4946,7 @@ again: if (!level) break; - ret = read_block_for_search(root, path, &next, level, - 0, &key); + ret = read_block_for_search(root, path, &next, 0, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -5082,9 +5076,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root, int __init btrfs_ctree_init(void) { - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); if (!btrfs_path_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 70e828d33177..1096a80a64e7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,26 +6,25 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include <linux/pagemap.h> +#include "linux/cleanup.h" +#include <linux/spinlock.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/atomic.h> +#include <linux/xarray.h> +#include <linux/refcount.h> +#include <uapi/linux/btrfs_tree.h> #include "locking.h" #include "fs.h" #include "accessors.h" +#include "extent-io-tree.h" +struct extent_buffer; +struct btrfs_block_rsv; struct btrfs_trans_handle; -struct btrfs_transaction; -struct btrfs_pending_snapshot; -struct btrfs_delayed_ref_root; -struct btrfs_space_info; struct btrfs_block_group; -struct btrfs_ordered_sum; -struct btrfs_ref; -struct btrfs_bio; -struct btrfs_ioctl_encoded_io_args; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; -struct reloc_control; /* Read ahead values for struct btrfs_path.reada */ enum { @@ -85,6 +84,9 @@ struct btrfs_path { unsigned int nowait:1; }; +#define BTRFS_PATH_AUTO_FREE(path_name) \ + struct btrfs_path *path_name __free(btrfs_free_path) = NULL + /* * The state of btrfs root */ @@ -222,9 +224,11 @@ struct btrfs_root { struct list_head root_list; - spinlock_t inode_lock; - /* red-black tree that keeps track of in-memory inodes */ - struct rb_root inode_tree; + /* + * Xarray that keeps track of in-memory inodes, protected by the lock + * @inode_lock. + */ + struct xarray inodes; /* * Xarray that keeps track of delayed nodes of every inode, protected @@ -355,6 +359,35 @@ static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int c WRITE_ONCE(root->last_log_commit, commit_id); } +static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root) +{ + return READ_ONCE(root->last_trans); +} + +static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid) +{ + WRITE_ONCE(root->last_trans, transid); +} + +/* + * Return the generation this root started with. + * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. @@ -448,6 +481,8 @@ struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; + /* Task that allocated this structure. */ + struct task_struct *owner_task; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) @@ -470,21 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ - ((bytes) >> (fs_info)->sectorsize_bits) - -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) -{ - return mapping_gfp_constraint(mapping, ~__GFP_FS); -} - -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end); -int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); -int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - -/* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); @@ -528,7 +548,7 @@ int btrfs_previous_item(struct btrfs_root *root, int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -562,9 +582,9 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size); + const struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end); + const struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -588,6 +608,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); +DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); @@ -720,23 +741,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } -u16 btrfs_csum_type_size(u16 type); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - -/* - * We use page status Private2 to indicate there is an ordered extent with - * unfinished IO. - * - * Rename the Private2 accessors to Ordered, to improve readability. - */ -#define PageOrdered(page) PagePrivate2(page) -#define SetPageOrdered(page) SetPagePrivate2(page) -#define ClearPageOrdered(page) ClearPagePrivate2(page) -#define folio_test_ordered(folio) folio_test_private_2(folio) -#define folio_set_ordered(folio) folio_set_private_2(folio) -#define folio_clear_ordered(folio) folio_clear_private_2(folio) - #endif diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5b0b64571418..968dae953948 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include "ctree.h" #include "disk-io.h" -#include "print-tree.h" #include "transaction.h" #include "locking.h" #include "accessors.h" @@ -46,8 +45,8 @@ struct inode_defrag { u32 extent_thresh; }; -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) +static int compare_inode_defrag(const struct inode_defrag *defrag1, + const struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; @@ -62,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, } /* - * Pop a record for an inode into the defrag tree. The lock must be held + * Insert a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. - * - * If an existing record is found the defrag item you pass in is freed. */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) +static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; @@ -84,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(defrag, entry); + ret = compare_inode_defrag(defrag, entry); if (ret < 0) p = &parent->rb_left; else if (ret > 0) @@ -108,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; @@ -120,35 +117,29 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) } /* - * Insert a defrag record for this inode if auto defrag is enabled. + * Insert a defrag record for this inode if auto defrag is enabled. No errors + * returned as they're not considered fatal. */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; - u64 transid; int ret; - if (!__need_auto_defrag(fs_info)) - return 0; + if (!need_auto_defrag(fs_info)) + return; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = inode->root->last_trans; + return; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) - return -ENOMEM; + return; defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; + defrag->transid = btrfs_get_root_last_trans(root); + defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); @@ -158,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. At the case, we may find the existed defrag. */ - ret = __btrfs_add_inode_defrag(inode, defrag); + ret = btrfs_insert_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); - return 0; } /* @@ -190,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(&tmp, entry); + ret = compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) @@ -199,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( goto out; } - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -215,27 +205,24 @@ out: void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { - struct inode_defrag *defrag; - struct rb_node *node; + struct inode_defrag *defrag, *next; spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); + + rbtree_postorder_for_each_entry_safe(defrag, next, + &fs_info->defrag_inodes, rb_node) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - cond_resched_lock(&fs_info->defrag_inodes_lock); + fs_info->defrag_inodes = RB_ROOT; - node = rb_first(&fs_info->defrag_inodes); - } spin_unlock(&fs_info->defrag_inodes_lock); } #define BTRFS_DEFRAG_BATCH 1024 -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) +static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag, + struct file_ra_state *ra) { struct btrfs_root *inode_root; struct inode *inode; @@ -246,7 +233,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ @@ -256,7 +243,7 @@ again: goto cleanup; } - inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); + inode = btrfs_iget(defrag->ino, inode_root); btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -274,9 +261,10 @@ again: range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; + file_ra_state_init(ra, inode->i_mapping); sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); iput(inode); @@ -303,11 +291,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while (1) { + struct file_ra_state ra = { 0 }; + /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) break; /* find an inode to defrag */ @@ -325,7 +315,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - __btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag, &ra); } atomic_dec(&fs_info->defrag_running); @@ -521,7 +511,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, * keep_locks set and lowest_level is 1, regardless of the value of * path->slots[1]. */ - BUG_ON(path->locks[1] == 0); + ASSERT(path->locks[1] != 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, @@ -708,8 +698,10 @@ iterate: */ if (key.offset > start) { em->start = start; - em->orig_start = start; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; + em->disk_num_bytes = 0; + em->ram_bytes = 0; + em->offset = 0; em->len = key.offset - start; break; } @@ -771,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * We can get a merged extent, in that case, we need to re-search * tree to get the original em for defrag. * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. + * This is because even if we have adjacent extents that are contiguous + * and compatible (same type and flags), we still want to defrag them + * so that we use less metadata (extent items in the extent tree and + * file extent items in the inode's subvolume tree). */ - if (em && (em->flags & EXTENT_FLAG_MERGED) && - newer_than && em->generation >= newer_than) { + if (em && (em->flags & EXTENT_FLAG_MERGED)) { free_extent_map(em); em = NULL; } @@ -810,7 +802,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *next; bool ret = false; @@ -826,7 +818,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, */ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); /* No more em or hole */ - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) goto out; if (next->flags & EXTENT_FLAG_PREALLOC) goto out; @@ -861,20 +853,21 @@ out: * NOTE: Caller should also wait for page writeback after the cluster is * prepared, here we don't do writeback wait for each page. */ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); u64 page_start = (u64)index << PAGE_SHIFT; u64 page_end = page_start + PAGE_SIZE - 1; struct extent_state *cached_state = NULL; - struct page *page; + struct folio *folio; int ret; again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return folio; /* * Since we can defragment files opened read-only, we can encounter @@ -884,16 +877,16 @@ again: * executables that explicitly enable them, so this isn't very * restrictive. */ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); + if (folio_test_large(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ETXTBSY); } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ERR_PTR(ret); } @@ -908,17 +901,17 @@ again: if (!ordered) break; - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); - lock_page(page); + folio_lock(folio); /* - * We unlocked the page above, so we need check if it was + * We unlocked the folio above, so we need check if it was * released or not. */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } } @@ -927,21 +920,21 @@ again: * Now the page range has no ordered extent any more. Read the page to * make it uptodate. */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } } - return page; + return folio; } struct defrag_target_range { @@ -992,12 +985,12 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * This is for users who want to convert inline extents to * regular ones through max_inline= mount option. */ - if (em->block_start == EXTENT_MAP_INLINE && + if (em->disk_bytenr == EXTENT_MAP_INLINE && em->len <= inode->root->fs_info->max_inline) goto next; /* Skip holes and preallocated extents. */ - if (em->block_start == EXTENT_MAP_HOLE || + if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) goto next; @@ -1062,7 +1055,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * So if an inline extent passed all above checks, just add it * for defrag, and be converted to regular extents. */ - if (em->block_start == EXTENT_MAP_INLINE) + if (em->disk_bytenr == EXTENT_MAP_INLINE) goto add; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, @@ -1162,7 +1155,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, - struct page **pages, int nr_pages, + struct folio **folios, int nr_pages, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1171,7 +1164,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, const u64 len = target->len; unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); + unsigned long first_index = folios[0]->index; int ret = 0; int i; @@ -1188,8 +1181,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len); + folio_clear_checked(folios[i]); + btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1205,7 +1198,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); - struct page **pages; + struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; u64 last_index = (start + len - 1) >> PAGE_SHIFT; u64 start_index = start >> PAGE_SHIFT; @@ -1216,21 +1209,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) + folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS); + if (!folios) return -ENOMEM; /* Prepare all pages */ for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; + folios[i] = defrag_prepare_one_folio(inode, start_index + i); + if (IS_ERR(folios[i])) { + ret = PTR_ERR(folios[i]); + nr_pages = i; + goto free_folios; } } for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); + folio_wait_writeback(folios[i]); /* Lock the pages range */ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, @@ -1250,7 +1243,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, goto unlock_extent; list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + ret = defrag_one_locked_target(inode, entry, folios, nr_pages, &cached_state); if (ret < 0) break; @@ -1264,14 +1257,12 @@ unlock_extent: unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, &cached_state); -free_pages: +free_folios: for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folios[i]); + folio_put(folios[i]); } - kfree(pages); + kfree(folios); return ret; } @@ -1317,8 +1308,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, if (entry->start + range_len <= *last_scanned_ret) continue; - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, + page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, ((entry->start + range_len - 1) >> PAGE_SHIFT) - (entry->start >> PAGE_SHIFT) + 1); @@ -1350,7 +1340,7 @@ out: * Entry point to file defragmentation. * * @inode: inode to be defragged - * @ra: readahead state (can be NUL) + * @ra: readahead state * @range: defrag options including range and flags * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode @@ -1366,18 +1356,19 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); - bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; + ASSERT(ra); + if (isize == 0) return 0; @@ -1407,18 +1398,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, last_byte = round_up(last_byte, fs_info->sectorsize) - 1; /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - - /* * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially. */ @@ -1472,8 +1451,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cond_resched(); } - if (ra_allocated) - kfree(ra); /* * Update range.start for autodefrag, this will indicate where to start * in next run. @@ -1512,9 +1489,7 @@ void __cold btrfs_auto_defrag_exit(void) int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct inode_defrag), 0, 0, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5a62763528d1..6b7596c4f0dc 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -3,13 +3,22 @@ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H +#include <linux/types.h> +#include <linux/compiler_types.h> + +struct inode; +struct file_ra_state; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ioctl_defrag_range_args; + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index acf9f4b6c044..88e900e5a43d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -6,9 +6,7 @@ #include "block-rsv.h" #include "btrfs_inode.h" #include "space-info.h" -#include "transaction.h" #include "qgroup.h" -#include "block-group.h" #include "fs.h" /* @@ -113,7 +111,7 @@ * making error handling and cleanup easier. */ -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) +int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -178,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo, len); } /* diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index c5d573f2366e..3f32953c0a80 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -3,9 +3,13 @@ #ifndef BTRFS_DELALLOC_SPACE_H #define BTRFS_DELALLOC_SPACE_H +#include <linux/types.h> + struct extent_changeset; +struct btrfs_inode; +struct btrfs_fs_info; -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len, bool noflush); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 08102883f560..0b4933c6a889 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("btrfs_delayed_node", - sizeof(struct btrfs_delayed_node), - 0, - SLAB_MEM_SPREAD, - NULL); + delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0); if (!delayed_node_cache) return -ENOMEM; return 0; @@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void) kmem_cache_destroy(delayed_node_cache); } +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + static inline void btrfs_init_delayed_node( struct btrfs_delayed_node *delayed_node, struct btrfs_root *root, u64 inode_id) @@ -70,14 +77,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( return node; } - spin_lock(&root->inode_lock); + xa_lock(&root->delayed_nodes); node = xa_load(&root->delayed_nodes, ino); if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ BUG_ON(btrfs_inode->delayed_node != node); - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return node; } @@ -104,10 +111,10 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = NULL; } - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return node; } - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return NULL; } @@ -141,21 +148,21 @@ again: kmem_cache_free(delayed_node_cache, node); return ERR_PTR(-ENOMEM); } - spin_lock(&root->inode_lock); + xa_lock(&root->delayed_nodes); ptr = xa_load(&root->delayed_nodes, ino); if (ptr) { /* Somebody inserted it, go back and read it. */ - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); kmem_cache_free(delayed_node_cache, node); node = NULL; goto again; } - ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC); + ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC); ASSERT(xa_err(ptr) != -EINVAL); ASSERT(xa_err(ptr) != -ENOMEM); ASSERT(ptr == NULL); btrfs_inode->delayed_node = node; - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return node; } @@ -268,14 +275,12 @@ static void __btrfs_release_delayed_node( if (refcount_dec_and_test(&delayed_node->refs)) { struct btrfs_root *root = delayed_node->root; - spin_lock(&root->inode_lock); + xa_erase(&root->delayed_nodes, delayed_node->inode_id); /* * Once our refcount goes to zero, nobody is allowed to bump it * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - xa_erase(&root->delayed_nodes, delayed_node->inode_id); - spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } } @@ -361,40 +366,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( return NULL; } +static int btrfs_delayed_item_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_delayed_item *new_item = + rb_entry(new, struct btrfs_delayed_item, rb_node); + const struct btrfs_delayed_item *exist_item = + rb_entry(exist, struct btrfs_delayed_item, rb_node); + + if (new_item->index < exist_item->index) + return -1; + if (new_item->index > exist_item->index) + return 1; + return 0; +} + static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins) { - struct rb_node **p, *node; - struct rb_node *parent_node = NULL; struct rb_root_cached *root; - struct btrfs_delayed_item *item; - bool leftmost = true; + struct rb_node *exist; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else root = &delayed_node->del_root; - p = &root->rb_root.rb_node; - node = &ins->rb_node; - - while (*p) { - parent_node = *p; - item = rb_entry(parent_node, struct btrfs_delayed_item, - rb_node); - - if (item->index < ins->index) { - p = &(*p)->rb_right; - leftmost = false; - } else if (item->index > ins->index) { - p = &(*p)->rb_left; - } else { - return -EEXIST; - } - } - - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp); + if (exist) + return -EEXIST; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && ins->index >= delayed_node->index_cnt) @@ -430,8 +430,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) delayed_root = delayed_node->root->fs_info->delayed_root; - BUG_ON(!delayed_root); - if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else @@ -980,7 +978,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) if (delayed_node && test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - BUG_ON(!delayed_node->root); + ASSERT(delayed_node->root); clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; @@ -1035,7 +1033,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; @@ -1128,6 +1125,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_record_root_in_trans(trans, node->root); + if (ret) + return ret; ret = btrfs_update_delayed_inode(trans, node->root, path, node); return ret; } @@ -1463,7 +1463,7 @@ static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 flags, + const struct btrfs_disk_key *disk_key, u8 flags, u64 index) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1555,8 +1555,7 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node, +static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, u64 index) { struct btrfs_delayed_item *item; @@ -1614,7 +1613,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); + ret = btrfs_delete_delayed_insertion_item(node, index); if (!ret) goto end; @@ -1643,7 +1642,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, node->root->root_key.objectid, + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_delayed_item_release_metadata(dir->root, item); btrfs_release_delayed_item(item); @@ -1676,7 +1675,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) return 0; } -bool btrfs_readdir_get_delayed_items(struct inode *inode, +bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, u64 last_index, struct list_head *ins_list, struct list_head *del_list) @@ -1684,7 +1683,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return false; @@ -1692,8 +1691,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); - btrfs_inode_lock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1724,7 +1723,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, return true; } -void btrfs_readdir_put_delayed_items(struct inode *inode, +void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, struct list_head *ins_list, struct list_head *del_list) { @@ -1746,10 +1745,10 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, * The VFS is going to do up_read(), so we need to downgrade back to a * read lock. */ - downgrade_write(&inode->i_rwsem); + downgrade_write(&inode->vfs_inode.i_rwsem); } -int btrfs_should_delete_dir_index(struct list_head *del_list, +int btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index) { struct btrfs_delayed_item *curr; @@ -1770,7 +1769,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * Read dir info stored in the delayed tree. */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list) + const struct list_head *ins_list) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1908,7 +1907,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -2049,9 +2049,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) struct btrfs_delayed_node *node; int count; - spin_lock(&root->inode_lock); + xa_lock(&root->delayed_nodes); if (xa_empty(&root->delayed_nodes)) { - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return; } @@ -2068,7 +2068,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) if (count >= ARRAY_SIZE(delayed_nodes)) break; } - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); index++; for (int i = 0; i < count; i++) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 5cceb31bbd16..f4d9feac0d0e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -7,15 +7,23 @@ #ifndef BTRFS_DELAYED_INODE_H #define BTRFS_DELAYED_INODE_H +#include <linux/types.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> +#include <linux/fs.h> #include <linux/atomic.h> #include <linux/refcount.h> #include "ctree.h" +struct btrfs_disk_key; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_trans_handle; + enum btrfs_delayed_item_type { BTRFS_DELAYED_INSERTION_ITEM, BTRFS_DELAYED_DELETION_ITEM @@ -56,9 +64,9 @@ struct btrfs_delayed_node { struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; + int count; u64 index_cnt; unsigned long flags; - int count; /* * The size of the next batch of dir index items to insert (if this * node is from a directory inode). Protected by @mutex. @@ -98,22 +106,11 @@ struct btrfs_delayed_item { char data[] __counted_by(data_len); }; -static inline void btrfs_init_delayed_root( - struct btrfs_delayed_root *delayed_root) -{ - atomic_set(&delayed_root->items, 0); - atomic_set(&delayed_root->items_seq, 0); - delayed_root->nodes = 0; - spin_lock_init(&delayed_root->lock); - init_waitqueue_head(&delayed_root->wait); - INIT_LIST_HEAD(&delayed_root->node_list); - INIT_LIST_HEAD(&delayed_root->prepare_list); -} - +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root); int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 flags, + const struct btrfs_disk_key *disk_key, u8 flags, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, @@ -146,17 +143,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info); /* Used for readdir() */ -bool btrfs_readdir_get_delayed_items(struct inode *inode, +bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, u64 last_index, struct list_head *ins_list, struct list_head *del_list); -void btrfs_readdir_put_delayed_items(struct inode *inode, +void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, struct list_head *ins_list, struct list_head *del_list); -int btrfs_should_delete_dir_index(struct list_head *del_list, +int btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list); + const struct list_head *ins_list); /* Used during directory logging. */ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 891ea2fa263c..98c5b61dabe8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -9,6 +9,7 @@ #include "messages.h" #include "ctree.h" #include "delayed-ref.h" +#include "extent-tree.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" @@ -16,8 +17,7 @@ #include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; -struct kmem_cache *btrfs_delayed_tree_ref_cachep; -struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_ref_node_cachep; struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees @@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) u64 num_bytes; u64 reserved_bytes; + if (btrfs_is_testing(fs_info)) + return; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); @@ -196,48 +199,6 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) } /* - * Transfer bytes to our delayed refs rsv. - * - * @fs_info: the filesystem - * @num_bytes: number of bytes to transfer - * - * This transfers up to the num_bytes amount, previously reserved, to the - * delayed_refs_rsv. Any extra bytes are returned to the space info. - */ -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - u64 num_bytes) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - u64 to_free = 0; - - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { - u64 delta = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - if (num_bytes > delta) { - to_free = num_bytes - delta; - num_bytes = delta; - } - } else { - to_free = num_bytes; - num_bytes = 0; - } - - if (num_bytes) - delayed_refs_rsv->reserved += num_bytes; - if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = true; - spin_unlock(&delayed_refs_rsv->lock); - - if (num_bytes) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - if (to_free) - btrfs_space_info_free_bytes_may_use(fs_info, - delayed_refs_rsv->space_info, to_free); -} - -/* * Refill based on our delayed refs usage. * * @fs_info: the filesystem @@ -296,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); if (to_free > 0) - btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -305,55 +266,24 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, } /* - * compare two delayed tree backrefs with same bytenr and type - */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, - struct btrfs_delayed_tree_ref *ref2) -{ - if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } - return 0; -} - -/* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, - struct btrfs_delayed_data_ref *ref2) +static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2) { - if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - if (ref1->objectid < ref2->objectid) - return -1; - if (ref1->objectid > ref2->objectid) - return 1; - if (ref1->offset < ref2->offset) - return -1; - if (ref1->offset > ref2->offset) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } + if (ref1->data_ref.objectid < ref2->data_ref.objectid) + return -1; + if (ref1->data_ref.objectid > ref2->data_ref.objectid) + return 1; + if (ref1->data_ref.offset < ref2->data_ref.offset) + return -1; + if (ref1->data_ref.offset > ref2->data_ref.offset) + return 1; return 0; } -static int comp_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2, +static int comp_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 0; @@ -362,13 +292,20 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return -1; if (ref1->type > ref2->type) return 1; - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), - btrfs_delayed_node_to_tree_ref(ref2)); - else - ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), - btrfs_delayed_node_to_data_ref(ref2)); + if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } else { + if (ref1->ref_root < ref2->ref_root) + return -1; + if (ref1->ref_root > ref2->ref_root) + return 1; + if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) + ret = comp_data_refs(ref1, ref2); + } if (ret) return ret; if (check_seq) { @@ -380,142 +317,57 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -/* insert a new ref to head ref rbtree */ -static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, - struct rb_node *node) +static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) { - struct rb_node **p = &root->rb_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_head *entry; - struct btrfs_delayed_ref_head *ins; - u64 bytenr; - bool leftmost = true; - - ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->bytenr; - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, - href_node); - - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } + const struct btrfs_delayed_ref_node *new_node = + rb_entry(new, struct btrfs_delayed_ref_node, ref_node); + const struct btrfs_delayed_ref_node *exist_node = + rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); - return NULL; + return comp_refs(new_node, exist_node, true); } static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - bool leftmost = true; - - while (*p) { - int comp; - - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - ref_node); - comp = comp_refs(ins, entry, true); - if (comp < 0) { - p = &(*p)->rb_left; - } else if (comp > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } + struct rb_node *exist; - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(node, root, cmp_refs_node); + if (exist) + return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); return NULL; } static struct btrfs_delayed_ref_head *find_first_ref_head( struct btrfs_delayed_ref_root *dr) { - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; - - n = rb_first_cached(&dr->href_root); - if (!n) - return NULL; + unsigned long from = 0; - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + lockdep_assert_held(&dr->lock); - return entry; + return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT); } -/* - * Find a head entry based on bytenr. This returns the delayed ref head if it - * was able to find one, or NULL if nothing was in that spot. If return_bigger - * is given, the next bigger entry is returned if no exact match is found. - */ -static struct btrfs_delayed_ref_head *find_ref_head( - struct btrfs_delayed_ref_root *dr, u64 bytenr, - bool return_bigger) -{ - struct rb_root *root = &dr->href_root.rb_root; - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; - - n = root->rb_node; - entry = NULL; - while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return entry; - } - if (entry && return_bigger) { - if (bytenr > entry->bytenr) { - n = rb_next(&entry->href_node); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_delayed_ref_head, - href_node); - } - return entry; - } - return NULL; -} - -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) +static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) - return 0; + return true; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_NODE(&head->href_node)) { + if (!head->tracked) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); - return -EAGAIN; + return false; } btrfs_put_delayed_ref_head(head); - return 0; + return true; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, @@ -529,7 +381,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, if (!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); - atomic_dec(&delayed_refs->num_entries); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } @@ -625,33 +476,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; + unsigned long start_index; + unsigned long found_index; + bool found_head = false; + bool locked; - lockdep_assert_held(&delayed_refs->lock); + spin_lock(&delayed_refs->lock); again: - head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, - true); - if (!head && delayed_refs->run_delayed_start != 0) { - delayed_refs->run_delayed_start = 0; - head = find_first_ref_head(delayed_refs); + start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits); + xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) { + if (!head->processing) { + found_head = true; + break; + } } - if (!head) - return NULL; - - while (head->processing) { - struct rb_node *node; - - node = rb_next(&head->href_node); - if (!node) { - if (delayed_refs->run_delayed_start == 0) - return NULL; - delayed_refs->run_delayed_start = 0; - goto again; + if (!found_head) { + if (delayed_refs->run_delayed_start == 0) { + spin_unlock(&delayed_refs->lock); + return NULL; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); + delayed_refs->run_delayed_start = 0; + goto again; } head->processing = true; @@ -659,23 +508,73 @@ again: delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; + + locked = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (!locked) + return ERR_PTR(-EAGAIN); + return head; } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = false; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { + const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); lockdep_assert_held(&head->lock); - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); + xa_erase(&delayed_refs->head_refs, index); + head->tracked = false; delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; } +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + + lockdep_assert_held(&head->mutex); + lockdep_assert_held(&head->lock); + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; + + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); + + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} + /* * Helper to insert the ref_node to the tail or merge with tail. * @@ -696,7 +595,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; @@ -716,7 +614,7 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, &href->ref_add_list); else if (ref->action == BTRFS_DROP_DELAYED_REF) { ASSERT(!list_empty(&exist->add_list)); - list_del(&exist->add_list); + list_del_init(&exist->add_list); } else { ASSERT(0); } @@ -828,18 +726,20 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, } static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, + struct btrfs_ref *generic_ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, u64 ref_root, - u64 reserved, int action, bool is_data, - bool is_system, u64 owning_root) + u64 reserved) { int count_mod = 1; bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ - BUG_ON(!is_data && reserved); + BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved); - switch (action) { + switch (generic_ref->action) { + case BTRFS_ADD_DELAYED_REF: + /* count_mod is already set to 1. */ + break; case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; break; @@ -868,29 +768,34 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, } refcount_set(&head_ref->refs, 1); - head_ref->bytenr = bytenr; - head_ref->num_bytes = num_bytes; + head_ref->bytenr = generic_ref->bytenr; + head_ref->num_bytes = generic_ref->num_bytes; head_ref->ref_mod = count_mod; head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; - head_ref->owning_root = owning_root; - head_ref->is_data = is_data; - head_ref->is_system = is_system; + head_ref->owning_root = generic_ref->owning_root; + head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA); + head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); - RB_CLEAR_NODE(&head_ref->href_node); + head_ref->tracked = false; head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); + /* If not metadata set an impossible level to help debugging. */ + if (generic_ref->type == BTRFS_REF_METADATA) + head_ref->level = generic_ref->tree_ref.level; + else + head_ref->level = U8_MAX; + if (qrecord) { - if (ref_root && reserved) { + if (generic_ref->ref_root && reserved) { qrecord->data_rsv = reserved; - qrecord->data_rsv_refroot = ref_root; + qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; + qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } } @@ -899,6 +804,8 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, * helper function to actually insert a head node into the rbtree. * this does all the dirty work in terms of maintaining the correct * overall modification count. + * + * Returns an error pointer in case of an error. */ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, @@ -906,25 +813,48 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; + const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; + lockdep_assert_held(&delayed_refs->lock); + +#if BITS_PER_LONG == 32 + if (head_ref->bytenr >= MAX_LFS_FILESIZE) { + if (qrecord) + xa_release(&delayed_refs->dirty_extents, index); + btrfs_err_rl(fs_info, +"delayed ref head %llu is beyond 32bit page cache and xarray index limit", + head_ref->bytenr); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } +#endif /* Record qgroup extent info if provided */ if (qrecord) { - if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord)) + int ret; + + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, + head_ref->bytenr); + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, index); + /* Caller responsible for freeing qrecord on error. */ + if (ret < 0) + return ERR_PTR(ret); kfree(qrecord); - else + } else { qrecord_inserted = true; + } } - trace_add_delayed_ref_head(trans->fs_info, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); - existing = htree_insert(&delayed_refs->href_root, - &head_ref->href_node); + existing = xa_load(&delayed_refs->head_refs, index); if (existing) { update_existing_head_ref(trans, existing, head_ref); /* @@ -934,6 +864,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC); + if (xa_is_err(existing)) { + /* Memory was preallocated by the caller. */ + ASSERT(xa_err(existing) != -ENOMEM); + return ERR_PTR(xa_err(existing)); + } else if (WARN_ON(existing)) { + /* + * Shouldn't happen we just did a lookup before under + * delayed_refs->lock. + */ + return ERR_PTR(-EEXIST); + } + head_ref->tracked = true; /* * We reserve the amount of bytes needed to delete csums when * adding the ref head and not when adding individual drop refs @@ -943,12 +886,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += - btrfs_csum_bytes_to_leaves(trans->fs_info, - head_ref->num_bytes); + btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - atomic_inc(&delayed_refs->num_entries); } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -982,102 +923,140 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, */ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 ref_root, - int action, u8 ref_type) + struct btrfs_ref *generic_ref) { + int action = generic_ref->action; u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(ref_root)) + if (is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; + ref->bytenr = generic_ref->bytenr; + ref->num_bytes = generic_ref->num_bytes; ref->ref_mod = 1; ref->action = action; ref->seq = seq; - ref->type = ref_type; + ref->type = btrfs_ref_type(generic_ref); + ref->ref_root = generic_ref->ref_root; + ref->parent = generic_ref->parent; RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); + + if (generic_ref->type == BTRFS_REF_DATA) + ref->data_ref = generic_ref->data_ref; + else + ref->tree_ref = generic_ref->tree_ref; } -/* - * add a delayed tree ref. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. - */ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op) +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, + bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: generic_ref->ref_root; +#endif + generic_ref->tree_ref.level = level; + generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + +} + +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, + u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: generic_ref->ref_root; +#endif + generic_ref->data_ref.objectid = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; +} + +static int add_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op, + u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *new_head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; + const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits); + bool qrecord_reserved = false; bool qrecord_inserted; - bool is_system; - bool merged; int action = generic_ref->action; - int level = generic_ref->tree_ref.level; - u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; - u64 parent = generic_ref->parent; - u8 ref_type; - - is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID); + bool merged; + int ret; - ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); - if (!ref) + node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS); + if (!node) return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + ret = -ENOMEM; + goto free_node; } + delayed_refs = &trans->transaction->delayed_refs; + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); - return -ENOMEM; + ret = -ENOMEM; + goto free_head_ref; + } + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { + ret = -ENOMEM; + goto free_record; } + qrecord_reserved = true; } - if (parent) - ref_type = BTRFS_SHARED_BLOCK_REF_KEY; - else - ref_type = BTRFS_TREE_BLOCK_REF_KEY; - - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.ref_root, action, - ref_type); - ref->root = generic_ref->tree_ref.ref_root; - ref->parent = parent; - ref->level = level; - - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.ref_root, 0, action, - false, is_system, generic_ref->owning_root); + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + if (qrecord_reserved) + xa_release(&delayed_refs->dirty_extents, index); + goto free_record; + } + + init_delayed_ref_common(fs_info, node, generic_ref); + init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); /* * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); + new_head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + if (IS_ERR(new_head_ref)) { + xa_release(&delayed_refs->head_refs, index); + spin_unlock(&delayed_refs->lock); + ret = PTR_ERR(new_head_ref); + goto free_record; + } + head_ref = new_head_ref; - merged = insert_delayed_ref(trans, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, node); spin_unlock(&delayed_refs->lock); /* @@ -1086,16 +1065,36 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, */ btrfs_update_delayed_refs_rsv(trans); - trace_add_delayed_tree_ref(fs_info, &ref->node, ref, - action == BTRFS_ADD_DELAYED_EXTENT ? - BTRFS_ADD_DELAYED_REF : action); + if (generic_ref->type == BTRFS_REF_DATA) + trace_add_delayed_data_ref(trans->fs_info, node); + else + trace_add_delayed_tree_ref(trans->fs_info, node); if (merged) - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - btrfs_qgroup_trace_extent_post(trans, record); - + return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; + +free_record: + kfree(record); +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_node: + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); + return ret; +} + +/* + * Add a delayed tree ref. This does all of the accounting required to make sure + * the delayed ref is eventually processed before this transaction commits. + */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op) +{ + ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); + return add_delayed_ref(trans, generic_ref, extent_op, 0); } /* @@ -1105,111 +1104,51 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_data_ref *ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *record = NULL; - bool qrecord_inserted; - int action = generic_ref->action; - bool merged; - u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; - u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.ref_root; - u64 owner = generic_ref->data_ref.ino; - u64 offset = generic_ref->data_ref.offset; - u8 ref_type; - - ASSERT(generic_ref->type == BTRFS_REF_DATA && action); - ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); - if (!ref) - return -ENOMEM; - - if (parent) - ref_type = BTRFS_SHARED_DATA_REF_KEY; - else - ref_type = BTRFS_EXTENT_DATA_REF_KEY; - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - ref_root, action, ref_type); - ref->root = ref_root; - ref->parent = parent; - ref->objectid = owner; - ref->offset = offset; - - - head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - return -ENOMEM; - } - - if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { - record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) { - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - kmem_cache_free(btrfs_delayed_ref_head_cachep, - head_ref); - return -ENOMEM; - } - } - - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, action, true, false, generic_ref->owning_root); - head_ref->extent_op = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); - - merged = insert_delayed_ref(trans, head_ref, &ref->node); - spin_unlock(&delayed_refs->lock); - - /* - * Need to update the delayed_refs_rsv with any changes we may have - * made. - */ - btrfs_update_delayed_refs_rsv(trans); - - trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, - action == BTRFS_ADD_DELAYED_EXTENT ? - BTRFS_ADD_DELAYED_REF : action); - if (merged) - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - - - if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); - return 0; + ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action); + return add_delayed_ref(trans, generic_ref, NULL, reserved); } int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, + u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op) { + const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits); struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *head_ref_ret; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_ref generic_ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_UPDATE_DELAYED_HEAD, + .bytenr = bytenr, + .num_bytes = num_bytes, + .tree_ref.level = level, + }; + int ret; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; - init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, false, false, 0); + init_delayed_ref_head(head_ref, &generic_ref, NULL, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, - NULL); + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return ret; + } + spin_lock(&delayed_refs->lock); + head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, + BTRFS_UPDATE_DELAYED_HEAD, NULL); + if (IS_ERR(head_ref_ret)) { + xa_release(&delayed_refs->head_refs, index); + spin_unlock(&delayed_refs->lock); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return PTR_ERR(head_ref_ret); + } spin_unlock(&delayed_refs->lock); /* @@ -1220,53 +1159,193 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return 0; } +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + if (refcount_dec_and_test(&ref->refs)) { + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); + kmem_cache_free(btrfs_delayed_ref_node_cachep, ref); + } +} + /* * This does a simple search for the head node for a given extent. Returns the * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 bytenr) { + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); - return find_ref_head(delayed_refs, bytenr, false); + return xa_load(&delayed_refs->head_refs, index); +} + +static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) +{ + int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; + + if (type < entry->type) + return -1; + if (type > entry->type) + return 1; + + if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (root < entry->ref_root) + return -1; + if (root > entry->ref_root) + return 1; + } else { + if (parent < entry->parent) + return -1; + if (parent > entry->parent) + return 1; + } + return 0; +} + +/* + * Check to see if a given root/parent reference is attached to the head. This + * only checks for BTRFS_ADD_DELAYED_REF references that match, as that + * indicates the reference exists for the given root or parent. This is for + * tree blocks only. + * + * @head: the head of the bytenr we're searching. + * @root: the root objectid of the reference if it is a normal reference. + * @parent: the parent if this is a shared backref. + */ +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent) +{ + struct rb_node *node; + bool found = false; + + lockdep_assert_held(&head->mutex); + + spin_lock(&head->lock); + node = head->ref_tree.rb_root.rb_node; + while (node) { + struct btrfs_delayed_ref_node *entry; + int ret; + + entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + ret = find_comp(entry, root, parent); + if (ret < 0) { + node = node->rb_left; + } else if (ret > 0) { + node = node->rb_right; + } else { + /* + * We only want to count ADD actions, as drops mean the + * ref doesn't exist. + */ + if (entry->action == BTRFS_ADD_DELAYED_REF) + found = true; + break; + } + } + spin_unlock(&head->lock); + return found; +} + +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + bool testing = btrfs_is_testing(fs_info); + + spin_lock(&delayed_refs->lock); + while (true) { + struct btrfs_delayed_ref_head *head; + struct rb_node *n; + bool pin_bytes = false; + + head = find_first_ref_head(delayed_refs); + if (!head) + break; + + if (!btrfs_delayed_ref_lock(delayed_refs, head)) + continue; + + spin_lock(&head->lock); + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); + drop_delayed_ref(fs_info, delayed_refs, head, ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (!testing && pin_bytes) { + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, head->bytenr); + if (WARN_ON_ONCE(bg == NULL)) { + /* + * Unexpected and there's nothing we can do here + * because we are in a transaction abort path, + * so any errors can only be ignored or reported + * while attempting to cleanup all resources. + */ + btrfs_err(fs_info, +"block group for delayed ref at %llu was not found while destroying ref head", + head->bytenr); + } else { + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(bg->space_info, + head->num_bytes); + bg->reserved -= head->num_bytes; + bg->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + } + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } + if (!testing) + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + btrfs_put_delayed_ref_head(head); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + if (!testing) + btrfs_qgroup_destroy_extent_records(trans); + + spin_unlock(&delayed_refs->lock); } void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); - kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); - kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + kmem_cache_destroy(btrfs_delayed_ref_node_cachep); kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } int __init btrfs_delayed_ref_init(void) { - btrfs_delayed_ref_head_cachep = kmem_cache_create( - "btrfs_delayed_ref_head", - sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) goto fail; - btrfs_delayed_tree_ref_cachep = kmem_cache_create( - "btrfs_delayed_tree_ref", - sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_delayed_tree_ref_cachep) - goto fail; - - btrfs_delayed_data_ref_cachep = kmem_cache_create( - "btrfs_delayed_data_ref", - sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_delayed_data_ref_cachep) + btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); + if (!btrfs_delayed_ref_node_cachep) goto fail; - btrfs_delayed_extent_op_cachep = kmem_cache_create( - "btrfs_delayed_extent_op", - sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 62d679d40f4f..a35067cebb97 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -6,7 +6,17 @@ #ifndef BTRFS_DELAYED_REF_H #define BTRFS_DELAYED_REF_H +#include <linux/types.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs_tree.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; /* these are the possible values of struct btrfs_delayed_ref_node->action */ enum btrfs_delayed_ref_action { @@ -20,12 +30,39 @@ enum btrfs_delayed_ref_action { BTRFS_UPDATE_DELAYED_HEAD, } __packed; +struct btrfs_data_ref { + /* For EXTENT_DATA_REF */ + + /* Inode which refers to this data extent */ + u64 objectid; + + /* + * file_offset - extent_offset + * + * file_offset is the key.offset of the EXTENT_DATA key. + * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. + */ + u64 offset; +}; + +struct btrfs_tree_ref { + /* + * Level of this tree block. + * + * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. + */ + int level; + + /* For non-skinny metadata, no special member needed */ +}; + struct btrfs_delayed_ref_node { struct rb_node ref_node; /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the - * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + * refs rbtree in the corresponding delayed ref head + * (struct btrfs_delayed_ref_head::ref_tree). */ struct list_head add_list; @@ -38,6 +75,15 @@ struct btrfs_delayed_ref_node { /* seq number to keep track of insertion order */ u64 seq; + /* The ref_root for this ref */ + u64 ref_root; + + /* + * The parent for this ref, if this isn't set the ref_root is the + * reference owner. + */ + u64 parent; + /* ref count on this data structure */ refcount_t refs; @@ -54,11 +100,15 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + + union { + struct btrfs_tree_ref tree_ref; + struct btrfs_data_ref data_ref; + }; }; struct btrfs_delayed_extent_op { struct btrfs_disk_key key; - u8 level; bool update_key; bool update_flags; u64 flags_to_set; @@ -74,12 +124,6 @@ struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; /* - * For insertion into struct btrfs_delayed_ref_root::href_root. - * Keep it in the same cache line as 'bytenr' for more efficient - * searches in the rbtree. - */ - struct rb_node href_node; - /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. */ @@ -122,6 +166,9 @@ struct btrfs_delayed_ref_head { */ u64 reserved_bytes; + /* Tree block level, for metadata only. */ + u8 level; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -139,21 +186,11 @@ struct btrfs_delayed_ref_head { bool is_data; bool is_system; bool processing; -}; - -struct btrfs_delayed_tree_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - int level; -}; - -struct btrfs_delayed_data_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - u64 objectid; - u64 offset; + /* + * Indicate if it's currently in the data structure that tracks head + * refs (struct btrfs_delayed_ref_root::head_refs). + */ + bool tracked; }; enum btrfs_delayed_ref_flags { @@ -162,30 +199,52 @@ enum btrfs_delayed_ref_flags { }; struct btrfs_delayed_ref_root { - /* head ref rbtree */ - struct rb_root_cached href_root; - - /* dirty extent records */ - struct rb_root dirty_extent_root; + /* + * Track head references. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits. This is both to get a more + * dense index space (optimizes xarray structure) and because indexes in + * xarrays are of "unsigned long" type, meaning they are 32 bits wide on + * 32 bits platforms, limiting the extent range to 4G which is too low + * and makes it unusable (truncated index values) on 32 bits platforms. + * Protected by the spinlock 'lock' defined below. + */ + struct xarray head_refs; - /* this spin lock protects the rbtree and the entries inside */ - spinlock_t lock; + /* + * Track dirty extent records. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits, for same reasons as above. + */ + struct xarray dirty_extents; - /* how many delayed ref updates we've queued, used by the - * throttling code + /* + * Protects the xarray head_refs, its entries and the following fields: + * num_heads, num_heads_ready, pending_csums and run_delayed_start. */ - atomic_t num_entries; + spinlock_t lock; - /* total number of head nodes in tree */ + /* Total number of head refs, protected by the spinlock 'lock'. */ unsigned long num_heads; - /* total number of head nodes ready for processing */ + /* + * Total number of head refs ready for processing, protected by the + * spinlock 'lock'. + */ unsigned long num_heads_ready; + /* + * Track space reserved for deleting csums of data extents. + * Protected by the spinlock 'lock'. + */ u64 pending_csums; unsigned long flags; + /* + * Track from which bytenr to start searching ref heads. + * Protected by the spinlock 'lock'. + */ u64 run_delayed_start; /* @@ -204,42 +263,6 @@ enum btrfs_ref_type { BTRFS_REF_LAST, } __packed; -struct btrfs_data_ref { - /* For EXTENT_DATA_REF */ - - /* Root which owns this data reference. */ - u64 ref_root; - - /* Inode which refers to this data extent */ - u64 ino; - - /* - * file_offset - extent_offset - * - * file_offset is the key.offset of the EXTENT_DATA key. - * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. - */ - u64 offset; -}; - -struct btrfs_tree_ref { - /* - * Level of this tree block - * - * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. - */ - int level; - - /* - * Root which owns this tree block reference. - * - * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) - */ - u64 ref_root; - - /* For non-skinny metadata, no special member needed */ -}; - struct btrfs_ref { enum btrfs_ref_type type; enum btrfs_delayed_ref_action action; @@ -257,9 +280,15 @@ struct btrfs_ref { u64 real_root; #endif u64 bytenr; - u64 len; + u64 num_bytes; u64 owning_root; + /* + * The root that owns the reference for this reference, this will be set + * or ->parent will be set, depending on what type of reference this is. + */ + u64 ref_root; + /* Bytenr of the parent tree block */ u64 parent; union { @@ -269,8 +298,7 @@ struct btrfs_ref { }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; -extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; -extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_ref_node_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); @@ -308,53 +336,10 @@ static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info * return btrfs_calc_metadata_size(fs_info, num_csum_items); } -static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, - int action, u64 bytenr, u64 len, - u64 parent, u64 owning_root) -{ - generic_ref->action = action; - generic_ref->bytenr = bytenr; - generic_ref->len = len; - generic_ref->parent = parent; - generic_ref->owning_root = owning_root; -} - -static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, - u64 root, u64 mod_root, bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: root; -#endif - generic_ref->tree_ref.level = level; - generic_ref->tree_ref.ref_root = root; - generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(root) && - (!mod_root || is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; - -} - -static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, - u64 ref_root, u64 ino, u64 offset, u64 mod_root, - bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: ref_root; -#endif - generic_ref->data_ref.ref_root = ref_root; - generic_ref->data_ref.ino = ino; - generic_ref->data_ref.offset = offset; - generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(ref_root) && - (!mod_root || is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; -} +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, + bool skip_qgroup); +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, + u64 mod_root, bool skip_qgroup); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) @@ -369,24 +354,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) kmem_cache_free(btrfs_delayed_extent_op_cachep, op); } -static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) -{ - if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); - switch (ref->type) { - case BTRFS_TREE_BLOCK_REF_KEY: - case BTRFS_SHARED_BLOCK_REF_KEY: - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - case BTRFS_SHARED_DATA_REF_KEY: - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - break; - default: - BUG(); - } - } -} +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref); static inline u64 btrfs_ref_head_to_space_flags( struct btrfs_delayed_ref_head *head_ref) @@ -411,26 +379,30 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved); int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, + u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op); void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs); +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); @@ -442,23 +414,44 @@ void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent); +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); -/* - * helper functions to cast a node into its container - */ -static inline struct btrfs_delayed_tree_ref * -btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { - return container_of(node, struct btrfs_delayed_tree_ref, node); + if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + return node->data_ref.objectid; + return node->tree_ref.level; } -static inline struct btrfs_delayed_data_ref * -btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) { - return container_of(node, struct btrfs_delayed_data_ref, node); + if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + return node->data_ref.offset; + return 0; +} + +static inline u8 btrfs_ref_type(struct btrfs_ref *ref) +{ + ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); + + if (ref->type == BTRFS_REF_DATA) { + if (ref->parent) + return BTRFS_SHARED_DATA_REF_KEY; + else + return BTRFS_EXTENT_DATA_REF_KEY; + } else { + if (ref->parent) + return BTRFS_SHARED_BLOCK_REF_KEY; + else + return BTRFS_TREE_BLOCK_REF_KEY; + } + + return 0; } #endif diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 79c4293ddf37..f86fbea0b3de 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -11,10 +11,8 @@ #include <linux/math64.h> #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "async-thread.h" #include "dev-replace.h" @@ -47,7 +45,7 @@ * * - Copy existing extents * - * This happens by re-using scrub facility, as scrub also iterates through + * This happens by reusing scrub facility, as scrub also iterates through * existing extents from commit root. * * Location: scrub_write_block_to_dev_replace() from @@ -246,7 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,13 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -314,11 +312,11 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->bdev_handle = bdev_handle; + device->bdev_file = bdev_file; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); @@ -335,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_release(bdev_handle); + fput(bdev_file); return ret; } @@ -442,9 +440,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); @@ -643,6 +638,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -686,7 +682,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); /* * Commit dev_replace state and reserve 1 item for it. @@ -826,22 +822,45 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - u64 start = 0; - int i; + struct rb_node *node; + + /* + * The chunk mutex must be held so that no new chunks can be created + * while we are updating existing chunks. This guarantees we don't miss + * any new chunk that gets created for a range that falls before the + * range of the last chunk we processed. + */ + lockdep_assert_held(&fs_info->chunk_mutex); write_lock(&fs_info->mapping_tree_lock); - do { + node = rb_first_cached(&fs_info->mapping_tree); + while (node) { + struct rb_node *next = rb_next(node); struct btrfs_chunk_map *map; + u64 next_start; - map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX); - if (!map) - break; - for (i = 0; i < map->num_stripes; i++) + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + next_start = map->start + map->chunk_len; + + for (int i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); - } while (start); + + if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) { + map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX); + if (!map) + break; + node = &map->rb_node; + /* + * Drop the lookup reference since we are holding the + * lock in write mode and no one can remove the chunk + * map from the tree and drop its tree reference. + */ + btrfs_free_chunk_map(map); + } else { + node = next; + } + } write_unlock(&fs_info->mapping_tree_lock); } @@ -882,7 +901,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); /* * We have to use this loop approach because at this point src_device @@ -973,6 +992,7 @@ error: list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++; + dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); @@ -1000,8 +1020,7 @@ error: btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) - btrfs_scratch_superblocks(fs_info, src_device->bdev, - src_device->name->str); + btrfs_scratch_superblocks(fs_info, src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 675082ccec89..23e480efe5e6 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -6,11 +6,15 @@ #ifndef BTRFS_DEV_REPLACE_H #define BTRFS_DEV_REPLACE_H +#include <linux/types.h> +#include <linux/compiler_types.h> + struct btrfs_ioctl_dev_replace_args; struct btrfs_fs_info; struct btrfs_trans_handle; struct btrfs_dev_replace; struct btrfs_block_group; +struct btrfs_device; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 9c07d5c3e5ad..ccf91de29f80 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -22,12 +22,11 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, + const struct btrfs_key *cpu_key, u32 data_size, const char *name, int name_len) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; @@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); @@ -93,7 +92,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -108,7 +106,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index) + const struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; @@ -153,7 +151,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ @@ -190,7 +187,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( if (ret > 0) return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_match_dir_item_name(path, name, name_len); } /* @@ -341,14 +338,13 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; - di = btrfs_match_dir_item_name(root->fs_info, path, - name->name, name->len); + di = btrfs_match_dir_item_name(path, name->name, name->len); if (di) return di; } /* Adjust return code if the key was not found in the next leaf. */ - if (ret > 0) - ret = 0; + if (ret >= 0) + ret = -ENOENT; return ERR_PTR(ret); } @@ -378,8 +374,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; @@ -417,7 +412,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_dir_item *di) + const struct btrfs_dir_item *di) { struct extent_buffer *leaf; diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index e40a226373d7..28d69970bc70 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,15 +3,21 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include <linux/types.h> #include <linux/crc32c.h> struct fscrypt_str; +struct btrfs_fs_info; +struct btrfs_key; +struct btrfs_path; +struct btrfs_root; +struct btrfs_trans_handle; int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index); + const struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, @@ -27,7 +33,7 @@ struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_dir_item *di); + const struct btrfs_dir_item *di); int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -38,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c new file mode 100644 index 000000000000..8567af46e16f --- /dev/null +++ b/fs/btrfs/direct-io.c @@ -0,0 +1,1078 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/fsverity.h> +#include <linux/iomap.h> +#include "ctree.h" +#include "delalloc-space.h" +#include "direct-io.h" +#include "extent-tree.h" +#include "file.h" +#include "fs.h" +#include "transaction.h" +#include "volumes.h" + +struct btrfs_dio_data { + ssize_t submitted; + struct extent_changeset *data_reserved; + struct btrfs_ordered_extent *ordered; + bool data_space_reserved; + bool nocow_done; +}; + +struct btrfs_dio_private { + /* Range of I/O */ + u64 file_offset; + u32 bytes; + + /* This must be last */ + struct btrfs_bio bbio; +}; + +static struct bio_set btrfs_dio_bioset; + +static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, + struct extent_state **cached_state, + unsigned int iomap_flags) +{ + const bool writing = (iomap_flags & IOMAP_WRITE); + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + int ret = 0; + + /* Direct lock must be taken before the extent lock. */ + if (nowait) { + if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + return -EAGAIN; + } else { + lock_dio_extent(io_tree, lockstart, lockend, cached_state); + } + + while (1) { + if (nowait) { + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) { + ret = -EAGAIN; + break; + } + } else { + lock_extent(io_tree, lockstart, lockend, cached_state); + } + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure there's no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, + lockend - lockstart + 1); + + /* + * We need to make sure there are no buffered pages in this + * range either, we could have raced between the invalidate in + * generic_file_direct_write and locking the extent. The + * invalidate needs to happen so that reads after a write do not + * get stale data. + */ + if (!ordered && + (!writing || !filemap_range_has_page(inode->i_mapping, + lockstart, lockend))) + break; + + unlock_extent(io_tree, lockstart, lockend, cached_state); + + if (ordered) { + if (nowait) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + break; + } + /* + * If we are doing a DIO read and the ordered extent we + * found is for a buffered write, we can not wait for it + * to complete and retry, because if we do so we can + * deadlock with concurrent buffered writes on page + * locks. This happens only if our DIO read covers more + * than one extent map, if at this point has already + * created an ordered extent for a previous extent map + * and locked its range in the inode's io tree, and a + * concurrent write against that previous extent map's + * range and this range started (we unlock the ranges + * in the io tree only when the bios complete and + * buffered writes always lock pages before attempting + * to lock range in the io tree). + */ + if (writing || + test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) + btrfs_start_ordered_extent(ordered); + else + ret = nowait ? -EAGAIN : -ENOTBLK; + btrfs_put_ordered_extent(ordered); + } else { + /* + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readahead (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readahead wait for that + * ordered extent to complete while holding a lock on + * that page. + */ + ret = nowait ? -EAGAIN : -ENOTBLK; + } + + if (ret) + break; + + cond_resched(); + } + + if (ret) + unlock_dio_extent(io_tree, lockstart, lockend, cached_state); + return ret; +} + +static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, + const u64 start, + const struct btrfs_file_extent *file_extent, + const int type) +{ + struct extent_map *em = NULL; + struct btrfs_ordered_extent *ordered; + + if (type != BTRFS_ORDERED_NOCOW) { + em = btrfs_create_io_em(inode, start, file_extent, type); + if (IS_ERR(em)) + goto out; + } + + ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT)); + if (IS_ERR(ordered)) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_map_range(inode, start, + start + file_extent->num_bytes - 1, false); + } + em = ERR_CAST(ordered); + } else { + ASSERT(!dio_data->ordered); + dio_data->ordered = ordered; + } + out: + + return em; +} + +static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 len) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_file_extent file_extent; + struct extent_map *em; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len); +again: + ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, + 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } + if (ret) + return ERR_PTR(ret); + + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = ins.offset; + file_extent.ram_bytes = ins.offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent, + BTRFS_ORDERED_REGULAR); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + if (IS_ERR(em)) + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, + 1); + + return em; +} + +static int btrfs_get_blocks_direct_write(struct extent_map **map, + struct inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 *lenp, + unsigned int iomap_flags) +{ + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_file_extent file_extent; + struct extent_map *em = *map; + int type; + u64 block_start; + struct btrfs_block_group *bg; + bool can_nocow = false; + bool space_reserved = false; + u64 len = *lenp; + u64 prev_len; + int ret = 0; + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if ((em->flags & EXTENT_FLAG_PREALLOC) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->disk_bytenr != EXTENT_MAP_HOLE)) { + if (em->flags & EXTENT_FLAG_PREALLOC) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = extent_map_block_start(em) + (start - em->start); + + if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } + } + + prev_len = len; + if (can_nocow) { + struct extent_map *em2; + + /* We can NOCOW, so only need to reserve metadata space. */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + nowait); + if (ret < 0) { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + btrfs_dec_nocow_writers(bg); + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) + ret = -EAGAIN; + goto out; + } + space_reserved = true; + + em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, + &file_extent, type); + btrfs_dec_nocow_writers(bg); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em2; + em = em2; + } + + if (IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + + dio_data->nocow_done = true; + } else { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + + if (nowait) { + ret = -EAGAIN; + goto out; + } + + /* + * If we could not allocate data space before locking the file + * range and we can't do a NOCOW write, then we have to fail. + */ + if (!dio_data->data_space_reserved) { + ret = -ENOSPC; + goto out; + } + + /* + * We have to COW and we have already reserved data space before, + * so now we reserve only metadata. + */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + false); + if (ret < 0) + goto out; + space_reserved = true; + + em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + prev_len - len, true); + } + + /* + * We have created our ordered extent, so we can now release our reservation + * for an outstanding extent. + */ + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. + */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); +out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } + *lenp = len; + return ret; +} + +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct extent_map *em; + struct extent_state *cached_state = NULL; + struct btrfs_dio_data *dio_data = iter->private; + u64 lockstart, lockend; + const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; + u64 len = length; + const u64 data_alloc_len = length; + u32 unlock_bits = EXTENT_LOCKED; + + /* + * We could potentially fault if we have a buffer > PAGE_SIZE, and if + * we're NOWAIT we may submit a bio for a partial range and return + * EIOCBQUEUED, which would result in an errant short read. + * + * The best way to handle this would be to allow for partial completions + * of iocb's, so we could submit the partial bio, return and fault in + * the rest of the pages, and then submit the io for the rest of the + * range. However we don't have that currently, so simply return + * -EAGAIN at this point so that the normal path is used. + */ + if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) + return -EAGAIN; + + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ + if (!write) + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); + + lockstart = start; + lockend = start + len - 1; + + /* + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't + * enough if we've written compressed pages to this area, so we need to + * flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk - the first flush only starts + * compression on the data, while keeping the pages locked, so by the + * time the second flush returns we know bios for the compressed pages + * were submitted and finished, and the pages no longer under writeback. + * + * If we have a NOWAIT request and we have any pages in the range that + * are locked, likely due to compression still in progress, we don't want + * to block on page locks. We also don't want to block on pages marked as + * dirty or under writeback (same as for the non-compression case). + * iomap_dio_rw() did the same check, but after that and before we got + * here, mmap'ed writes may have happened or buffered reads started + * (readpage() and readahead(), which lock pages), as we haven't locked + * the file range yet. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + if (flags & IOMAP_NOWAIT) { + if (filemap_range_needs_writeback(inode->i_mapping, + lockstart, lockend)) + return -EAGAIN; + } else { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } + } + + memset(dio_data, 0, sizeof(*dio_data)); + + /* + * We always try to allocate data space and must do it before locking + * the file range, to avoid deadlocks with concurrent writes to the same + * range if the range has several extents and the writes don't expand the + * current i_size (the inode lock is taken in shared mode). If we fail to + * allocate data space here we continue and later, after locking the + * file range, we fail with ENOSPC only if we figure out we can not do a + * NOCOW write. + */ + if (write && !(flags & IOMAP_NOWAIT)) { + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, data_alloc_len, false); + if (!ret) + dio_data->data_space_reserved = true; + else if (ret && !(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + goto err; + } + + /* + * If this errors out it's because we couldn't invalidate pagecache for + * this range and we need to fallback to buffered IO, or we are doing a + * NOWAIT read/write and we need to block. + */ + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); + if (ret < 0) + goto err; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because that's what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { + free_extent_map(em); + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; + goto unlock_err; + } + + len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, &len, flags); + if (ret < 0) + goto unlock_err; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); + if (dio_data->data_space_reserved) { + u64 release_offset; + u64 release_len = 0; + + if (dio_data->nocow_done) { + release_offset = start; + release_len = data_alloc_len; + } else if (len < data_alloc_len) { + release_offset = start + len; + release_len = data_alloc_len - len; + } + + if (release_len > 0) + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + release_offset, + release_len); + } + } + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->disk_bytenr == EXTENT_MAP_HOLE) || + ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = extent_map_block_start(em) + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; + iomap->length = len; + free_extent_map(em); + + /* + * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, + * writes only hold it for this part. We hold the extent lock until + * we're completely done with the extent map to make sure it remains + * valid. + */ + if (write) + unlock_bits |= EXTENT_DIO_LOCKED; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); + + /* We didn't use everything, unlock the dio extent for the remainder. */ + if (!write && (start + len) < lockend) + unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); + + return 0; + +unlock_err: + /* + * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget + * to update this, be explicit that we expect EXTENT_LOCKED and + * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); +err: + if (dio_data->data_space_reserved) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + start, data_alloc_len); + extent_changeset_free(dio_data->data_reserved); + } + + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_dio_data *dio_data = iter->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); + return 0; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + pos, length, false); + else + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); + ret = -ENOTBLK; + } + if (write) { + btrfs_put_ordered_extent(dio_data->ordered); + dio_data->ordered = NULL; + } + + if (write) + extent_changeset_free(dio_data->data_reserved); + return ret; +} + +static void btrfs_dio_end_io(struct btrfs_bio *bbio) +{ + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; + struct bio *bio = &bbio->bio; + + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + btrfs_finish_ordered_extent(bbio->ordered, NULL, + dip->file_offset, dip->bytes, + !bio->bi_status); + } else { + unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); + } + + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); +} + +static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) +{ + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_ordered_extent *new; + int ret; + + /* Must always be called for the beginning of an ordered extent. */ + if (WARN_ON_ONCE(start != ordered->disk_bytenr)) + return -EINVAL; + + /* No need to split if the ordered extent covers the entire bio. */ + if (ordered->disk_num_bytes == len) { + refcount_inc(&ordered->refs); + bbio->ordered = ordered; + return 0; + } + + /* + * Don't split the extent_map for NOCOW extents, as we're writing into + * a pre-existing one. + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); + if (ret) + return ret; + } + + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return PTR_ERR(new); + bbio->ordered = new; + return 0; +} + +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_dio_data *dio_data = iter->private; + + btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_dio_end_io, bio->bi_private); + bbio->inode = BTRFS_I(iter->inode); + bbio->file_offset = file_offset; + + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; + + dio_data->submitted += bio->bi_iter.bi_size; + + /* + * Check if we are doing a partial write. If we are, we need to split + * the ordered extent to match the submitted bio. Hang on to the + * remaining unfinishable ordered_extent in dio_data so that it can be + * cancelled in iomap_end to avoid a deadlock wherein faulting the + * remaining pages is blocked on the outstanding ordered extent. + */ + if (iter->flags & IOMAP_WRITE) { + int ret; + + ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); + if (ret) { + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + file_offset, dip->bytes, + !ret); + bio->bi_status = errno_to_blk_status(ret); + iomap_dio_bio_end_io(bio); + return; + } + } + + btrfs_submit_bbio(bbio, 0); +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_dio_submit_io, + .bio_set = &btrfs_dio_bioset, +}; + +static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data = { 0 }; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + +static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data = { 0 }; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + const u32 blocksize_mask = fs_info->sectorsize - 1; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + +ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + loff_t pos; + ssize_t written = 0; + ssize_t written_buffered; + size_t prev_left = 0; + loff_t endbyte; + ssize_t ret; + unsigned int ilock_flags = 0; + struct iomap_dio *dio; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; + + /* + * If the write DIO is within EOF, use a shared lock and also only if + * security bits will likely not be dropped by file_remove_privs() called + * from btrfs_write_check(). Either will need to be rechecked after the + * lock was acquired. + */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) + ilock_flags |= BTRFS_ILOCK_SHARED; + +relock: + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; + + /* Shared lock cannot be used with security bits set. */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + return ret; + } + + ret = btrfs_write_check(iocb, ret); + if (ret < 0) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto out; + } + + pos = iocb->ki_pos; + /* + * Re-check since file size may have changed just before taking the + * lock or pos may have changed because of O_APPEND in generic_write_check() + */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && + pos + iov_iter_count(from) > i_size_read(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + + if (check_direct_IO(fs_info, from, pos)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } + + /* + * The iov_iter can be mapped to the same file range we are writing to. + * If that's the case, then we will deadlock in the iomap code, because + * it first calls our callback btrfs_dio_iomap_begin(), which will create + * an ordered extent, and after that it will fault in the pages that the + * iov_iter refers to. During the fault in we end up in the readahead + * pages code (starting at btrfs_readahead()), which will lock the range, + * find that ordered extent and then wait for it to complete (at + * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since + * obviously the ordered extent can never complete as we didn't submit + * yet the respective bio(s). This always happens when the buffer is + * memory mapped to the same file range, since the iomap DIO code always + * invalidates pages in the target file range (after starting and waiting + * for any writeback). + * + * So here we disable page faults in the iov_iter and then retry if we + * got -EFAULT, faulting in the pages before the retry. + */ +again: + from->nofault = true; + dio = btrfs_dio_write(iocb, from, written); + from->nofault = false; + + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + } else { + /* + * If we have a synchronous write, we must make sure the fsync + * triggered by the iomap_dio_complete() call below doesn't + * deadlock on the inode lock - we are already holding it and we + * can't call it after unlocking because we may need to complete + * partial writes due to the input buffer (or parts of it) not + * being already faulted in. + */ + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; + ret = iomap_dio_complete(dio); + current->journal_info = NULL; + } + + /* No increment (+=) because iomap returns a cumulative value. */ + if (ret > 0) + written = ret; + + if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(from); + /* + * We have more data left to write. Try to fault in as many as + * possible of the remainder pages and retry. We do this without + * releasing and locking again the inode, to prevent races with + * truncate. + * + * Also, in case the iov refers to pages in the file range of the + * file we want to write to (due to a mmap), we could enter an + * infinite loop if we retry after faulting the pages in, since + * iomap will invalidate any pages in the range early on, before + * it tries to fault in the pages of the iov. So we keep track of + * how much was left of iov in the previous EFAULT and fallback + * to buffered IO in case we haven't made any progress. + */ + if (left == prev_left) { + ret = -ENOTBLK; + } else { + fault_in_iov_iter_readable(from, left); + prev_left = left; + goto again; + } + } + + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + + /* + * If 'ret' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. + */ + if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from)) + goto out; + +buffered: + /* + * If we are in a NOWAIT context, then return -EAGAIN to signal the caller + * it must retry the operation in a context where blocking is acceptable, + * because even if we end up not blocking during the buffered IO attempt + * below, we will block when flushing and waiting for the IO. + */ + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + + pos = iocb->ki_pos; + written_buffered = btrfs_buffered_write(iocb, from); + if (written_buffered < 0) { + ret = written_buffered; + goto out; + } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. + */ + endbyte = pos + written_buffered - 1; + ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte); + if (ret) + goto out; + ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); + if (ret) + goto out; + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); +out: + return ret < 0 ? ret : written; +} + +static int check_direct_read(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + int ret; + int i, seg; + + ret = check_direct_IO(fs_info, iter, offset); + if (ret < 0) + return ret; + + if (!iter_is_iovec(iter)) + return 0; + + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + const struct iovec *iov1 = iter_iov(iter) + seg; + const struct iovec *iov2 = iter_iov(iter) + i; + + if (iov1->iov_base == iov2->iov_base) + return -EINVAL; + } + } + return 0; +} + +ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + size_t prev_left = 0; + ssize_t read = 0; + ssize_t ret; + + if (fsverity_active(inode)) + return 0; + + if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) + return 0; + + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); +again: + /* + * This is similar to what we do for direct IO writes, see the comment + * at btrfs_direct_write(), but we also disable page faults in addition + * to disabling them only at the iov_iter level. This is because when + * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), + * which can still trigger page fault ins despite having set ->nofault + * to true of our 'to' iov_iter. + * + * The difference to direct IO writes is that we deadlock when trying + * to lock the extent range in the inode's tree during he page reads + * triggered by the fault in (while for writes it is due to waiting for + * our own ordered extent). This is because for direct IO reads, + * btrfs_dio_iomap_begin() returns with the extent range locked, which + * is only unlocked in the endio callback (end_bio_extent_readpage()). + */ + pagefault_disable(); + to->nofault = true; + ret = btrfs_dio_read(iocb, to, read); + to->nofault = false; + pagefault_enable(); + + /* No increment (+=) because iomap returns a cumulative value. */ + if (ret > 0) + read = ret; + + if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(to); + + if (left == prev_left) { + /* + * We didn't make any progress since the last attempt, + * fallback to a buffered read for the remainder of the + * range. This is just to avoid any possibility of looping + * for too long. + */ + ret = read; + } else { + /* + * We made some progress since the last retry or this is + * the first time we are retrying. Fault in as many pages + * as possible and retry. + */ + fault_in_iov_iter_writeable(to, left); + prev_left = left; + goto again; + } + } + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + return ret < 0 ? ret : read; +} + +int __init btrfs_init_dio(void) +{ + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_dio_private, bbio.bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + + return 0; +} + +void __cold btrfs_destroy_dio(void) +{ + bioset_exit(&btrfs_dio_bioset); +} diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h new file mode 100644 index 000000000000..3dc3ea926afe --- /dev/null +++ b/fs/btrfs/direct-io.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIRECT_IO_H +#define BTRFS_DIRECT_IO_H + +#include <linux/types.h> + +int __init btrfs_init_dio(void); +void __cold btrfs_destroy_dio(void); + +ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from); +ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to); + +#endif /* BTRFS_DIRECT_IO_H */ diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 944a7340f6a4..e815d165cccc 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) + const struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } @@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ -static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c843563914ca..f09db62e61a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,7 +17,7 @@ #include <linux/error-injection.h> #include <linux/crc32c.h> #include <linux/sched/mm.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" @@ -193,7 +192,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, @@ -214,7 +213,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int failed = 0; @@ -227,7 +226,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -359,7 +358,7 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) /* Do basic extent buffer checks at read time */ int btrfs_validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -368,6 +367,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, u8 result[BTRFS_CSUM_SIZE]; const u8 *header_csum; int ret = 0; + const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS); ASSERT(check); @@ -400,13 +400,16 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, -"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), - btrfs_header_level(eb)); - ret = -EUCLEAN; - goto out; + btrfs_header_level(eb), + ignore_csum ? ", ignored" : ""); + if (!ignore_csum) { + ret = -EUCLEAN; + goto out; + } } if (found_level != check->level) { @@ -426,7 +429,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } if (check->has_first_key) { - struct btrfs_key *expect_key = &check->first_key; + const struct btrfs_key *expect_key = &check->first_key; struct btrfs_key found_key; if (found_level) @@ -498,15 +501,15 @@ static int btree_migrate_folio(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info; int ret; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_fs_info *fs_info; if (wbc->for_kupdate) return 0; - fs_info = BTRFS_I(mapping->host)->root->fs_info; + fs_info = inode_to_fs_info(mapping->host); /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, @@ -522,18 +525,19 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(folio->mapping->host)->io_tree; + + tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { - btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); @@ -544,7 +548,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; @@ -635,10 +639,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, check->owner_root)) { - free_extent_buffer_stale(buf); - return ERR_PTR(-EUCLEAN); - } return buf; } @@ -646,7 +646,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { - bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + bool dummy = btrfs_is_testing(fs_info); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -658,13 +658,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->state = 0; RB_CLEAR_NODE(&root->rb_node); - root->last_trans = 0; + btrfs_set_root_last_trans(root, 0); root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; - root->inode_tree = RB_ROOT; - /* GFP flags are compatible with XA_FLAGS_*. */ - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + xa_init(&root->inodes); + xa_init(&root->delayed_nodes); btrfs_init_root_block_rsv(root); @@ -675,7 +674,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->reloc_dirty_list); - spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); @@ -776,7 +774,7 @@ int btrfs_global_root_insert(struct btrfs_root *root) if (tmp) { ret = -EEXIST; btrfs_warn(fs_info, "global root %llu %llu already exists", - root->root_key.objectid, root->root_key.offset); + btrfs_root_id(root), root->root_key.offset); } return ret; } @@ -848,13 +846,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } -struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -926,8 +917,7 @@ fail: return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -975,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1001,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1011,8 +1001,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return ret; } - log_root->last_trans = trans->transid; - log_root->root_key.offset = root->root_key.objectid; + btrfs_set_root_last_trans(log_root, trans->transid); + log_root->root_key.offset = btrfs_root_id(root); inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1034,7 +1024,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_tree_parent_check check = { 0 }; @@ -1076,15 +1066,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - root->root_key.objectid != btrfs_header_owner(root->node)) { + if (!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node)) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", - root->root_key.objectid, root->node->start, + btrfs_root_id(root), root->node->start, btrfs_header_owner(root->node), - root->root_key.objectid); + btrfs_root_id(root)); ret = -EUCLEAN; goto fail; } @@ -1096,7 +1086,7 @@ fail: } struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_path *path; @@ -1121,9 +1111,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_drew_lock_init(&root->snapshot_lock); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(root->root_key.objectid)) { + is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1132,7 +1122,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(root->root_key.objectid) && + if (is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); @@ -1219,7 +1209,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), root); if (ret == 0) { btrfs_grab_root(root); @@ -1231,7 +1221,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } -void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_DEBUG struct btrfs_root *root; @@ -1244,6 +1234,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); + WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); @@ -1265,9 +1256,15 @@ static void free_global_roots(struct btrfs_fs_info *fs_info) void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { + struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); + if (percpu_counter_initialized(em_counter)) + ASSERT(percpu_counter_sum_positive(em_counter) == 0); + percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -1288,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -1849,7 +1845,8 @@ void btrfs_put_root(struct btrfs_root *root) return; if (refcount_dec_and_test(&root->refs)) { - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (WARN_ON(!xa_empty(&root->inodes))) + xa_destroy(&root->inodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -1923,7 +1920,7 @@ static int btrfs_init_btree_inode(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID); set_nlink(inode, 1); /* * we set the i_size on the btree inode to the max possible int. @@ -1934,15 +1931,11 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; - BTRFS_I(inode)->location.type = 0; - BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); fs_info->btree_inode = inode; @@ -1966,7 +1959,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2141,7 +2134,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, /* If we have IGNOREDATACSUMS skip loading these roots. */ if (objectid == BTRFS_CSUM_TREE_OBJECTID && btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); return 0; } @@ -2194,7 +2187,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) ret = ret ? ret : -ENOENT; @@ -2239,7 +2232,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) struct btrfs_key location; int ret; - BUG_ON(!fs_info->tree_root); + ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) @@ -2335,6 +2328,71 @@ out: return ret; } +static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb) +{ + unsigned int cur = 0; /* Offset inside the sys chunk array */ + /* + * At sb read time, fs_info is not fully initialized. Thus we have + * to use super block sectorsize, which should have been validated. + */ + const u32 sectorsize = btrfs_super_sectorsize(sb); + u32 sys_array_size = btrfs_super_sys_array_size(sb); + + if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + return -EUCLEAN; + } + + while (cur < sys_array_size) { + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u64 type; + u16 num_stripes; + u32 len; + int ret; + + disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); + len = sizeof(*disk_key); + + if (cur + len > sys_array_size) + goto short_read; + cur += len; + + btrfs_disk_key_to_cpu(&key, disk_key); + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + key.type, cur); + return -EUCLEAN; + } + chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + goto short_read; + type = btrfs_stack_chunk_type(chunk); + if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur); + return -EUCLEAN; + } + ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, + sectorsize); + if (ret < 0) + return ret; + cur += btrfs_chunk_item_size(num_stripes); + } + return 0; +short_read: + btrfs_err(fs_info, + "super block sys chunk array short read, cur=%u sys_array_size=%u", + cur, sys_array_size); + return -EUCLEAN; +} + /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. @@ -2345,21 +2403,29 @@ out: * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ -int btrfs_validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num) +int btrfs_validate_super(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); if (btrfs_super_magic(sb) != BTRFS_MAGIC) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { - btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", - btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); - ret = -EINVAL; + if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { + if (!ignore_flags) { + btrfs_err(fs_info, + "unrecognized or unsupported super flag 0x%llx", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } else { + btrfs_info(fs_info, + "unrecognized or unsupported super flags: 0x%llx, ignored", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + } } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", @@ -2462,7 +2528,7 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || !btrfs_fs_incompat(fs_info, NO_HOLES))) { btrfs_err(fs_info, - "block-group-tree feature requires fres-space-tree and no-holes"); + "block-group-tree feature requires free-space-tree and no-holes"); ret = -EINVAL; } @@ -2495,6 +2561,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + ret = validate_sys_chunk_array(fs_info, sb); + /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk @@ -2583,7 +2651,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev struct btrfs_tree_parent_check check = { .level = level, .transid = gen, - .owner_root = root->root_key.objectid + .owner_root = btrfs_root_id(root) }; int ret = 0; @@ -2785,6 +2853,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -2839,6 +2908,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block int ret; fs_info->sb = sb; + /* Temporary fixed values for block size until we read the superblock. */ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); @@ -2846,10 +2916,18 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL); + if (ret) + return ret; + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2870,6 +2948,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (sb_rdonly(sb)) set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + if (btrfs_test_opt(fs_info, IGNOREMETACSUMS)) + set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state); return btrfs_alloc_stripe_hash_table(fs_info); } @@ -2915,22 +2995,22 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned int ret = 0; + int ret = 0; while (1) { + unsigned int found; + spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + found = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) { + if (!found) { spin_unlock(&fs_info->fs_roots_radix_lock); break; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; + root_objectid = btrfs_root_id(gang[found - 1]) + 1; - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { /* Avoid to grab roots in dead_roots. */ if (btrfs_root_refs(&gang[i]->root_item) == 0) { gang[i] = NULL; @@ -2941,24 +3021,25 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; + root_objectid = btrfs_root_id(gang[i]); + /* + * Continue to release the remaining roots after the first + * error without cleanup and preserve the first error + * for the return. + */ + if (!ret) + ret = btrfs_orphan_cleanup(gang[i]); btrfs_put_root(gang[i]); } + if (ret) + break; + root_objectid++; } -out: - /* Release the uncleaned roots due to error. */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; + return ret; } /* @@ -3191,8 +3272,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -3310,8 +3390,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the @@ -3334,20 +3416,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - + if (sectorsize < PAGE_SIZE) btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } ret = btrfs_init_workqueues(fs_info); if (ret) @@ -3356,6 +3428,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); + /* Update the values for the current filesystem. */ sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); @@ -3615,28 +3688,25 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_super_write(struct bio *bio) { struct btrfs_device *device = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - struct page *page; - - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s (%d)", + "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); - ClearPageUptodate(page); - SetPageError(page); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); - } else { - SetPageUptodate(page); + /* Ensure failure if the primary sb fails. */ + if (bio->bi_opf & REQ_FUA) + atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR, + &device->sb_write_errors); + else + atomic_inc(&device->sb_write_errors); } - - put_page(page); - unlock_page(page); + folio_unlock(fi.folio); + folio_put(fi.folio); } bio_put(bio); @@ -3648,7 +3718,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, struct btrfs_super_block *super; struct page *page; u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; int ret; bytenr_orig = btrfs_sb_offset(copy_num); @@ -3723,34 +3793,36 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) /* * Write superblock @sb to the @device. Do not wait for completion, all the - * pages we use for writing are locked. + * folios we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when page is not found or submission fails. + * Return number of errors when folio is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; - struct address_space *mapping = device->bdev->bd_inode->i_mapping; + struct address_space *mapping = device->bdev->bd_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; - int errors = 0; int ret; u64 bytenr, bytenr_orig; + atomic_set(&device->sb_write_errors, 0); + if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; struct bio *bio; struct btrfs_super_block *disk_super; + size_t offset; bytenr_orig = btrfs_sb_offset(i); ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); @@ -3760,7 +3832,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_err(device->fs_info, "couldn't get super block location for mirror %d", i); - errors++; + atomic_inc(&device->sb_write_errors); continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3773,20 +3845,20 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); - page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, - GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(folio)) { btrfs_err(device->fs_info, "couldn't get super block page for bytenr %llu", bytenr); - errors++; + atomic_inc(&device->sb_write_errors); continue; } + ASSERT(folio_order(folio) == 0); - /* Bump the refcount for wait_dev_supers() */ - get_page(page); - - disk_super = page_address(page); + offset = offset_in_folio(folio, bytenr); + disk_super = folio_address(folio) + offset; memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* @@ -3800,8 +3872,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; - __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, - offset_in_page(bytenr)); + bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); /* * We FUA only the first super block. The others we allow to @@ -3813,17 +3884,17 @@ static int write_dev_supers(struct btrfs_device *device, submit_bio(bio); if (btrfs_advance_sb_log(device, i)) - errors++; + atomic_inc(&device->sb_write_errors); } - return errors < i ? 0 : -1; + return atomic_read(&device->sb_write_errors) < i ? 0 : -1; } /* * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when page is not found or not marked up to - * date. + * Return -1 if primary super block write failed or when there were no super block + * copies written. Otherwise 0. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { @@ -3837,7 +3908,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { @@ -3852,30 +3923,21 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - page = find_get_page(device->bdev->bd_inode->i_mapping, - bytenr >> PAGE_SHIFT); - if (!page) { - errors++; - if (i == 0) - primary_failed = true; + folio = filemap_get_folio(device->bdev->bd_mapping, + bytenr >> PAGE_SHIFT); + /* If the folio has been removed, then we know it completed. */ + if (IS_ERR(folio)) continue; - } - /* Page is submitted locked and unlocked once the IO completes */ - wait_on_page_locked(page); - if (PageError(page)) { - errors++; - if (i == 0) - primary_failed = true; - } - - /* Drop our reference */ - put_page(page); + ASSERT(folio_order(folio) == 0); - /* Drop the reference from the writing run */ - put_page(page); + /* Folio will be unlocked once the write completes. */ + folio_wait_locked(folio); + folio_put(folio); } - /* log error, force error return */ + errors += atomic_read(&device->sb_write_errors); + if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) + primary_failed = true; if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); @@ -4136,7 +4198,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); + (unsigned long)btrfs_root_id(root)); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); @@ -4155,9 +4217,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_commit_super(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_trans_handle *trans; - mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); @@ -4167,10 +4226,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(fs_info->tree_root); } static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) @@ -4179,9 +4235,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) struct btrfs_transaction *tmp; bool found = false; - if (list_empty(&fs_info->trans_list)) - return; - /* * This function is only called at the very end of close_ctree(), * thus no other running transaction, no need to take trans_lock. @@ -4203,7 +4256,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); - btrfs_cleanup_one_transaction(trans, fs_info); + btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; @@ -4273,6 +4326,26 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Wait for any fixup workers to complete. + * If we don't wait for them here and they are still running by the time + * we call kthread_stop() against the cleaner kthread further below, we + * get an use-after-free on the cleaner because the fixup worker adds an + * inode to the list of delayed iputs and then attempts to wakeup the + * cleaner kthread, which was already stopped and destroyed. We parked + * already the cleaner, but below we run all pending delayed iputs. + */ + btrfs_flush_workqueue(fs_info->fixup_workers); + /* + * Similar case here, we have to wait for delalloc workers before we + * proceed below and stop the cleaner kthread, otherwise we trigger a + * use-after-tree on the cleaner kthread task_struct when a delalloc + * worker running submit_compressed_extents() adds a delayed iput, which + * does a wake up on the cleaner kthread, which was already freed below + * when we call kthread_stop(). + */ + btrfs_flush_workqueue(fs_info->delalloc_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4300,6 +4373,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4481,7 +4555,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; + root_objectid = btrfs_root_id(gang[i]); btrfs_free_log(NULL, gang[i]); btrfs_put_root(gang[i]); } @@ -4534,84 +4608,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) * extents that haven't had their dirty pages IO start writeout yet * actually get run and error out properly. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); -} - -static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - - delayed_refs = &trans->delayed_refs; - - spin_lock(&delayed_refs->lock); - if (atomic_read(&delayed_refs->num_entries) == 0) { - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, "delayed_refs has NO entry"); - return; - } - - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { - struct btrfs_delayed_ref_head *head; - struct rb_node *n; - bool pin_bytes = false; - - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_lock(delayed_refs, head)) - continue; - - spin_lock(&head->lock); - while ((n = rb_first_cached(&head->ref_tree)) != NULL) { - ref = rb_entry(n, struct btrfs_delayed_ref_node, - ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); - } - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - btrfs_delete_ref_head(delayed_refs, head); - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - mutex_unlock(&head->mutex); - - if (pin_bytes) { - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - BUG_ON(!cache); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - btrfs_error_unpin_extent_range(fs_info, head->bytenr, - head->bytenr + head->num_bytes - 1); - } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); - btrfs_put_delayed_ref_head(head); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - btrfs_qgroup_destroy_extent_records(trans); - - spin_unlock(&delayed_refs->lock); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) @@ -4626,7 +4623,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - __btrfs_del_delalloc_inode(root, btrfs_inode); + btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* @@ -4812,16 +4809,16 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) btrfs_qgroup_free_meta_all_pertrans(root); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); } } spin_unlock(&fs_info->fs_roots_radix_lock); } -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { + struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); @@ -4833,7 +4830,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, list_del_init(&dev->post_commit_list); } - btrfs_destroy_delayed_refs(cur_trans, fs_info); + btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); @@ -4841,14 +4838,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&fs_info->transaction_wait); - btrfs_destroy_delayed_inodes(fs_info); - btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_all_qgroup_pertrans(fs_info); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4883,7 +4876,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } else { spin_unlock(&fs_info->trans_lock); } - btrfs_cleanup_one_transaction(t, fs_info); + btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) @@ -4901,6 +4894,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_all_delalloc_inodes(fs_info); btrfs_drop_all_logs(fs_info); + btrfs_free_all_qgroup_pertrans(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; @@ -4925,7 +4919,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto error; + } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; @@ -4949,7 +4950,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", - root->root_key.objectid); + btrfs_root_id(root)); ret = -ENOSPC; goto out; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index eb3473d1c1ac..587842991b24 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,6 +6,22 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H +#include <linux/sizes.h> +#include <linux/compiler_types.h> +#include "ctree.h" +#include "fs.h" + +struct block_device; +struct super_block; +struct extent_buffer; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info; +struct btrfs_super_block; +struct btrfs_trans_handle; +struct btrfs_tree_parent_check; +struct btrfs_transaction; + #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -25,11 +41,7 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_tree_parent_check; - -void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); +void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check); @@ -40,12 +52,10 @@ struct extent_buffer *btrfs_find_create_tree_block( int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); -int __cold open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); -int btrfs_validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num); +int btrfs_validate_super(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); @@ -53,7 +63,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key); + const struct btrfs_key *key); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); @@ -71,7 +81,6 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); -struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); @@ -79,7 +88,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); int btrfs_validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check); + const struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -87,9 +96,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. - * - * If you want to ensure the whole tree is safe, you should use - * fs_info->subvol_srcu */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { @@ -106,7 +112,7 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, - struct btrfs_tree_parent_check *check); + const struct btrfs_tree_parent_check *check); blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, @@ -117,8 +123,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); -void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 744a02b7fd67..e2b22bea348a 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "ctree.h" #include "disk-io.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "export.h" #include "accessors.h" #include "super.h" @@ -35,15 +34,15 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, type = FILEID_BTRFS_WITHOUT_PARENT; fid->objectid = btrfs_ino(BTRFS_I(inode)); - fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid; + fid->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); fid->gen = inode->i_generation; if (parent) { u64 parent_root_id; - fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_objectid = btrfs_ino(BTRFS_I(parent)); fid->parent_gen = parent->i_generation; - parent_root_id = BTRFS_I(parent)->root->root_key.objectid; + parent_root_id = btrfs_root_id(BTRFS_I(parent)->root); if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; @@ -85,7 +84,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(root)) return ERR_CAST(root); - inode = btrfs_iget(sb, objectid, root); + inode = btrfs_iget(objectid, root); btrfs_put_root(root); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -161,7 +160,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) return ERR_PTR(-ENOMEM); if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; @@ -174,8 +173,15 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; + if (ret == 0) { + /* + * Key with offset of -1 found, there would have to exist an + * inode with such number or a root with such id. + */ + ret = -EUCLEAN; + goto fail; + } - BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; @@ -204,7 +210,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); + return d_obtain_alias(btrfs_iget(key.objectid, root)); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -215,7 +221,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, { struct inode *inode = d_inode(child); struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; @@ -237,7 +243,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.objectid = btrfs_root_id(BTRFS_I(inode)->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index eba6bc4f5a61..464582273af9 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -4,6 +4,10 @@ #define BTRFS_EXPORT_H #include <linux/exportfs.h> +#include <linux/types.h> + +struct dentry; +struct super_block; extern const struct export_operations btrfs_export_ops; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index e3ee5449cc4a..6d08c100b01d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -4,9 +4,9 @@ #include <trace/events/btrfs.h> #include "messages.h" #include "ctree.h" +#include "extent_io.h" #include "extent-io-tree.h" #include "btrfs_inode.h" -#include "misc.h" static struct kmem_cache *extent_state_cache; @@ -48,6 +48,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) extent_state_in_tree(state), refcount_read(&state->refs)); list_del(&state->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_state_cache, state); } } @@ -125,7 +126,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * Empty an io tree, removing and freeing every extent state record from the * tree. This should be called once we are sure no other task can access the * tree anymore, so no tree updates happen after we empty the tree and there - * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ void extent_io_tree_release(struct extent_io_tree *tree) @@ -140,7 +141,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); - ASSERT(!(state->state & EXTENT_LOCKED)); + ASSERT(!(state->state & EXTENT_LOCK_BITS)); /* * No need for a memory barrier here, as we are holding the tree * lock and we only change the waitqueue while holding that lock @@ -398,7 +399,7 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) return; merge_prev_state(tree, state); @@ -444,7 +445,7 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, struct rb_node *parent = NULL; const u64 start = state->start - 1; const u64 end = state->end + 1; - const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); + const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -615,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. * - * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given - * range from the tree regardless of state (ie for truncate). - * * The range [start, end] is inclusive. * * This takes the tree lock, and returns 0 on success and < 0 on error. @@ -646,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = (bits & EXTENT_LOCKED) ? 1 : 0; - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); + if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc) { @@ -860,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state, static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); + return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY); } /* @@ -1059,10 +1056,10 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *prealloc = NULL; struct rb_node **p = NULL; struct rb_node *parent = NULL; - int err = 0; + int ret = 0; u64 last_start; u64 last_end; - u32 exclusive_bits = (bits & EXTENT_LOCKED); + u32 exclusive_bits = (bits & EXTENT_LOCK_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -1084,6 +1081,9 @@ again: */ prealloc = alloc_extent_state(mask); } + /* Optimistically preallocate the extent changeset ulist node. */ + if (changeset) + extent_changeset_prealloc(changeset, mask); spin_lock(&tree->lock); if (cached_state && *cached_state) { @@ -1122,7 +1122,7 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = state->start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1158,7 +1158,7 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1175,12 +1175,12 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, start); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (err) + if (ret) goto out; if (state->end <= end) { set_state_bits(tree, state, bits, changeset); @@ -1224,8 +1224,8 @@ hit_next: prealloc->end = this_end; inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { - err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, prealloc, "insert", err); + ret = PTR_ERR(inserted_state); + extent_io_tree_panic(tree, prealloc, "insert", ret); } cache_state(inserted_state, cached_state); @@ -1244,16 +1244,16 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1275,7 +1275,7 @@ out: if (prealloc) free_extent_state(prealloc); - return err; + return ret; } @@ -1312,7 +1312,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *prealloc = NULL; struct rb_node **p = NULL; struct rb_node *parent = NULL; - int err = 0; + int ret = 0; u64 last_start; u64 last_end; bool first_iteration = true; @@ -1351,7 +1351,7 @@ again: if (!state) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } prealloc->start = start; @@ -1402,14 +1402,14 @@ hit_next: if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, start); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (err) + if (ret) goto out; if (state->end <= end) { set_state_bits(tree, state, bits, NULL); @@ -1442,7 +1442,7 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -1454,8 +1454,8 @@ hit_next: prealloc->end = this_end; inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { - err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, prealloc, "insert", err); + ret = PTR_ERR(inserted_state); + extent_io_tree_panic(tree, prealloc, "insert", ret); } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) @@ -1472,13 +1472,13 @@ hit_next: if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); @@ -1500,7 +1500,7 @@ out: if (prealloc) free_extent_state(prealloc); - return err; + return ret; } /* @@ -1808,12 +1808,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. + * We don't support EXTENT_LOCK_BITS yet, as current changeset will + * record any bits changed, so for EXTENT_LOCK_BITS case, it will either + * fail with -EEXIST or changeset will record the whole range. */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } @@ -1822,26 +1821,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * Don't support EXTENT_LOCKED case, same reason as + * Don't support EXTENT_LOCK_BITS case, same reason as * set_record_extent_bits(). */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached); + clear_extent_bit(tree, start, failed_start - 1, bits, cached); return 0; } return 1; @@ -1851,23 +1849,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached_state); + bits, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, - &failed_state); - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + wait_extent_bit(tree, failed_start, end, bits, &failed_state); + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } @@ -1883,8 +1880,8 @@ void __cold extent_state_free_cachep(void) int __init extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_state), 0, 0, + NULL); if (!extent_state_cache) return -ENOMEM; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index ebe6390d65e9..6ffef1cd37c1 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,15 +3,23 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/list.h> +#include <linux/wait.h> #include "misc.h" struct extent_changeset; +struct btrfs_fs_info; +struct btrfs_inode; /* Bits for the extent state */ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), @@ -60,6 +68,8 @@ enum { EXTENT_ADD_INODE_BYTES | \ EXTENT_CLEAR_ALL_BITS) +#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED) + /* * Redefined bits above which are used only in the device allocation tree, * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV @@ -127,12 +137,22 @@ const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tre void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); @@ -205,5 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); +static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); +} #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8e8cc1111277..3014a1a23efd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -18,7 +18,7 @@ #include <linux/crc32c.h> #include "ctree.h" #include "extent-tree.h" -#include "tree-log.h" +#include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" @@ -26,14 +26,11 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" -#include "delalloc-space.h" #include "discard.h" -#include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" @@ -49,9 +46,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -109,10 +104,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; struct btrfs_key key; - u32 item_size; u64 num_refs; u64 extent_flags; u64 owner = 0; @@ -131,11 +123,6 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (!trans) { - path->skip_locking = 1; - path->search_commit_root = 1; - } - search_again: key.objectid = bytenr; key.offset = offset; @@ -149,7 +136,7 @@ search_again: if (ret < 0) goto out_free; - if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -162,41 +149,40 @@ search_again: } if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - owner = btrfs_get_extent_owner_root(fs_info, leaf, - path->slots[0]); - } else { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_item *ei; + const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + + if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(fs_info, ret, NULL); - + btrfs_abort_transaction(trans, ret); goto out_free; } - BUG_ON(num_refs == 0); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + if (unlikely(num_refs == 0)) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent item (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, ret); + goto out_free; + } + extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); } else { num_refs = 0; extent_flags = 0; ret = 0; } - if (!trans) - goto out; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -216,15 +202,13 @@ search_again: spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); -out: + WARN_ON(num_refs == 0); if (refs) *refs = num_refs; @@ -451,9 +435,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; u32 nritems; - int ret; int recow; - int err = -ENOENT; + int ret; key.objectid = bytenr; if (parent) { @@ -467,26 +450,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, again: recow = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } + if (ret < 0) + return ret; if (parent) { - if (!ret) - return 0; - goto fail; + if (ret) + return -ENOENT; + return 0; } + ret = -ENOENT; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; + if (ret) { + if (ret > 0) + return -ENOENT; + return ret; + } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -507,37 +490,37 @@ again: btrfs_release_path(path); goto again; } - err = 0; + ret = 0; break; } path->slots[0]++; } fail: - return err; + return ret; } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u32 size; u32 num_refs; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; + key.offset = node->parent; size = sizeof(struct btrfs_shared_data_ref); } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); + key.offset = hash_extent_data_ref(node->ref_root, owner, offset); size = sizeof(struct btrfs_extent_data_ref); } @@ -546,15 +529,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, goto fail; leaf = path->nodes[0]; - if (parent) { + if (node->parent) { struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } } else { @@ -562,7 +545,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, while (ret == -EEXIST) { ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, + if (match_extent_data_ref(leaf, ref, node->ref_root, owner, offset)) break; btrfs_release_path(path); @@ -577,18 +560,16 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); + btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -636,7 +617,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -708,20 +688,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; + key.offset = node->parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; + key.offset = node->ref_root; } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -813,7 +793,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; - path->keep_locks = 1; } else extra_size = -1; @@ -964,6 +943,25 @@ again: ret = -EAGAIN; goto out; } + + if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); + if (tmp_key.objectid == bytenr && + tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = -EAGAIN; + goto out; + } + goto out_no_entry; + } + + if (!path->keep_locks) { + btrfs_release_path(path); + path->keep_locks = 1; + goto again; + } + /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. @@ -977,13 +975,15 @@ again: goto out; } } +out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert) { + if (path->keep_locks) { path->keep_locks = 0; - path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } + if (insert) + path->search_for_extension = 0; return ret; } @@ -1048,7 +1048,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1193,7 +1192,6 @@ static noinline_for_stack int update_inline_extent_backref( item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(trans, leaf); return 0; } @@ -1258,12 +1256,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << SECTOR_SHIFT); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } @@ -1318,13 +1316,29 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, bytes_left = end - start; } - if (bytes_left) { + while (bytes_left) { + u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left); + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - bytes_left >> SECTOR_SHIFT, + bytes_to_discard >> SECTOR_SHIFT, GFP_NOFS); - if (!ret) - *discarded_bytes += bytes_left; + + if (ret) { + if (ret != -EOPNOTSUPP) + break; + continue; + } + + start += bytes_to_discard; + bytes_left -= bytes_to_discard; + *discarded_bytes += bytes_to_discard; + + if (btrfs_trim_interrupted()) { + ret = -ERESTARTSYS; + break; + } } + return ret; } @@ -1442,7 +1456,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1465,34 +1479,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * - * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ - * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical - * bytenr of the parent block. Since new extents are always - * created with indirect references, this will only be the case - * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must - * be 0 - * - * @root_objectid: The id of the root where this modification has originated, - * this can be either one of the well-known metadata trees or - * the subvolume id which references this extent. - * - * @owner: For data extents it is the inode number of the owning file. - * For metadata extents this parameter holds the level in the - * tree of the extent. - * - * @offset: For metadata extents the offset is ignored and is currently - * always passed as 0. For data extents it is the fileoffset - * this extent belongs to. - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; @@ -1501,6 +1493,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_key key; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u64 refs; int refs_to_add = node->ref_mod; int ret; @@ -1511,7 +1505,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, - parent, root_objectid, owner, + node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; @@ -1529,17 +1523,13 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); + ret = insert_tree_block_ref(trans, path, node, bytenr); else - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); + ret = insert_extent_data_ref(trans, path, node, bytenr); if (ret) btrfs_abort_transaction(trans, ret); @@ -1572,15 +1562,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, bool insert_reserved) { int ret = 0; - struct btrfs_delayed_data_ref *ref; u64 parent = 0; u64 flags = 0; - ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_data_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; + parent = node->parent; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { struct btrfs_key key; @@ -1591,6 +1579,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, .is_inc = true, .generation = trans->transid, }; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); if (extent_op) flags |= extent_op->flags_to_set; @@ -1599,21 +1589,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; - ret = alloc_reserved_file_extent(trans, parent, ref->root, - flags, ref->objectid, - ref->offset, &key, - node->ref_mod, href->owning_root); + ret = alloc_reserved_file_extent(trans, parent, node->ref_root, + flags, owner, offset, &key, + node->ref_mod, + href->owning_root); free_head_ref_squota_rsv(trans->fs_info, href); if (!ret) ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root, - ref->objectid, ref->offset, - extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, parent, - ref->root, ref->objectid, - ref->offset, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1666,7 +1652,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = extent_op->level; + key.offset = head->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = head->num_bytes; @@ -1701,7 +1687,7 @@ again: ret = -EUCLEAN; btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", - head->bytenr, head->num_bytes, extent_op->level); + head->bytenr, head->num_bytes, head->level); goto out; } } @@ -1720,8 +1706,6 @@ again: ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -1735,16 +1719,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_tree_ref *ref; u64 parent = 0; u64 ref_root = 0; - ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_tree_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - ref_root = ref->root; + parent = node->parent; + ref_root = node->ref_root; if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, @@ -1762,16 +1744,13 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, .generation = trans->transid, }; - BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, node, extent_op); if (!ret) btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, parent, ref_root, - ref->level, 0, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1817,40 +1796,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_node *ref; - - if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - return NULL; - - /* - * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. - * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. - */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - -static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - spin_lock(&delayed_refs->lock); - head->processing = false; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(head); -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1925,7 +1870,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, head); + btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { @@ -1944,7 +1889,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -1967,39 +1912,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return ret; } -static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( - struct btrfs_trans_handle *trans) -{ - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; - struct btrfs_delayed_ref_head *head = NULL; - int ret; - - spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(delayed_refs); - if (!head) { - spin_unlock(&delayed_refs->lock); - return head; - } - - /* - * Grab the lock that says we are going to process all the refs for - * this head - */ - ret = btrfs_delayed_ref_lock(delayed_refs, head); - spin_unlock(&delayed_refs->lock); - - /* - * We may have dropped the spin lock to get the head mutex lock, and - * that might have given someone else time to free the head. If that's - * true, it has been removed from our list and we can move on. - */ - if (ret == -EAGAIN) - head = ERR_PTR(-EAGAIN); - - return head; -} - static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) @@ -2016,11 +1928,11 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } @@ -2043,7 +1955,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - atomic_dec(&delayed_refs->num_entries); /* * Record the must_insert_reserved flag before we drop the @@ -2069,7 +1980,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } @@ -2107,7 +2018,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, do { if (!locked_ref) { - locked_ref = btrfs_obtain_ref_head(trans); + locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; @@ -2254,7 +2165,7 @@ again: btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { + if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); return 0; } @@ -2271,7 +2182,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; - int level = btrfs_header_level(eb); int ret; extent_op = btrfs_alloc_delayed_extent_op(); @@ -2281,21 +2191,21 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, + btrfs_header_level(eb), extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; struct rb_node *node; @@ -2311,7 +2221,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); @@ -2349,6 +2259,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { + u64 ref_owner; + u64 ref_offset; + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -2356,15 +2269,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root, break; } - data_ref = btrfs_delayed_node_to_data_ref(ref); + ref_owner = btrfs_delayed_ref_owner(ref); + ref_offset = btrfs_delayed_ref_offset(ref); /* * If our ref doesn't match the one we're currently looking at * then we have a cross reference. */ - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || - data_ref->offset != offset) { + if (ref->ref_root != btrfs_root_id(root) || + ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } @@ -2375,11 +2288,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root, return ret; } -static noinline int check_committed_ref(struct btrfs_root *root, +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode: The only inode we expect to find associated with the data extent. + * @path: A path to use for searching the extent tree. + * @offset: The only offset we expect to find associated with the data extent. + * @bytenr: The logical address of the data extent. + * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + * quick here as we're called in write paths (when flushing delalloc and + * in the direct IO write path). For example we can have an extent with + * a single reference but that reference is not inlined, or we may have + * many references in the extent tree but we also have delayed references + * that cancel all the reference except the one for our inode and offset, + * but it would be expensive to do such checks and complex due to all + * locking to avoid races between the checks and flushing delayed refs, + * plus non-inline references may be located on leaves other than the one + * that contains the extent item in the extent tree. The important thing + * here is to not return false negatives and that the false positives are + * not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + * leaf from the extent tree that contains the extent's extent item. + * + * 1 if there are cross references (false positives can happen). + * + * < 0 in case of an error. In case of -ENOENT the leaf in the extent + * tree where the extent item should be located at is read locked and + * accessible in the given path. + */ +static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr, - bool strict) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; @@ -2398,28 +2353,32 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; - BUG_ON(ret == 0); /* Corruption */ + return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + return -EUCLEAN; + } - ret = -ENOENT; if (path->slots[0] == 0) - goto out; + return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; + return -ENOENT; - ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) - goto out; + return 1; /* Check for an owner ref; skip over it to the real inline refs. */ iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2427,57 +2386,69 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) - goto out; - - /* - * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. Use the heuristic if strict is false. - */ - if (!strict && - (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item))) - goto out; + return 1; /* If this extent has SHARED_DATA_REF then it's shared */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) - goto out; + return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || + btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; + return 1; - ret = 0; -out: - return ret; + return 0; } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict, struct btrfs_path *path) +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, + u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, - offset, bytenr, strict); + ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret = check_delayed_ref(root, path, objectid, offset, bytenr); - } while (ret == -EAGAIN); + /* + * The path must have a locked leaf from the extent tree where + * the extent item for our extent is located, in case it exists, + * or where it should be located in case it doesn't exist yet + * because it's new and its delayed ref was not yet flushed. + * We need to lock the delayed ref head at check_delayed_ref(), + * if one exists, while holding the leaf locked in order to not + * race with delayed ref flushing, missing references and + * incorrectly reporting that the extent is not shared. + */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct extent_buffer *leaf = path->nodes[0]; + + ASSERT(leaf != NULL); + btrfs_assert_tree_read_locked(leaf); + + if (ret != -ENOENT) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.objectid == bytenr); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); + } + } + + ret = check_delayed_ref(inode, path, offset, bytenr); + } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); - if (btrfs_is_data_reloc_root(root)) + if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } @@ -2488,14 +2459,11 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int full_backref, int inc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 bytenr; - u64 num_bytes; u64 parent; u64 ref_root; u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct btrfs_ref generic_ref = { 0 }; bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; int action; @@ -2522,6 +2490,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { + .action = action, + .parent = parent, + .ref_root = ref_root, + }; + if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) @@ -2531,35 +2505,33 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) + ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (ref.bytenr == 0) continue; - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.owning_root = ref_root; + key.offset -= btrfs_file_extent_offset(buf, fi); - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent, ref_root); - btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset, root->root_key.objectid, - for_reloc); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = fs_info->nodesize; - /* We don't know the owning_root, use 0. */ - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent, 0); - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, - root->root_key.objectid, for_reloc); + /* We don't know the owning_root, leave as 0. */ + ref.bytenr = btrfs_node_blockptr(buf, i); + ref.num_bytes = fs_info->nodesize; + + btrfs_init_tree_ref(&ref, level - 1, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } @@ -2621,13 +2593,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -2774,14 +2743,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; + int ret = 0; while (start <= end) { + u64 len; + readonly = false; if (!cache || start >= cache->start + cache->length) { @@ -2789,7 +2759,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + if (cache == NULL) { + /* Logic error, something removed the block group. */ + ret = -EUCLEAN; + goto out; + } cluster = fetch_cluster_info(fs_info, cache->space_info, @@ -2823,42 +2797,26 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - space_info->bytes_zone_unusable += len; + btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); + if (!readonly && return_free_space) + btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } if (cache) btrfs_put_block_group(cache); - return 0; +out: + return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -2888,7 +2846,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); + ret = unpin_extent_range(fs_info, start, end, true); + BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); @@ -3088,9 +3047,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -3110,6 +3067,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner_objectid = btrfs_delayed_ref_owner(node); + u64 owner_offset = btrfs_delayed_ref_offset(node); bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); u64 delayed_ref_root = href->owning_root; @@ -3135,7 +3094,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, skinny_metadata = false; ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, - parent, root_objectid, owner_objectid, + node->parent, node->ref_root, owner_objectid, owner_offset); if (ret == 0) { /* @@ -3162,7 +3121,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -3237,7 +3196,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else if (WARN_ON(ret == -ENOENT)) { abort_and_dump(trans, path, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", - bytenr, parent, root_objectid, owner_objectid, + bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); goto out; } else { @@ -3301,7 +3260,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -3395,13 +3353,14 @@ out: static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -3419,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); head->processing = false; spin_unlock(&head->lock); @@ -3429,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -3441,29 +3400,42 @@ out_delayed_unlock: return 0; } -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref) +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_ref generic_ref = { 0 }; struct btrfs_block_group *bg; int ret; - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent, btrfs_header_owner(buf)); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); - if (root_id != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref generic_ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = buf->start, + .num_bytes = buf->len, + .parent = parent, + .owning_root = btrfs_header_owner(buf), + .ref_root = root_id, + }; + + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); - BUG_ON(ret); /* -ENOMEM */ + if (ret < 0) + return ret; } if (!last_ref) - return; + return 0; if (btrfs_header_generation(buf) != trans->transid) goto out; @@ -3520,6 +3492,7 @@ out: * matter anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + return 0; } /* Can return -ENOMEM */ @@ -3535,11 +3508,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ - if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { - btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); + if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -3547,10 +3517,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ret = btrfs_add_delayed_data_ref(trans, ref, 0); } - if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -4685,7 +4652,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); @@ -4860,7 +4827,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4879,16 +4845,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 flags = extent_op->flags_to_set; + const u64 flags = (extent_op ? extent_op->flags_to_set : 0); + /* The owner of a tree block is the level. */ + int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - ref = btrfs_delayed_node_to_tree_ref(node); - extent_key.objectid = node->bytenr; if (skinny_metadata) { - extent_key.offset = ref->level; + /* The owner of a tree block is the level. */ + extent_key.offset = level; extent_key.type = BTRFS_METADATA_ITEM_KEY; } else { extent_key.offset = node->num_bytes; @@ -4921,21 +4887,20 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); - btrfs_set_tree_block_level(leaf, block_info, ref->level); + btrfs_set_tree_block_level(leaf, block_info, level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -4946,19 +4911,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_ref generic_ref = { 0 }; - u64 root_objectid = root->root_key.objectid; - u64 owning_root = root_objectid; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins->objectid, + .num_bytes = ins->offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) - owning_root = root->relocation_src_root; + generic_ref.owning_root = root->relocation_src_root; - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0, owning_root); - btrfs_init_data_ref(&generic_ref, root_objectid, owner, - offset, 0, false); + btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -5081,7 +5047,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, */ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); - __btrfs_tree_lock(buf, nest); + btrfs_tree_lock_nested(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); @@ -5096,7 +5062,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_owner(buf, owner); write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different @@ -5136,8 +5102,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; - struct btrfs_delayed_extent_op *extent_op; - struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; @@ -5180,33 +5144,43 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - extent_op = btrfs_alloc_delayed_extent_op(); - if (!extent_op) { - ret = -ENOMEM; - goto out_free_buf; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .parent = parent, + .owning_root = owning_root, + .ref_root = root_objectid, + }; + + if (!skinny_metadata || flags != 0) { + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = (skinny_metadata ? false : true); + extent_op->update_flags = (flags != 0); + } else { + extent_op = NULL; } - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = skinny_metadata ? false : true; - extent_op->update_flags = true; - extent_op->level = level; - - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent, owning_root); - btrfs_init_tree_ref(&generic_ref, level, root_objectid, - root->root_key.objectid, false); + + btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); - if (ret) - goto out_free_delayed; + if (ret) { + btrfs_free_delayed_extent_op(extent_op); + goto out_free_buf; + } } return buf; -out_free_delayed: - btrfs_free_delayed_extent_op(extent_op); out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); @@ -5231,11 +5205,99 @@ struct walk_control { int reada_slot; int reada_count; int restarted; + /* Indicate that extent info needs to be looked up when walking the tree. */ + int lookup_info; }; +/* + * This is our normal stage. We are traversing blocks the current snapshot owns + * and we are dropping any of our references to any children we are able to, and + * then freeing the block once we've processed all of the children. + */ #define DROP_REFERENCE 1 + +/* + * We enter this stage when we have to walk into a child block (meaning we can't + * simply drop our reference to it from our current parent node) and there are + * more than one reference on it. If we are the owner of any of the children + * blocks from the current parent node then we have to do the FULL_BACKREF dance + * on them in order to drop our normal ref and add the shared ref. + */ #define UPDATE_BACKREF 2 +/* + * Decide if we need to walk down into this node to adjust the references. + * + * @root: the root we are currently deleting + * @wc: the walk control for this deletion + * @eb: the parent eb that we're currently visiting + * @refs: the number of refs for wc->level - 1 + * @flags: the flags for wc->level - 1 + * @slot: the slot in the eb that we're currently checking + * + * This is meant to be called when we're evaluating if a node we point to at + * wc->level should be read and walked into, or if we can simply delete our + * reference to it. We return true if we should walk into the node, false if we + * can skip it. + * + * We have assertions in here to make sure this is called correctly. We assume + * that sanity checking on the blocks read to this point has been done, so any + * corrupted file systems must have been caught before calling this function. + */ +static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, + struct extent_buffer *eb, u64 flags, int slot) +{ + struct btrfs_key key; + u64 generation; + int level = wc->level; + + ASSERT(level > 0); + ASSERT(wc->refs[level - 1] > 0); + + /* + * The update backref stage we only want to skip if we already have + * FULL_BACKREF set, otherwise we need to read. + */ + if (wc->stage == UPDATE_BACKREF) { + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + return true; + } + + /* + * We're the last ref on this block, we must walk into it and process + * any refs it's pointing at. + */ + if (wc->refs[level - 1] == 1) + return true; + + /* + * If we're already FULL_BACKREF then we know we can just drop our + * current reference. + */ + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + + /* + * This block is older than our creation generation, we can drop our + * reference to it. + */ + generation = btrfs_node_ptr_generation(eb, slot); + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) + return false; + + /* + * This block was processed from a previous snapshot deletion run, we + * can skip it. + */ + btrfs_node_key_to_cpu(eb, &key, slot); + if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0) + return false; + + /* All other cases we need to wander into the node. */ + return true; +} + static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct walk_control *wc, @@ -5247,7 +5309,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 refs; u64 flags; u32 nritems; - struct btrfs_key key; struct extent_buffer *eb; int ret; int slot; @@ -5277,7 +5338,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5287,28 +5348,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't care about errors in readahead. */ if (ret < 0) continue; - BUG_ON(refs == 0); - if (wc->stage == DROP_REFERENCE) { - if (refs == 1) - goto reada; + /* + * This could be racey, it's conceivable that we raced and end + * up with a bogus refs count, if that's the case just skip, if + * we are actually corrupt we will notice when we look up + * everything again with our locks. + */ + if (refs == 0) + continue; - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - if (!wc->update_ref || - generation <= root->root_key.offset) - continue; - btrfs_node_key_to_cpu(eb, &key, slot); - ret = btrfs_comp_cpu_keys(&key, - &wc->update_progress); - if (ret < 0) - continue; - } else { - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - } + /* If we don't need to visit this node don't reada. */ + if (!visit_node_for_delete(root, wc, eb, flags, slot)) + continue; reada: btrfs_readahead_node_child(eb, slot); nread++; @@ -5327,7 +5379,7 @@ reada: static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int level = wc->level; @@ -5335,27 +5387,29 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; - if (wc->stage == UPDATE_BACKREF && - btrfs_header_owner(eb) != root->root_key.objectid) + if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) return 1; /* * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if (lookup_info && + if (wc->lookup_info && ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { - BUG_ON(!path->locks[level]); + ASSERT(path->locks[level]); ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); - BUG_ON(ret == -ENOMEM); if (ret) return ret; - BUG_ON(wc->refs[level] == 0); + if (unlikely(wc->refs[level] == 0)) { + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + eb->start); + return -EUCLEAN; + } } if (wc->stage == DROP_REFERENCE) { @@ -5371,13 +5425,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { - BUG_ON(!path->locks[level]); + ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_set_disk_extent_flags(trans, eb, flag); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } wc->flags[level] |= flag; } @@ -5400,23 +5463,188 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; struct btrfs_path *path; struct btrfs_extent_inline_ref *iref; int ret; + bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, - root->root_key.objectid, level, 0); + btrfs_root_id(root), level, 0); + if (ret != -ENOENT) { + /* + * If we get 0 then we found our reference, return 1, else + * return the error if it's not -ENOENT; + */ + btrfs_free_path(path); + return (ret < 0 ) ? ret : 1; + } + + /* + * We could have a delayed ref with this reference, so look it up while + * we're holding the path open to make sure we don't race with the + * delayed ref running. + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); + if (!head) + goto out; + if (!mutex_trylock(&head->mutex)) { + /* + * We're contended, means that the delayed ref is running, get a + * reference and wait for the ref head to be complete and then + * try again. + */ + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto again; + } + + exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); btrfs_free_path(path); - if (ret == -ENOENT) + return exists ? 1 : 0; +} + +/* + * We may not have an uptodate block, so if we are going to walk down into this + * block we need to drop the lock, read it off of the disk, re-lock it and + * return to continue dropping the snapshot. + */ +static int check_next_block_uptodate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, + struct extent_buffer *next) +{ + struct btrfs_tree_parent_check check = { 0 }; + u64 generation; + int level = wc->level; + int ret; + + btrfs_assert_tree_write_locked(next); + + generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); + + if (btrfs_buffer_uptodate(next, generation, 0)) return 0; - if (ret < 0) + + check.level = level - 1; + check.transid = generation; + check.owner_root = btrfs_root_id(root); + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); + + btrfs_tree_unlock(next); + if (level == 1) + reada_walk_down(trans, root, wc, path); + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); return ret; - return 1; + } + btrfs_tree_lock(next); + wc->lookup_info = 1; + return 0; +} + +/* + * If we determine that we don't have to visit wc->level - 1 then we need to + * determine if we can drop our reference. + * + * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. + * + * If we are DROP_REFERENCE this will figure out if we need to drop our current + * reference, skipping it if we dropped it from a previous incompleted drop, or + * dropping it if we still have a reference to it. + */ +static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, struct walk_control *wc, + struct extent_buffer *next, u64 owner_root) +{ + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = next->start, + .num_bytes = root->fs_info->nodesize, + .owning_root = owner_root, + .ref_root = btrfs_root_id(root), + }; + int level = wc->level; + int ret; + + /* We are UPDATE_BACKREF, we're not dropping anything. */ + if (wc->stage == UPDATE_BACKREF) + return 0; + + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + ref.parent = path->nodes[level]->start; + } else { + ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); + if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { + btrfs_err(root->fs_info, "mismatched block owner"); + return -EIO; + } + } + + /* + * If we had a drop_progress we need to verify the refs are set as + * expected. If we find our ref then we know that from here on out + * everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, next->start, ref.parent, + level - 1); + if (ret <= 0) + return ret; + ret = 0; + wc->restarted = 0; + } + + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have already + * accounted them at merge time (replace_path), thus we could skip + * expensive subtree trace here. + */ + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + wc->refs[level - 1] > 1) { + u64 generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + + ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); + if (ret) { + btrfs_err_rl(root->fs_info, +"error %d accounting shared subtree, quota is out of sync, rescan required", + ret); + } + } + + /* + * We need to update the next key in our walk control so we can update + * the drop_progress key accordingly. We don't care if find_next_key + * doesn't find a key because that means we're at the end and are going + * to clean up now. + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + + btrfs_init_tree_ref(&ref, level - 1, 0, false); + return btrfs_free_extent(trans, &ref); } /* @@ -5435,21 +5663,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int *lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; - u64 parent; u64 owner_root = 0; - struct btrfs_tree_parent_check check = { 0 }; - struct btrfs_key key; - struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; - int reada = 0; int ret = 0; - bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -5459,28 +5681,18 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { - *lookup_info = 1; + generation <= btrfs_root_origin_generation(root)) { + wc->lookup_info = 1; return 1; } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - check.level = level - 1; - check.transid = generation; - check.owner_root = root->root_key.objectid; - check.has_first_key = true; - btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, - path->slots[level]); + next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root), + level - 1); + if (IS_ERR(next)) + return PTR_ERR(next); - next = find_extent_buffer(fs_info, bytenr); - if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr, - root->root_key.objectid, level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); - reada = 1; - } btrfs_tree_lock(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, @@ -5491,57 +5703,31 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { - btrfs_err(fs_info, "Missing references."); - ret = -EIO; + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + bytenr); + ret = -EUCLEAN; goto out_unlock; } - *lookup_info = 0; + wc->lookup_info = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->refs[level - 1] > 1) { - need_account = true; - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; - - if (!wc->update_ref || - generation <= root->root_key.offset) - goto skip; - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); - if (ret < 0) - goto skip; + /* If we don't have to walk into this node skip it. */ + if (!visit_node_for_delete(root, wc, path->nodes[level], + wc->flags[level - 1], path->slots[level])) + goto skip; - wc->stage = UPDATE_BACKREF; - wc->shared_level = level - 1; - } - } else { - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; + /* + * We have to walk down into this node, and if we're currently at the + * DROP_REFERNCE stage and this block is shared then we need to switch + * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. + */ + if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; } - if (!btrfs_buffer_uptodate(next, generation, 0)) { - btrfs_tree_unlock(next); - free_extent_buffer(next); - next = NULL; - *lookup_info = 1; - } - - if (!next) { - if (reada && level == 1) - reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, &check); - if (IS_ERR(next)) { - return PTR_ERR(next); - } else if (!extent_buffer_uptodate(next)) { - free_extent_buffer(next); - return -EIO; - } - btrfs_tree_lock(next); - } + ret = check_next_block_uptodate(trans, root, path, wc, next); + if (ret) + return ret; level--; ASSERT(level == btrfs_header_level(next)); @@ -5558,76 +5744,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, wc->reada_slot = 0; return 0; skip: + ret = maybe_drop_reference(trans, root, path, wc, next, owner_root); + if (ret) + goto out_unlock; wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - ASSERT(root->root_key.objectid == - btrfs_header_owner(path->nodes[level])); - if (root->root_key.objectid != - btrfs_header_owner(path->nodes[level])) { - btrfs_err(root->fs_info, - "mismatched block owner"); - ret = -EIO; - goto out_unlock; - } - parent = 0; - } - - /* - * If we had a drop_progress we need to verify the refs are set - * as expected. If we find our ref then we know that from here - * on out everything should be correct, and we can clear the - * ->restarted flag. - */ - if (wc->restarted) { - ret = check_ref_exists(trans, root, bytenr, parent, - level - 1); - if (ret < 0) - goto out_unlock; - if (ret == 0) - goto no_delete; - ret = 0; - wc->restarted = 0; - } - - /* - * Reloc tree doesn't contribute to qgroup numbers, and we have - * already accounted them at merge time (replace_path), - * thus we could skip expensive subtree trace here. - */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - need_account) { - ret = btrfs_qgroup_trace_subtree(trans, next, - generation, level - 1); - if (ret) { - btrfs_err_rl(fs_info, - "Error %d accounting shared subtree. Quota is out of sync, rescan required.", - ret); - } - } - - /* - * We need to update the next key in our walk control so we can - * update the drop_progress key accordingly. We don't care if - * find_next_key doesn't find a key because that means we're at - * the end and are going to clean up now. - */ - wc->drop_level = level; - find_next_key(path, level, &wc->drop_progress); - - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent, owner_root); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, - 0, false); - ret = btrfs_free_extent(trans, &ref); - if (ret) - goto out_unlock; - } -no_delete: - *lookup_info = 1; + wc->lookup_info = 1; ret = 1; out_unlock: @@ -5655,13 +5777,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + int ret = 0; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; if (wc->stage == UPDATE_BACKREF) { - BUG_ON(wc->shared_level < level); + ASSERT(wc->shared_level >= level); if (level < wc->shared_level) goto out; @@ -5679,7 +5801,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, * count is one. */ if (!path->locks[level]) { - BUG_ON(level == 0); + ASSERT(level > 0); btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; @@ -5693,7 +5815,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = 0; return ret; } - BUG_ON(wc->refs[level] == 0); + if (unlikely(wc->refs[level] == 0)) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + eb->start); + return -EUCLEAN; + } if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5703,7 +5830,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } /* wc->stage == DROP_REFERENCE */ - BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + ASSERT(path->locks[level] || wc->refs[level] == 1); if (wc->refs[level] == 1) { if (level == 0) { @@ -5711,8 +5838,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 1); else ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ - if (is_fstree(root->root_key.objectid)) { + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + if (is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -5732,40 +5862,63 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (root->root_key.objectid != btrfs_header_owner(eb)) + else if (btrfs_root_id(root) != btrfs_header_owner(eb)) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (root->root_key.objectid != + else if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level + 1])) goto owner_mismatch; } - btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, - wc->refs[level] == 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); + if (ret < 0) + btrfs_abort_transaction(trans, ret); out: wc->refs[level] = 0; wc->flags[level] = 0; - return 0; + return ret; owner_mismatch: btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", - btrfs_header_owner(eb), root->root_key.objectid); + btrfs_header_owner(eb), btrfs_root_id(root)); return -EUCLEAN; } +/* + * walk_down_tree consists of two steps. + * + * walk_down_proc(). Look up the reference count and reference of our current + * wc->level. At this point path->nodes[wc->level] should be populated and + * uptodate, and in most cases should already be locked. If we are in + * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and + * we can walk back up the tree. If we are UPDATE_BACKREF we have to set + * FULL_BACKREF on this node if it's not already set, and then do the + * FULL_BACKREF conversion dance, which is to drop the root reference and add + * the shared reference to all of this nodes children. + * + * do_walk_down(). This is where we actually start iterating on the children of + * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping + * our reference to the children that return false from visit_node_for_delete(), + * which has various conditions where we know we can just drop our reference + * without visiting the node. For UPDATE_BACKREF we will skip any children that + * visit_node_for_delete() returns false for, only walking down when necessary. + * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of + * snapshot deletion. + */ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { int level = wc->level; - int lookup_info = 1; int ret = 0; + wc->lookup_info = 1; while (level >= 0) { - ret = walk_down_proc(trans, root, path, wc, lookup_info); + ret = walk_down_proc(trans, root, path, wc); if (ret) break; @@ -5776,7 +5929,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[level])) break; - ret = do_walk_down(trans, root, path, wc, &lookup_info); + ret = do_walk_down(trans, root, path, wc); if (ret > 0) { path->slots[level]++; continue; @@ -5787,6 +5940,23 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, return (ret == 1) ? 0 : ret; } +/* + * walk_up_tree() is responsible for making sure we visit every slot on our + * current node, and if we're at the end of that node then we call + * walk_up_proc() on our current node which will do one of a few things based on + * our stage. + * + * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level + * then we need to walk back up the tree, and then going back down into the + * other slots via walk_down_tree to update any other children from our original + * wc->shared_level. Once we're at or above our wc->shared_level we can switch + * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on. + * + * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. + * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents + * in our current leaf. After that we call btrfs_free_tree_block() on the + * current node and walk up to the next node to walk down the next slot. + */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -5837,8 +6007,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { - const bool is_reloc_root = (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID); + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5846,24 +6015,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) struct btrfs_root_item *root_item = &root->root_item; struct walk_control *wc; struct btrfs_key key; - int err = 0; - int ret; + const u64 rootid = btrfs_root_id(root); + int ret = 0; int level; bool root_dropped = false; bool unfinished_drop = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { btrfs_free_path(path); - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -5876,12 +6045,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } - err = btrfs_run_delayed_items(trans); - if (err) + ret = btrfs_run_delayed_items(trans); + if (ret) goto out_end_trans; /* @@ -5912,11 +6081,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->lowest_level = 0; - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end_trans; - } + WARN_ON(ret > 0); + ret = 0; /* * unlock our path, this is safe because only this @@ -5929,14 +6098,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_tree_lock(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK; + /* + * btrfs_lookup_extent_info() returns 0 for success, + * or < 0 for error. + */ ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end_trans; - } + BUG_ON(wc->refs[level] == 0); if (level == btrfs_root_drop_level(root_item)) @@ -5962,19 +6134,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; break; } if (ret > 0) { BUG_ON(wc->stage != DROP_REFERENCE); + ret = 0; break; } @@ -5996,7 +6167,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) root_item); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } @@ -6007,7 +6177,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, "drop snapshot early exit"); - err = -EAGAIN; + ret = -EAGAIN; goto out_free; } @@ -6021,19 +6191,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } } } btrfs_release_path(path); - if (err) + if (ret) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } @@ -6042,16 +6211,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } else if (ret > 0) { - /* if we fail to delete the orphan item this time + ret = 0; + /* + * If we fail to delete the orphan item this time * around, it'll get picked up the next time. * * The most common failure here is just -ENOENT. */ - btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); + btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); } } @@ -6077,11 +6246,19 @@ out_free: kfree(wc); btrfs_free_path(path); out: + if (!ret && root_dropped) { + ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid); + if (ret < 0) + btrfs_warn_rl(fs_info, + "failed to cleanup qgroup 0/%llu: %d", + rootid, ret); + ret = 0; + } /* * We were an unfinished drop root, check to see if there are any * pending, and if not clear and wake up any waiters. */ - if (!err && unfinished_drop) + if (!ret && unfinished_drop) btrfs_maybe_wake_unfinished_drop(fs_info); /* @@ -6093,7 +6270,7 @@ out: */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); - return err; + return ret; } /* @@ -6113,9 +6290,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, int level; int parent_level; int ret = 0; - int wret; - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); if (!path) @@ -6149,17 +6325,16 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { - wret = walk_down_tree(trans, root, path, wc); - if (wret < 0) { - ret = wret; + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) break; - } - wret = walk_up_tree(trans, root, path, wc, parent_level); - if (wret < 0) - ret = wret; - if (wret != 0) + ret = walk_up_tree(trans, root, path, wc, parent_level); + if (ret) { + if (ret > 0) + ret = 0; break; + } } kfree(wc); @@ -6167,10 +6342,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end) +/* + * Unpin the extent range in an error context and don't add the space back. + * Errors are not propagated further. + */ +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - return unpin_extent_range(fs_info, start, end, false); + unpin_extent_range(fs_info, start, end, false); } /* @@ -6271,7 +6449,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) start += len; *trimmed += bytes; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } @@ -6363,13 +6541,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); + + trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } - - trimmed += group_trimmed; } mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2e066035ccee..cfa52264f678 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,11 +3,21 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include <linux/types.h> #include "misc.h" #include "block-group.h" +#include "locking.h" +struct extent_buffer; struct btrfs_free_cluster; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_path; +struct btrfs_ref; +struct btrfs_disk_key; struct btrfs_delayed_ref_head; +struct btrfs_delayed_ref_root; +struct btrfs_extent_inline_ref; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, @@ -106,8 +116,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -117,10 +126,10 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 empty_size, u64 reloc_src_root, enum btrfs_lock_nesting nest); -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref); +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, @@ -153,5 +162,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8b4bef05e222..b2fae67f8fa3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,7 +14,6 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/fsverity.h> -#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -22,7 +21,6 @@ #include "btrfs_inode.h" #include "bio.h" #include "locking.h" -#include "rcu-string.h" #include "backref.h" #include "disk-io.h" #include "subpage.h" @@ -78,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); pr_err( - "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_buffer_cache, eb); } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); @@ -102,6 +101,13 @@ struct btrfs_bio_ctrl { blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; + + /* + * The sectors of the page which are going to be submitted by + * extent_writepage_io(). + * This is to avoid touching ranges covered by compression/inline. + */ + unsigned long submit_bitmap; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -118,7 +124,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -147,8 +153,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", - sizeof(struct extent_buffer), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_buffer), 0, 0, + NULL); if (!extent_buffer_cache) return -ENOMEM; @@ -165,26 +171,10 @@ void __cold extent_buffer_free_cachep(void) kmem_cache_destroy(extent_buffer_cache); } -void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static void process_one_folio(struct btrfs_fs_info *fs_info, + struct folio *folio, const struct folio *locked_folio, + unsigned long page_ops, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - clear_page_dirty_for_io(page); - put_page(page); - index++; - } -} - -static void process_one_page(struct btrfs_fs_info *fs_info, - struct page *page, struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) -{ - struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); @@ -199,18 +189,17 @@ static void process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); - if (page != locked_page && (page_ops & PAGE_UNLOCK)) - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) + btrfs_folio_end_lock(fs_info, folio, start, len); } -static void __process_pages_contig(struct address_space *mapping, - struct page *locked_page, u64 start, u64 end, - unsigned long page_ops) +static void __process_folios_contig(struct address_space *mapping, + const struct folio *locked_folio, u64 start, + u64 end, unsigned long page_ops) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); - pgoff_t start_index = start >> PAGE_SHIFT; + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; struct folio_batch fbatch; int i; @@ -223,43 +212,41 @@ static void __process_pages_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, locked_page, - page_ops, start, end); + process_one_folio(fs_info, folio, locked_folio, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); } } -static noinline void __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, +static noinline void unlock_delalloc_folio(const struct inode *inode, + const struct folio *locked_folio, u64 start, u64 end) { unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_page); - if (index == locked_page->index && end_index == index) + ASSERT(locked_folio); + if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK); + __process_folios_contig(inode->i_mapping, locked_folio, start, end, + PAGE_UNLOCK); } -static noinline int lock_delalloc_pages(struct inode *inode, - struct page *locked_page, - u64 start, - u64 end) +static noinline int lock_delalloc_folios(struct inode *inode, + const struct folio *locked_folio, + u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_page->index && index == end_index) + if (index == locked_folio->index && index == end_index) return 0; folio_batch_init(&fbatch); @@ -273,23 +260,23 @@ static noinline int lock_delalloc_pages(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - struct page *page = folio_page(folio, 0); - u32 len = end + 1 - start; + u64 range_start; + u32 range_len; - if (page == locked_page) + if (folio == locked_folio) continue; - if (btrfs_folio_start_writer_lock(fs_info, folio, start, - len)) - goto out; - - if (!PageDirty(page) || page->mapping != mapping) { - btrfs_folio_end_writer_lock(fs_info, folio, start, - len); + folio_lock(folio); + if (!folio_test_dirty(folio) || folio->mapping != mapping) { + folio_unlock(folio); goto out; } + range_start = max_t(u64, folio_pos(folio), start); + range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + end + 1) - range_start; + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); - processed_end = page_offset(page) + PAGE_SIZE - 1; + processed_end = range_start + range_len - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -299,7 +286,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_page, start, processed_end); + unlock_delalloc_folio(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -320,10 +307,10 @@ out: */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, - u64 *end) + struct folio *locked_folio, + u64 *start, u64 *end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; @@ -339,9 +326,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* Caller should pass a valid @end to indicate the search range end */ ASSERT(orig_end > orig_start); - /* The range should at least cover part of the page */ - ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || - orig_end <= page_offset(locked_page))); + /* The range should at least cover part of the folio */ + ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; @@ -358,25 +345,25 @@ again: } /* - * start comes from the offset of locked_page. We have to lock - * pages in order, so we can't process delalloc bytes before - * locked_page + * start comes from the offset of locked_folio. We have to lock + * folios in order, so we can't process delalloc bytes before + * locked_folio */ if (delalloc_start < *start) delalloc_start = *start; /* - * make sure to limit the number of pages we try to lock down + * make sure to limit the number of folios we try to lock down */ if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the pages after the page that has start */ - ret = lock_delalloc_pages(inode, locked_page, - delalloc_start, delalloc_end); + /* step two, lock all the folioss after the folios that has start */ + ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, + delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the pages are gone, lets avoid looping by + /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); @@ -397,15 +384,14 @@ again: /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, EXTENT_DELALLOC, cached_state); + + unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - unlock_extent(tree, delalloc_start, delalloc_end, - &cached_state); - __unlock_for_delalloc(inode, locked_page, - delalloc_start, delalloc_end); + unlock_delalloc_folio(inode, locked_folio, delalloc_start, + delalloc_end); cond_resched(); goto again; } - free_extent_state(cached_state); *start = delalloc_start; *end = delalloc_end; out_failed: @@ -413,41 +399,43 @@ out_failed: } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - struct page *locked_page, + const struct folio *locked_folio, + struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, + end, page_ops); } -static bool btrfs_verify_page(struct page *page, u64 start) +static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) { - if (!fsverity_active(page->mapping->host) || - PageUptodate(page) || - start >= i_size_read(page->mapping->host)) + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + + if (!fsverity_active(folio->mapping->host) || + btrfs_folio_test_uptodate(fs_info, folio, start, len) || + start >= i_size_read(folio->mapping->host)) return true; - return fsverity_verify_page(page); + return fsverity_verify_folio(folio); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); - if (uptodate && btrfs_verify_page(page, start)) + if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page->mapping)) - unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) + folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* @@ -462,16 +450,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) */ static void end_bbio_data_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; @@ -488,8 +475,8 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) "incomplete page write with offset %zu and length %zu", fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, - folio_page(folio, 0), start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, + !error); if (error) mapping_set_error(folio->mapping, error); btrfs_folio_clear_writeback(fs_info, folio, start, len); @@ -498,85 +485,14 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) bio_put(bio); } -/* - * Record previously processed extent range - * - * For endio_readpage_release_extent() to handle a full extent range, reducing - * the extent io operations. - */ -struct processed_extent { - struct btrfs_inode *inode; - /* Start of the range in @inode */ - u64 start; - /* End of the range in @inode */ - u64 end; - bool uptodate; -}; - -/* - * Try to release processed extent range - * - * May not release the extent range right now if the current range is - * contiguous to processed extent. - * - * Will release processed extent when any of @inode, @uptodate, the range is - * no longer contiguous to the processed range. - * - * Passing @inode == NULL will force processed extent to be released. - */ -static void endio_readpage_release_extent(struct processed_extent *processed, - struct btrfs_inode *inode, u64 start, u64 end, - bool uptodate) +static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { - struct extent_state *cached = NULL; - struct extent_io_tree *tree; - - /* The first extent, initialize @processed */ - if (!processed->inode) - goto update; - - /* - * Contiguous to processed extent, just uptodate the end. - * - * Several things to notice: - * - * - bio can be merged as long as on-disk bytenr is contiguous - * This means we can have page belonging to other inodes, thus need to - * check if the inode still matches. - * - bvec can contain range beyond current page for multi-page bvec - * Thus we need to do processed->end + 1 >= start check - */ - if (processed->inode == inode && processed->uptodate == uptodate && - processed->end + 1 >= start && end >= processed->end) { - processed->end = end; - return; - } - - tree = &processed->inode->io_tree; - /* - * Now we don't have range contiguous to the processed range, release - * the processed range now. - */ - unlock_extent(tree, processed->start, processed->end, &cached); - -update: - /* Update processed to current range */ - processed->inode = inode; - processed->start = start; - processed->end = end; - processed->uptodate = uptodate; -} - -static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) -{ - struct folio *folio = page_folio(page); - ASSERT(folio_test_locked(folio)); if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -593,28 +509,20 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; - struct processed_extent processed = { 0 }; struct folio_iter fi; - /* - * The offset to the beginning of a bio, since one bio can never be - * larger than UINT_MAX, u32 here is enough. - */ - u32 bio_offset = 0; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; - /* For now only order 0 folios are supported for data. */ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -642,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -651,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -664,59 +573,70 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_page_read(folio_page(folio, 0), uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - + end_folio_read(folio, uptodate, start, len); } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); bio_put(bio); } /* - * Populate every free slot in a provided array with pages. + * Populate every free slot in a provided array with folios using GFP_NOFS. + * + * @nr_folios: number of folios to allocate + * @folio_array: the array to fill with folios; any existing non-NULL entries in + * the array will be skipped + * + * Return: 0 if all folios were able to be allocated; + * -ENOMEM otherwise, the partially allocated folios would be freed and + * the array slots zeroed + */ +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) +{ + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + continue; + folio_array[i] = folio_alloc(GFP_NOFS, 0); + if (!folio_array[i]) + goto error; + } + return 0; +error: + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + folio_put(folio_array[i]); + } + return -ENOMEM; +} + +/* + * Populate every free slot in a provided array with pages, using GFP_NOFS. * * @nr_pages: number of pages to allocate * @page_array: the array to fill with pages; any existing non-null entries in - * the array will be skipped - * @extra_gfp: the extra GFP flags for the allocation. + * the array will be skipped + * @nofail: whether using __GFP_NOFAIL flag * * Return: 0 if all pages were able to be allocated; * -ENOMEM otherwise, the partially allocated pages would be freed and * the array slots zeroed */ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, - gfp_t extra_gfp) + bool nofail) { + const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS; unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, - nr_pages, page_array); - - if (allocated == nr_pages) - return 0; - - /* - * During this iteration, no page could be allocated, even - * though alloc_pages_bulk_array() falls back to alloc_page() - * if it could not bulk-allocate. So we must be out of memory. - */ - if (allocated == last) { + allocated = alloc_pages_bulk(gfp, nr_pages, page_array); + if (unlikely(allocated == last)) { + /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { __free_page(page_array[i]); page_array[i] = NULL; } return -ENOMEM; } - - memalloc_retry_wait(GFP_NOFS); } return 0; } @@ -726,28 +646,31 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, * * For now, the folios populated are always in order 0 (aka, single page). */ -static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) +static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) { struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; int num_pages = num_extent_pages(eb); int ret; - ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp); + ret = btrfs_alloc_page_array(num_pages, page_array, nofail); if (ret < 0) return ret; for (int i = 0; i < num_pages; i++) eb->folios[i] = page_folio(page_array[i]); + eb->folio_size = PAGE_SIZE; + eb->folio_shift = PAGE_SHIFT; return 0; } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + struct folio *folio, u64 disk_bytenr, unsigned int pg_offset) { struct bio *bio = &bio_ctrl->bbio->bio; struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -760,7 +683,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, /* * The contig check requires the following conditions to be met: * - * 1) The pages are belonging to the same inode + * 1) The folios are belonging to the same inode * This is implied by the call chain. * * 2) The range has adjacent logical bytenr @@ -769,8 +692,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, * This is required for the usage of btrfs_bio->file_offset. */ return bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == - page_offset(page) + pg_offset; + folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == + folio_pos(folio) + pg_offset; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -783,6 +706,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; bbio->inode = inode; bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; @@ -823,17 +747,17 @@ static void alloc_new_bio(struct btrfs_inode *inode, * The mirror number for this IO should already be initizlied in * @bio_ctrl->mirror_num. */ -static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, +static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct folio *folio, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(folio); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); do { @@ -842,31 +766,32 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, /* Allocate new bio if needed */ if (!bio_ctrl->bbio) { alloc_new_bio(inode, bio_ctrl, disk_bytenr, - page_offset(page) + pg_offset); + folio_pos(folio) + pg_offset); } /* Cap to the current ordered extent boundary if there is one. */ if (len > bio_ctrl->len_to_oe_boundary) { ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); - ASSERT(is_data_inode(&inode->vfs_inode)); + ASSERT(is_data_inode(inode)); len = bio_ctrl->len_to_oe_boundary; } - if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); continue; } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, + len); size -= len; pg_offset += len; disk_bytenr += len; /* - * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * len_to_oe_boundary defaults to U32_MAX, which isn't folio or * sector aligned. alloc_new_bio() then sets it to the end of * our ordered extent for writes into zoned devices. * @@ -876,15 +801,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, * boundary is correct. * * When len_to_oe_boundary is U32_MAX, the cap above would - * result in a 4095 byte IO for the last page right before - * we hit the bio limit of UINT_MAX. bio_add_page() has all + * result in a 4095 byte IO for the last folio right before + * we hit the bio limit of UINT_MAX. bio_add_folio() has all * the checks required to make sure we don't overflow the bio, * and we should just ignore len_to_oe_boundary completely * unless we're using it to track an ordered extent. * * It's pretty hard to make a bio sized U32_MAX, but it can * happen when the page cache is able to feed us contiguous - * pages for large extents. + * folios for large extents. */ if (bio_ctrl->len_to_oe_boundary != U32_MAX) bio_ctrl->len_to_oe_boundary -= len; @@ -934,49 +859,49 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, return ret; } -int set_page_extent_mapped(struct page *page) +int set_folio_extent_mapped(struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (folio_test_private(folio)) return 0; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, page->mapping)) + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; } -void clear_page_extent_mapped(struct page *page) +void clear_folio_extent_mapped(struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (!folio_test_private(folio)) return; - fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page->mapping)) + fs_info = folio_to_fs_info(folio); + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); } -static struct extent_map * -__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, struct extent_map **em_cached) +static struct extent_map *get_extent_map(struct btrfs_inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; - if (em_cached && *em_cached) { + ASSERT(em_cached); + + if (*em_cached) { em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { @@ -988,12 +913,13 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR(em)) { + em = btrfs_get_extent(inode, folio, start, len); + if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } + return em; } /* @@ -1003,12 +929,12 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 start = page_offset(page); + struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + u64 start = folio_pos(folio); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; @@ -1018,26 +944,24 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int ret = 0; size_t pg_offset = 0; size_t iosize; - size_t blocksize = inode->i_sb->s_blocksize; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + size_t blocksize = fs_info->sectorsize; - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_extent(tree, start, end, NULL); - unlock_page(page); + folio_unlock(folio); return ret; } - if (page->index == last_byte >> PAGE_SHIFT) { - size_t zero_offset = offset_in_page(last_byte); + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { + size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { - iosize = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, iosize); + iosize = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, iosize); } } bio_ctrl->end_io_func = end_bbio_data_read; - begin_page_read(fs_info, page); + begin_folio_read(fs_info, folio); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; @@ -1045,17 +969,14 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = PAGE_SIZE - pg_offset; - memzero_page(page, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + iosize = folio_size(folio) - pg_offset; + folio_zero_range(folio, pg_offset, iosize); + end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, em_cached); + em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end, NULL); - end_page_read(page, false, cur, end + 1 - cur); + end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; @@ -1067,10 +988,10 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = min(extent_map_end(em) - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) - disk_bytenr = em->block_start; + disk_bytenr = em->disk_bytenr; else - disk_bytenr = em->block_start + extent_offset; - block_start = em->block_start; + disk_bytenr = extent_map_block_start(em) + extent_offset; + block_start = extent_map_block_start(em); if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; @@ -1080,8 +1001,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a - * single bio to populate the pages for the 2 ranges because - * this makes the compressed extent read zero out the pages + * single bio to populate the folios for the 2 ranges because + * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout @@ -1094,13 +1015,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, - * it will decompress extent X into the pages belonging to the + * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining - * pages that belong to the other range that points to extent X. + * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently - * make the compressed bio endio callback populate the pages + * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the @@ -1121,18 +1042,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - memzero_page(page, pg_offset, iosize); + folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } - /* the get_extent function already copied into the page */ + /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -1145,8 +1064,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1156,16 +1075,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + + free_extent_map(em_cached); - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); /* * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. @@ -1174,60 +1097,231 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } -static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - u64 *prev_em_start) +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) { - struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); - int index; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; - for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - prev_em_start); - put_page(pages[index]); - } + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; } /* - * helper for __extent_writepage, doing all of the delayed allocation setup. + * Do all of the delayed allocation setup. + * + * Return >0 if all the dirty blocks are submitted async (compression) or inlined. + * The @folio should no longer be touched (treat it as already unlocked). * - * This returns 1 if btrfs_run_delalloc_range function did all the work required - * to write the page (copy into inline extent). In this case the IO has - * been started and the page is already unlocked. + * Return 0 if there is still dirty block that needs to be submitted through + * extent_writepage_io(). + * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be + * submitted, and @folio is still kept locked. * - * This returns 0 if all went well (page still locked) - * This returns < 0 if there were errors (page still locked) + * Return <0 if there is any error hit. + * Any allocated ordered extent range covering this folio will be marked + * finished (IOERR), and @folio is still kept locked. */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc) + struct folio *folio, + struct btrfs_bio_ctrl *bio_ctrl) { - const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; + struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); + struct writeback_control *wbc = bio_ctrl->wbc; + const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const u64 page_start = folio_pos(folio); + const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; + /* + * Save the last found delalloc end. As the delalloc end can go beyond + * page boundary, thus we cannot rely on subpage bitmap to locate the + * last delalloc end. + */ + u64 last_delalloc_end = 0; + /* + * The range end (exclusive) of the last successfully finished delalloc + * range. + * Any range covered by ordered extent must either be manually marked + * finished (error handling), or has IO submitted (and finish the + * ordered extent normally). + * + * This records the end of ordered extent cleanup if we hit an error. + */ + u64 last_finished_delalloc_end = page_start; u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; + + /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->sectors_per_page > 1); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); + } else { + bio_ctrl->submit_bitmap = 1; + } + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + } + + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, page, + if (!find_lock_delalloc_range(&inode->vfs_inode, folio, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; } - - ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, wbc); - if (ret < 0) - return ret; - + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); + last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } + delalloc_start = page_start; + if (!last_delalloc_end) + goto out; + + /* Run the delalloc ranges for the above locked ranges. */ + while (delalloc_start < page_end) { + u64 found_start; + u32 found_len; + bool found; + + if (!is_subpage) { + /* + * For non-subpage case, the found delalloc range must + * cover this folio and there must be only one locked + * delalloc range. + */ + found_start = page_start; + found_len = last_delalloc_end + 1 - found_start; + found = true; + } else { + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, + delalloc_start, &found_start, &found_len); + } + if (!found) + break; + /* + * The subpage range covers the last sector, the delalloc range may + * end beyond the folio boundary, use the saved delalloc_end + * instead. + */ + if (found_start + found_len >= page_end) + found_len = last_delalloc_end + 1 - found_start; + + if (ret >= 0) { + /* + * Some delalloc range may be created by previous folios. + * Thus we still need to clean up this range during error + * handling. + */ + last_finished_delalloc_end = found_start; + /* No errors hit so far, run the current delalloc range. */ + ret = btrfs_run_delalloc_range(inode, folio, + found_start, + found_start + found_len - 1, + wbc); + if (ret >= 0) + last_finished_delalloc_end = found_start + found_len; + if (unlikely(ret < 0)) + btrfs_err_rl(fs_info, +"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", + btrfs_root_id(inode->root), + btrfs_ino(inode), + folio_pos(folio), + fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, + found_start, found_len, ret); + } else { + /* + * We've hit an error during previous delalloc range, + * have to cleanup the remaining locked ranges. + */ + unlock_extent(&inode->io_tree, found_start, + found_start + found_len - 1, NULL); + unlock_delalloc_folio(&inode->vfs_inode, folio, + found_start, + found_start + found_len - 1); + } + + /* + * We have some ranges that's going to be submitted asynchronously + * (compression or inline). These range have their own control + * on when to unlock the pages. We should not touch them + * anymore, so clear the range from the submission bitmap. + */ + if (ret > 0) { + unsigned int start_bit = (found_start - page_start) >> + fs_info->sectorsize_bits; + unsigned int end_bit = (min(page_end + 1, found_start + found_len) - + page_start) >> fs_info->sectorsize_bits; + bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); + } + /* + * Above btrfs_run_delalloc_range() may have unlocked the folio, + * thus for the last range, we cannot touch the folio anymore. + */ + if (found_start + found_len >= last_delalloc_end + 1) + break; + + delalloc_start = found_start + found_len; + } + /* + * It's possible we had some ordered extents created before we hit + * an error, cleanup non-async successfully created delalloc ranges. + */ + if (unlikely(ret < 0)) { + unsigned int bitmap_size = min( + (last_finished_delalloc_end - page_start) >> + fs_info->sectorsize_bits, + fs_info->sectors_per_page); + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) + btrfs_mark_ordered_io_finished(inode, folio, + page_start + (bit << fs_info->sectorsize_bits), + fs_info->sectorsize, false); + return ret; + } +out: + if (last_delalloc_end) + delalloc_end = last_delalloc_end; + else + delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so * we don't subtract one from PAGE_SIZE @@ -1236,10 +1330,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); /* - * If btrfs_run_dealloc_range() already started I/O and unlocked - * the pages, we just need to account for them here. + * If all ranges are submitted asynchronously, we just need to account + * for them here. */ - if (ret == 1) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1257,179 +1351,165 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } /* - * Find the first byte we need to write. - * - * For subpage, one page can contain several sectors, and - * __extent_writepage_io() will just grab all extent maps in the page - * range and try to submit all non-inline/non-compressed extents. - * - * This is a big problem for subpage, we shouldn't re-submit already written - * data at all. - * This function will lookup subpage dirty bit to find which range we really - * need to submit. + * Return 0 if we have submitted or queued the sector for submission. + * Return <0 for critical errors. * - * Return the next dirty range in [@start, @end). - * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ -static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, - struct page *page, u64 *start, u64 *end) +static int submit_one_sector(struct btrfs_inode *inode, + struct folio *folio, + u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { - struct folio *folio = page_folio(page); - struct btrfs_subpage *subpage = folio_get_private(folio); - struct btrfs_subpage_info *spi = fs_info->subpage_info; - u64 orig_start = *start; - /* Declare as unsigned long so we can use bitmap ops */ - unsigned long flags; - int range_start_bit; - int range_end_bit; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map *em; + u64 block_start; + u64 disk_bytenr; + u64 extent_offset; + u64 em_end; + const u32 sectorsize = fs_info->sectorsize; - /* - * For regular sector size == page size case, since one page only - * contains one sector, we return the page offset directly. - */ - if (!btrfs_is_subpage(fs_info, page->mapping)) { - *start = page_offset(page); - *end = page_offset(page) + PAGE_SIZE; - return; - } + ASSERT(IS_ALIGNED(filepos, sectorsize)); - range_start_bit = spi->dirty_offset + - (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + /* @filepos >= i_size case should be handled by the caller. */ + ASSERT(filepos < i_size); - /* We should have the page locked, but just in case */ - spin_lock_irqsave(&subpage->lock, flags); - bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, - spi->dirty_offset + spi->bitmap_nr_bits); - spin_unlock_irqrestore(&subpage->lock, flags); + em = btrfs_get_extent(inode, NULL, filepos, sectorsize); + if (IS_ERR(em)) + return PTR_ERR(em); - range_start_bit -= spi->dirty_offset; - range_end_bit -= spi->dirty_offset; + extent_offset = filepos - em->start; + em_end = extent_map_end(em); + ASSERT(filepos <= em_end); + ASSERT(IS_ALIGNED(em->start, sectorsize)); + ASSERT(IS_ALIGNED(em->len, sectorsize)); - *start = page_offset(page) + range_start_bit * fs_info->sectorsize; - *end = page_offset(page) + range_end_bit * fs_info->sectorsize; + block_start = extent_map_block_start(em); + disk_bytenr = extent_map_block_start(em) + extent_offset; + + ASSERT(!extent_map_is_compressed(em)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + + free_extent_map(em); + em = NULL; + + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * a folio for a range already written to disk. + */ + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); + /* + * Above call should set the whole folio with writeback flag, even + * just for a single subpage sector. + * As long as the folio is properly locked and the range is correct, + * we should always get the folio with writeback flag. + */ + ASSERT(folio_test_writeback(folio)); + + submit_extent_folio(bio_ctrl, disk_bytenr, folio, + sectorsize, filepos - folio_pos(folio)); + return 0; } /* - * helper for __extent_writepage. This calls the writepage start hooks, + * Helper for extent_writepage(). This calls the writepage start hooks, * and does the loop to map the page into extents and bios. * * We return 1 if the IO is started and the page is unlocked, * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - struct page *page, - struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size, - int *nr_ret) +static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, + struct folio *folio, + u64 start, u32 len, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 cur = page_offset(page); - u64 end = cur + PAGE_SIZE - 1; - u64 extent_offset; - u64 block_start; - struct extent_map *em; + unsigned long range_bitmap = 0; + bool submitted_io = false; + bool error = false; + const u64 folio_start = folio_pos(folio); + u64 cur; + int bit; int ret = 0; - int nr = 0; - ret = btrfs_writepage_cow_fixup(page); + ASSERT(start >= folio_start && + start + len <= folio_start + folio_size(folio)); + + ret = btrfs_writepage_cow_fixup(folio); if (ret) { /* Fixup worker will requeue */ - redirty_page_for_writepage(bio_ctrl->wbc, page); - unlock_page(page); + folio_redirty_for_writepage(bio_ctrl->wbc, folio); + folio_unlock(folio); return 1; } + for (cur = start; cur < start + len; cur += fs_info->sectorsize) + set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); + bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, + fs_info->sectors_per_page); + bio_ctrl->end_io_func = end_bbio_data_write; - while (cur <= end) { - u32 len = end - cur + 1; - u64 disk_bytenr; - u64 em_end; - u64 dirty_range_start = cur; - u64 dirty_range_end; - u32 iosize; + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, page, cur, len, - true); + btrfs_mark_ordered_io_finished(inode, folio, cur, + start + len - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. * But we still need to clear the dirty subpage bit, or - * the next time the page gets dirtied, we will try to + * the next time the folio gets dirtied, we will try to * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); + btrfs_folio_clear_dirty(fs_info, folio, cur, + start + len - cur); break; } - - find_next_dirty_byte(fs_info, page, &dirty_range_start, - &dirty_range_end); - if (cur < dirty_range_start) { - cur = dirty_range_start; + ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); + if (unlikely(ret < 0)) { + /* + * bio_ctrl may contain a bio crossing several folios. + * Submit it immediately so that the bio has a chance + * to finish normally, other than marked as error. + */ + submit_one_bio(bio_ctrl); + /* + * Failed to grab the extent map which should be very rare. + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, cur, + fs_info->sectorsize, false); + error = true; continue; } - - em = btrfs_get_extent(inode, NULL, 0, cur, len); - if (IS_ERR(em)) { - ret = PTR_ERR_OR_ZERO(em); - goto out_error; - } - - extent_offset = cur - em->start; - em_end = extent_map_end(em); - ASSERT(cur <= em_end); - ASSERT(cur < end); - ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); - ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); - - block_start = em->block_start; - disk_bytenr = em->block_start + extent_offset; - - ASSERT(!extent_map_is_compressed(em)); - ASSERT(block_start != EXTENT_MAP_HOLE); - ASSERT(block_start != EXTENT_MAP_INLINE); - - /* - * Note that em_end from extent_map_end() and dirty_range_end from - * find_next_dirty_byte() are all exclusive - */ - iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - free_extent_map(em); - em = NULL; - - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(inode->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } - - /* - * Although the PageDirty bit is cleared before entering this - * function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * page for range already written to disk. - */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); - - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - cur - page_offset(page)); - cur += iosize; - nr++; + submitted_io = true; } - btrfs_folio_assert_not_dirty(fs_info, page_folio(page)); - *nr_ret = nr; - return 0; - -out_error: /* - * If we finish without problem, we should not only clear page dirty, - * but also empty subpage dirty bits + * If we didn't submitted any sector (>= i_size), folio dirty get + * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared + * by folio_start_writeback() if the folio is not dirty). + * + * Here we set writeback and clear for the range. If the full folio + * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. + * + * If we hit any error, the corresponding sector will still be dirty + * thus no need to clear PAGECACHE_TAG_DIRTY. */ - *nr_ret = nr; + if (!submitted_io && !error) { + btrfs_folio_set_writeback(fs_info, folio, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); + } return ret; } @@ -1442,70 +1522,70 @@ out_error: * Return 0 if everything goes well. * Return <0 for error. */ -static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; - const u64 page_start = page_offset(page); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; - int nr = 0; size_t pg_offset; - loff_t i_size = i_size_read(inode); + loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(page, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); - WARN_ON(!PageLocked(page)); + WARN_ON(!folio_test_locked(folio)); - pg_offset = offset_in_page(i_size); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { + pg_offset = offset_in_folio(folio, i_size); + if (folio->index > end_index || + (folio->index == end_index && !pg_offset)) { folio_invalidate(folio, 0, folio_size(folio)); folio_unlock(folio); return 0; } - if (page->index == end_index) - memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); + if (folio->index == end_index) + folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); - ret = set_page_extent_mapped(page); + /* + * Default to unlock the whole folio. + * The proper bitmap can only be initialized until writepage_delalloc(). + */ + bio_ctrl->submit_bitmap = (unsigned long)-1; + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + ret = writepage_delalloc(inode, folio, bio_ctrl); if (ret == 1) return 0; if (ret) goto done; - ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); + ret = extent_writepage_io(inode, folio, folio_pos(folio), + PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; + if (ret < 0) + btrfs_err_rl(fs_info, +"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", + btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(folio), fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - set_page_writeback(page); - end_page_writeback(page); - } - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, - PAGE_SIZE, !ret); - mapping_set_error(page->mapping, ret); - } - unlock_page(page); + if (ret < 0) + mapping_set_error(folio->mapping, ret); + /* + * Only unlock ranges that are submitted. As there can be some async + * submitted ranges inside the folio. + */ + btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } -void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, - TASK_UNINTERRUPTIBLE); -} - /* * Lock extent buffer status and pages for writeback. * @@ -1581,7 +1661,7 @@ static void set_btree_ioerr(struct extent_buffer *eb) * can be no longer dirty nor marked anymore for writeback (if a * subsequent modification to the extent buffer didn't happen before the * transaction commit), which makes filemap_fdata[write|wait]_range not - * able to find the pages tagged with SetPageError at transaction + * able to find the pages which contain errors at transaction * commit time. So if this happens we must abort the transaction, * otherwise we commit a super block with btree roots that point to * btree nodes/leafs whose content on disk is invalid - either garbage @@ -1627,7 +1707,7 @@ static void set_btree_ioerr(struct extent_buffer *eb) * context. */ static struct extent_buffer *find_extent_buffer_nolock( - struct btrfs_fs_info *fs_info, u64 start) + const struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; @@ -1646,11 +1726,10 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; - bool uptodate = !bbio->bio.bi_status; struct folio_iter fi; u32 bio_offset = 0; - if (!uptodate) + if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { @@ -1727,7 +1806,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, ret = bio_add_folio(&bbio->bio, folio, eb->len, eb->start - folio_pos(folio)); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + wbc_account_cgroup_owner(wbc, folio, eb->len); folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -1739,15 +1818,14 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - folio_size(folio)); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } /* @@ -1764,17 +1842,16 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) +static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); int submitted = 0; - u64 page_start = page_offset(page); + u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { + while (bit_start < fs_info->sectors_per_page) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1784,21 +1861,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); bit_start++; continue; } - start = page_start + bit_start * fs_info->sectorsize; + start = folio_start + bit_start * fs_info->sectorsize; bit_start += sectors_per_node; /* @@ -1807,7 +1884,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1846,19 +1923,18 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) +static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = page->mapping; - struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; struct extent_buffer *eb; int ret; if (!folio_test_private(folio)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { @@ -1915,7 +1991,7 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct btrfs_eb_write_context ctx = { .wbc = wbc }; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -1956,7 +2032,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &ctx); + ret = submit_eb_page(folio, &ctx); if (ret == 0) continue; if (ret < 0) { @@ -2010,7 +2086,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. * - * We can get ret > 0 from submit_extent_page() indicating how many ebs + * We can get ret > 0 from submit_extent_folio() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. */ if (ret > 0) @@ -2137,7 +2213,27 @@ retry: continue; } - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * For subpage case, compression can lead to mixed + * writeback and dirty flags, e.g: + * 0 32K 64K 96K 128K + * | |//////||/////| |//| + * + * In above case, [32K, 96K) is asynchronously submitted + * for compression, and [124K, 128K) needs to be written back. + * + * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * won't be submitted as the page still has writeback flag + * and will be skipped in the next check. + * + * This mixed writeback and dirty case is only possible for + * subpage case. + * + * TODO: Remove this check after migrating compression to + * regular submission. + */ + if (wbc->sync_mode != WB_SYNC_NONE || + btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2149,7 +2245,7 @@ retry: continue; } - ret = __extent_writepage(&folio->page, bio_ctrl); + ret = extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2196,14 +2292,14 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -void extent_write_locked_range(struct inode *inode, struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { bool found_error = false; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u32 sectorsize = fs_info->sectorsize; loff_t i_size = i_size_read(inode); u64 cur = start; @@ -2220,44 +2316,50 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; - struct page *page; - int nr = 0; - - page = find_get_page(mapping, cur >> PAGE_SHIFT); - ASSERT(PageLocked(page)); - if (pages_dirty && page != locked_page) { - ASSERT(PageDirty(page)); - clear_page_dirty_for_io(page); + struct folio *folio; + + folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); + + /* + * This shouldn't happen, the pages are pinned and locked, this + * code is just in case, but shouldn't actually be run. + */ + if (IS_ERR(folio)) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + cur, cur_len, false); + mapping_set_error(mapping, PTR_ERR(folio)); + cur = cur_end + 1; + continue; } - ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, - i_size, &nr); + ASSERT(folio_test_locked(folio)); + if (pages_dirty && folio != locked_folio) + ASSERT(folio_test_dirty(folio)); + + /* + * Set the submission bitmap to submit all sectors. + * extent_writepage_io() will do the truncation correctly. + */ + bio_ctrl.submit_bitmap = (unsigned long)-1; + ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, + &bio_ctrl, i_size); if (ret == 1) goto next_page; - /* Make sure the mapping tag for page dirty gets cleared. */ - if (nr == 0) { - set_page_writeback(page); - end_page_writeback(page); - } - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, - cur, cur_len, !ret); - mapping_set_error(page->mapping, ret); - } - btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); + if (ret) + mapping_set_error(mapping, ret); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: - put_page(page); + folio_put(folio); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? ret : 0); } -int extent_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; int ret = 0; @@ -2277,21 +2379,23 @@ int extent_writepages(struct address_space *mapping, return ret; } -void extent_readahead(struct readahead_control *rac) +void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct page *pagepool[16]; + struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - int nr; - while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = readahead_pos(rac); - u64 contig_end = contig_start + readahead_batch_length(rac) - 1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); - contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio_ctrl, &prev_em_start); - } + while ((folio = readahead_folio(rac)) != NULL) + btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + + unlock_extent(&inode->io_tree, start, end, &cached_state); if (em_cached) free_extent_map(em_cached); @@ -2309,7 +2413,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); u64 end = start + folio_size(folio) - 1; - size_t blocksize = folio->mapping->host->i_sb->s_blocksize; + size_t blocksize = folio_to_fs_info(folio)->sectorsize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -2335,19 +2439,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -static int try_release_extent_state(struct extent_io_tree *tree, - struct page *page, gfp_t mask) +static bool try_release_extent_state(struct extent_io_tree *tree, + struct folio *folio) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - int ret = 1; + bool ret; if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { - ret = 0; + ret = false; } else { u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED); + int ret2; /* * At this point we can safely clear everything except the @@ -2355,15 +2460,15 @@ static int try_release_extent_state(struct extent_io_tree *tree, * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. */ - if (ret < 0) - ret = 0; + if (ret2 < 0) + ret = false; else - ret = 1; + ret = true; } return ret; } @@ -2373,839 +2478,80 @@ static int try_release_extent_state(struct extent_io_tree *tree, * in the range corresponding to the page, both state records and extent * map records are removed */ -int try_release_extent_mapping(struct page *page, gfp_t mask) +bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { - struct extent_map *em; - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); - struct extent_io_tree *tree = &btrfs_inode->io_tree; - struct extent_map_tree *map = &btrfs_inode->extent_tree; - - if (gfpflags_allow_blocking(mask) && - page->mapping->host->i_size > SZ_16M) { - u64 len; - while (start <= end) { - struct btrfs_fs_info *fs_info; - u64 cur_gen; - - len = end - start + 1; - write_lock(&map->lock); - em = lookup_extent_mapping(map, start, len); - if (!em) { - write_unlock(&map->lock); - break; - } - if ((em->flags & EXTENT_FLAG_PINNED) || - em->start != start) { - write_unlock(&map->lock); - free_extent_map(em); - break; - } - if (test_range_bit_exists(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED)) - goto next; - /* - * If it's not in the list of modified extents, used - * by a fast fsync, we can remove it. If it's being - * logged we can safely remove it since fsync took an - * extra reference on the em. - */ - if (list_empty(&em->list) || - (em->flags & EXTENT_FLAG_LOGGING)) - goto remove_em; - /* - * If it's in the list of modified extents, remove it - * only if its generation is older then the current one, - * in which case we don't need it for a fast fsync. - * Otherwise don't remove it, we could be racing with an - * ongoing fast fsync that could miss the new extent. - */ - fs_info = btrfs_inode->root->fs_info; - spin_lock(&fs_info->trans_lock); - cur_gen = fs_info->generation; - spin_unlock(&fs_info->trans_lock); - if (em->generation >= cur_gen) - goto next; -remove_em: - /* - * We only remove extent maps that are not in the list of - * modified extents or that are in the list but with a - * generation lower then the current generation, so there - * is no need to set the full fsync flag on the inode (it - * hurts the fsync performance for workloads with a data - * size that exceeds or is close to the system's memory). - */ - remove_extent_mapping(map, em); - /* once for the rb tree */ - free_extent_map(em); -next: - start = extent_map_end(em); - write_unlock(&map->lock); - - /* once for us */ - free_extent_map(em); - - cond_resched(); /* Allow large-extent preemption. */ - } - } - return try_release_extent_state(tree, page, mask); -} - -/* - * To cache previous fiemap extent - * - * Will be used for merging fiemap extent - */ -struct fiemap_cache { - u64 offset; - u64 phys; - u64 len; - u32 flags; - bool cached; -}; - -/* - * Helper to submit fiemap extent. - * - * Will try to merge current fiemap extent specified by @offset, @phys, - * @len and @flags with cached one. - * And only when we fails to merge, cached one will be submitted as - * fiemap extent. - * - * Return value is the same as fiemap_fill_next_extent(). - */ -static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache, - u64 offset, u64 phys, u64 len, u32 flags) -{ - u64 cache_end; - int ret = 0; - - /* Set at the end of extent_fiemap(). */ - ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); - - if (!cache->cached) - goto assign; - - /* - * When iterating the extents of the inode, at extent_fiemap(), we may - * find an extent that starts at an offset behind the end offset of the - * previous extent we processed. This happens if fiemap is called - * without FIEMAP_FLAG_SYNC and there are ordered extents completing - * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()). - * - * For example we are in leaf X processing its last item, which is the - * file extent item for file range [512K, 1M[, and after - * btrfs_next_leaf() releases the path, there's an ordered extent that - * completes for the file range [768K, 2M[, and that results in trimming - * the file extent item so that it now corresponds to the file range - * [512K, 768K[ and a new file extent item is inserted for the file - * range [768K, 2M[, which may end up as the last item of leaf X or as - * the first item of the next leaf - in either case btrfs_next_leaf() - * will leave us with a path pointing to the new extent item, for the - * file range [768K, 2M[, since that's the first key that follows the - * last one we processed. So in order not to report overlapping extents - * to user space, we trim the length of the previously cached extent and - * emit it. - * - * Upon calling btrfs_next_leaf() we may also find an extent with an - * offset smaller than or equals to cache->offset, and this happens - * when we had a hole or prealloc extent with several delalloc ranges in - * it, but after btrfs_next_leaf() released the path, delalloc was - * flushed and the resulting ordered extents were completed, so we can - * now have found a file extent item for an offset that is smaller than - * or equals to what we have in cache->offset. We deal with this as - * described below. - */ - cache_end = cache->offset + cache->len; - if (cache_end > offset) { - if (offset == cache->offset) { - /* - * We cached a dealloc range (found in the io tree) for - * a hole or prealloc extent and we have now found a - * file extent item for the same offset. What we have - * now is more recent and up to date, so discard what - * we had in the cache and use what we have just found. - */ - goto assign; - } else if (offset > cache->offset) { - /* - * The extent range we previously found ends after the - * offset of the file extent item we found and that - * offset falls somewhere in the middle of that previous - * extent range. So adjust the range we previously found - * to end at the offset of the file extent item we have - * just found, since this extent is more up to date. - * Emit that adjusted range and cache the file extent - * item we have just found. This corresponds to the case - * where a previously found file extent item was split - * due to an ordered extent completing. - */ - cache->len = offset - cache->offset; - goto emit; - } else { - const u64 range_end = offset + len; - - /* - * The offset of the file extent item we have just found - * is behind the cached offset. This means we were - * processing a hole or prealloc extent for which we - * have found delalloc ranges (in the io tree), so what - * we have in the cache is the last delalloc range we - * found while the file extent item we found can be - * either for a whole delalloc range we previously - * emmitted or only a part of that range. - * - * We have two cases here: - * - * 1) The file extent item's range ends at or behind the - * cached extent's end. In this case just ignore the - * current file extent item because we don't want to - * overlap with previous ranges that may have been - * emmitted already; - * - * 2) The file extent item starts behind the currently - * cached extent but its end offset goes beyond the - * end offset of the cached extent. We don't want to - * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached - * extent and then partially store the current file - * extent item's range in the cache, for the subrange - * going the cached extent's end to the end of the - * file extent item. - */ - if (range_end <= cache_end) - return 0; - - if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) - phys += cache_end - offset; - - offset = cache_end; - len = range_end - cache_end; - goto emit; + struct btrfs_inode *inode = folio_to_inode(folio); + struct extent_io_tree *io_tree = &inode->io_tree; + + while (start <= end) { + const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); + const u64 len = end - start + 1; + struct extent_map_tree *extent_tree = &inode->extent_tree; + struct extent_map *em; + + write_lock(&extent_tree->lock); + em = lookup_extent_mapping(extent_tree, start, len); + if (!em) { + write_unlock(&extent_tree->lock); + break; } - } - - /* - * Only merges fiemap extents if - * 1) Their logical addresses are continuous - * - * 2) Their physical addresses are continuous - * So truly compressed (physical size smaller than logical size) - * extents won't get merged with each other - * - * 3) Share same flags - */ - if (cache->offset + cache->len == offset && - cache->phys + cache->len == phys && - cache->flags == flags) { - cache->len += len; - return 0; - } - -emit: - /* Not mergeable, need to submit cached one */ - ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, - cache->len, cache->flags); - cache->cached = false; - if (ret) - return ret; -assign: - cache->cached = true; - cache->offset = offset; - cache->phys = phys; - cache->len = len; - cache->flags = flags; - - return 0; -} - -/* - * Emit last fiemap cache - * - * The last fiemap cache may still be cached in the following case: - * 0 4k 8k - * |<- Fiemap range ->| - * |<------------ First extent ----------->| - * - * In this case, the first extent range will be cached but not emitted. - * So we must emit it before ending extent_fiemap(). - */ -static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache) -{ - int ret; - - if (!cache->cached) - return 0; - - ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, - cache->len, cache->flags); - cache->cached = false; - if (ret > 0) - ret = 0; - return ret; -} - -static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) -{ - struct extent_buffer *clone; - struct btrfs_key key; - int slot; - int ret; - - path->slots[0]++; - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) - return 0; - - ret = btrfs_next_leaf(inode->root, path); - if (ret != 0) - return ret; - - /* - * Don't bother with cloning if there are no more file extent items for - * our inode. - */ - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 1; - - /* See the comment at fiemap_search_slot() about why we clone. */ - clone = btrfs_clone_extent_buffer(path->nodes[0]); - if (!clone) - return -ENOMEM; - - slot = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = clone; - path->slots[0] = slot; - - return 0; -} - -/* - * Search for the first file extent item that starts at a given file offset or - * the one that starts immediately before that offset. - * Returns: 0 on success, < 0 on error, 1 if not found. - */ -static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, - u64 file_offset) -{ - const u64 ino = btrfs_ino(inode); - struct btrfs_root *root = inode->root; - struct extent_buffer *clone; - struct btrfs_key key; - int slot; - int ret; - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = file_offset; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - - if (ret > 0 && path->slots[0] > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) - path->slots[0]--; - } - - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret != 0) - return ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) - return 1; - } - - /* - * We clone the leaf and use it during fiemap. This is because while - * using the leaf we do expensive things like checking if an extent is - * shared, which can take a long time. In order to prevent blocking - * other tasks for too long, we use a clone of the leaf. We have locked - * the file range in the inode's io tree, so we know none of our file - * extent items can change. This way we avoid blocking other tasks that - * want to insert items for other inodes in the same leaf or b+tree - * rebalance operations (triggered for example when someone is trying - * to push items into this leaf when trying to insert an item in a - * neighbour leaf). - * We also need the private clone because holding a read lock on an - * extent buffer of the subvolume's b+tree will make lockdep unhappy - * when we call fiemap_fill_next_extent(), because that may cause a page - * fault when filling the user space buffer with fiemap data. - */ - clone = btrfs_clone_extent_buffer(path->nodes[0]); - if (!clone) - return -ENOMEM; - - slot = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = clone; - path->slots[0] = slot; - - return 0; -} - -/* - * Process a range which is a hole or a prealloc extent in the inode's subvolume - * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc - * extent. The end offset (@end) is inclusive. - */ -static int fiemap_process_hole(struct btrfs_inode *inode, - struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache, - struct extent_state **delalloc_cached_state, - struct btrfs_backref_share_check_ctx *backref_ctx, - u64 disk_bytenr, u64 extent_offset, - u64 extent_gen, - u64 start, u64 end) -{ - const u64 i_size = i_size_read(&inode->vfs_inode); - u64 cur_offset = start; - u64 last_delalloc_end = 0; - u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; - bool checked_extent_shared = false; - int ret; - - /* - * There can be no delalloc past i_size, so don't waste time looking for - * it beyond i_size. - */ - while (cur_offset < end && cur_offset < i_size) { - struct extent_state *cached_state = NULL; - u64 delalloc_start; - u64 delalloc_end; - u64 prealloc_start; - u64 lockstart; - u64 lockend; - u64 prealloc_len = 0; - bool delalloc; - - lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); - lockend = round_up(end, inode->root->fs_info->sectorsize); - - /* - * We are only locking for the delalloc range because that's the - * only thing that can change here. With fiemap we have a lock - * on the inode, so no buffered or direct writes can happen. - * - * However mmaps and normal page writeback will cause this to - * change arbitrarily. We have to lock the extent lock here to - * make sure that nobody messes with the tree while we're doing - * btrfs_find_delalloc_in_range. - */ - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, - delalloc_cached_state, - &delalloc_start, - &delalloc_end); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - if (!delalloc) + if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { + write_unlock(&extent_tree->lock); + free_extent_map(em); break; - + } + if (test_range_bit_exists(io_tree, em->start, + extent_map_end(em) - 1, EXTENT_LOCKED)) + goto next; /* - * If this is a prealloc extent we have to report every section - * of it that has no delalloc. + * If it's not in the list of modified extents, used by a fast + * fsync, we can remove it. If it's being logged we can safely + * remove it since fsync took an extra reference on the em. */ - if (disk_bytenr != 0) { - if (last_delalloc_end == 0) { - prealloc_start = start; - prealloc_len = delalloc_start - start; - } else { - prealloc_start = last_delalloc_end + 1; - prealloc_len = delalloc_start - prealloc_start; - } - } - - if (prealloc_len > 0) { - if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, - backref_ctx); - if (ret < 0) - return ret; - else if (ret > 0) - prealloc_flags |= FIEMAP_EXTENT_SHARED; - - checked_extent_shared = true; - } - ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, - disk_bytenr + extent_offset, - prealloc_len, prealloc_flags); - if (ret) - return ret; - extent_offset += prealloc_len; - } - - ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, - delalloc_end + 1 - delalloc_start, - FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - if (ret) - return ret; - - last_delalloc_end = delalloc_end; - cur_offset = delalloc_end + 1; - extent_offset += cur_offset - delalloc_start; - cond_resched(); - } - - /* - * Either we found no delalloc for the whole prealloc extent or we have - * a prealloc extent that spans i_size or starts at or after i_size. - */ - if (disk_bytenr != 0 && last_delalloc_end < end) { - u64 prealloc_start; - u64 prealloc_len; - - if (last_delalloc_end == 0) { - prealloc_start = start; - prealloc_len = end + 1 - start; - } else { - prealloc_start = last_delalloc_end + 1; - prealloc_len = end + 1 - prealloc_start; - } - - if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, - backref_ctx); - if (ret < 0) - return ret; - else if (ret > 0) - prealloc_flags |= FIEMAP_EXTENT_SHARED; - } - ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, - disk_bytenr + extent_offset, - prealloc_len, prealloc_flags); - if (ret) - return ret; - } - - return 0; -} - -static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, - struct btrfs_path *path, - u64 *last_extent_end_ret) -{ - const u64 ino = btrfs_ino(inode); - struct btrfs_root *root = inode->root; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *ei; - struct btrfs_key key; - u64 disk_bytenr; - int ret; - - /* - * Lookup the last file extent. We're not using i_size here because - * there might be preallocation past i_size. - */ - ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); - /* There can't be a file extent item at offset (u64)-1 */ - ASSERT(ret != 0); - if (ret < 0) - return ret; - - /* - * For a non-existing key, btrfs_search_slot() always leaves us at a - * slot > 0, except if the btree is empty, which is impossible because - * at least it has the inode item for this inode and all the items for - * the root inode 256. - */ - ASSERT(path->slots[0] > 0); - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { - /* No file extent items in the subvolume tree. */ - *last_extent_end_ret = 0; - return 0; - } - - /* - * For an inline extent, the disk_bytenr is where inline data starts at, - * so first check if we have an inline extent item before checking if we - * have an implicit hole (disk_bytenr == 0). - */ - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { - *last_extent_end_ret = btrfs_file_extent_end(path); - return 0; - } - - /* - * Find the last file extent item that is not a hole (when NO_HOLES is - * not enabled). This should take at most 2 iterations in the worst - * case: we have one hole file extent item at slot 0 of a leaf and - * another hole file extent item as the last item in the previous leaf. - * This is because we merge file extent items that represent holes. - */ - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); - while (disk_bytenr == 0) { - ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); - if (ret < 0) { - return ret; - } else if (ret > 0) { - /* No file extent items that are not holes. */ - *last_extent_end_ret = 0; - return 0; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); - } - - *last_extent_end_ret = btrfs_file_extent_end(path); - return 0; -} - -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - const u64 ino = btrfs_ino(inode); - struct extent_state *delalloc_cached_state = NULL; - struct btrfs_path *path; - struct fiemap_cache cache = { 0 }; - struct btrfs_backref_share_check_ctx *backref_ctx; - u64 last_extent_end; - u64 prev_extent_end; - u64 range_start; - u64 range_end; - const u64 sectorsize = inode->root->fs_info->sectorsize; - bool stopped = false; - int ret; - - backref_ctx = btrfs_alloc_backref_share_check_ctx(); - path = btrfs_alloc_path(); - if (!backref_ctx || !path) { - ret = -ENOMEM; - goto out; - } - - range_start = round_down(start, sectorsize); - range_end = round_up(start + len, sectorsize); - prev_extent_end = range_start; - - ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); - if (ret < 0) - goto out; - btrfs_release_path(path); - - path->reada = READA_FORWARD; - ret = fiemap_search_slot(inode, path, range_start); - if (ret < 0) { - goto out; - } else if (ret > 0) { + if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) + goto remove_em; /* - * No file extent item found, but we may have delalloc between - * the current offset and i_size. So check for that. + * If it's in the list of modified extents, remove it only if + * its generation is older then the current one, in which case + * we don't need it for a fast fsync. Otherwise don't remove it, + * we could be racing with an ongoing fast fsync that could miss + * the new extent. */ - ret = 0; - goto check_eof_delalloc; - } - - while (prev_extent_end < range_end) { - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_file_extent_item *ei; - struct btrfs_key key; - u64 extent_end; - u64 extent_len; - u64 extent_offset = 0; - u64 extent_gen; - u64 disk_bytenr = 0; - u64 flags = 0; - int extent_type; - u8 compression; - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) - break; - - extent_end = btrfs_file_extent_end(path); - + if (em->generation >= cur_gen) + goto next; +remove_em: /* - * The first iteration can leave us at an extent item that ends - * before our range's start. Move to the next item. + * We only remove extent maps that are not in the list of + * modified extents or that are in the list but with a + * generation lower then the current generation, so there is no + * need to set the full fsync flag on the inode (it hurts the + * fsync performance for workloads with a data size that exceeds + * or is close to the system's memory). */ - if (extent_end <= range_start) - goto next_item; - - backref_ctx->curr_leaf_bytenr = leaf->start; - - /* We have in implicit hole (NO_HOLES feature enabled). */ - if (prev_extent_end < key.offset) { - const u64 hole_end = min(key.offset, range_end) - 1; - - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, - backref_ctx, 0, 0, 0, - prev_extent_end, hole_end); - if (ret < 0) { - goto out; - } else if (ret > 0) { - /* fiemap_fill_next_extent() told us to stop. */ - stopped = true; - break; - } - - /* We've reached the end of the fiemap range, stop. */ - if (key.offset >= range_end) { - stopped = true; - break; - } - } - - extent_len = extent_end - key.offset; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - compression = btrfs_file_extent_compression(leaf, ei); - extent_type = btrfs_file_extent_type(leaf, ei); - extent_gen = btrfs_file_extent_generation(leaf, ei); - - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); - if (compression == BTRFS_COMPRESS_NONE) - extent_offset = btrfs_file_extent_offset(leaf, ei); - } - - if (compression != BTRFS_COMPRESS_NONE) - flags |= FIEMAP_EXTENT_ENCODED; - - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - flags |= FIEMAP_EXTENT_DATA_INLINE; - flags |= FIEMAP_EXTENT_NOT_ALIGNED; - ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, - extent_len, flags); - } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, - backref_ctx, - disk_bytenr, extent_offset, - extent_gen, key.offset, - extent_end - 1); - } else if (disk_bytenr == 0) { - /* We have an explicit hole. */ - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, - backref_ctx, 0, 0, 0, - key.offset, extent_end - 1); - } else { - /* We have a regular extent. */ - if (fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, - backref_ctx); - if (ret < 0) - goto out; - else if (ret > 0) - flags |= FIEMAP_EXTENT_SHARED; - } - - ret = emit_fiemap_extent(fieinfo, &cache, key.offset, - disk_bytenr + extent_offset, - extent_len, flags); - } - - if (ret < 0) { - goto out; - } else if (ret > 0) { - /* fiemap_fill_next_extent() told us to stop. */ - stopped = true; - break; - } - - prev_extent_end = extent_end; -next_item: - if (fatal_signal_pending(current)) { - ret = -EINTR; - goto out; - } - - ret = fiemap_next_leaf_item(inode, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - /* No more file extent items for this inode. */ - break; - } - cond_resched(); - } - -check_eof_delalloc: - /* - * Release (and free) the path before emitting any final entries to - * fiemap_fill_next_extent() to keep lockdep happy. This is because - * once we find no more file extent items exist, we may have a - * non-cloned leaf, and fiemap_fill_next_extent() can trigger page - * faults when copying data to the user space buffer. - */ - btrfs_free_path(path); - path = NULL; - - if (!stopped && prev_extent_end < range_end) { - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, backref_ctx, - 0, 0, 0, prev_extent_end, range_end - 1); - if (ret < 0) - goto out; - prev_extent_end = range_end; - } - - if (cache.cached && cache.offset + cache.len >= last_extent_end) { - const u64 i_size = i_size_read(&inode->vfs_inode); - - if (prev_extent_end < i_size) { - struct extent_state *cached_state = NULL; - u64 delalloc_start; - u64 delalloc_end; - u64 lockstart; - u64 lockend; - bool delalloc; + remove_extent_mapping(inode, em); + /* Once for the inode's extent map tree. */ + free_extent_map(em); +next: + start = extent_map_end(em); + write_unlock(&extent_tree->lock); - lockstart = round_down(prev_extent_end, sectorsize); - lockend = round_up(i_size, sectorsize); + /* Once for us, for the lookup_extent_mapping() reference. */ + free_extent_map(em); + if (need_resched()) { /* - * See the comment in fiemap_process_hole as to why - * we're doing the locking here. + * If we need to resched but we can't block just exit + * and leave any remaining extent maps. */ - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - delalloc = btrfs_find_delalloc_in_range(inode, - prev_extent_end, - i_size - 1, - &delalloc_cached_state, - &delalloc_start, - &delalloc_end); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - if (!delalloc) - cache.flags |= FIEMAP_EXTENT_LAST; - } else { - cache.flags |= FIEMAP_EXTENT_LAST; + if (!gfpflags_allow_blocking(mask)) + break; + + cond_resched(); } } - - ret = emit_last_fiemap_cache(fieinfo, &cache); -out: - free_extent_state(delalloc_cached_state); - btrfs_free_backref_share_ctx(backref_ctx); - btrfs_free_path(path); - return ret; -} - -static void __free_extent_buffer(struct extent_buffer *eb) -{ - kmem_cache_free(extent_buffer_cache, eb); + return try_release_extent_state(io_tree, folio); } static int extent_buffer_under_io(const struct extent_buffer *eb) @@ -3214,7 +2560,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) +static bool folio_range_has_eb(struct folio *folio) { struct btrfs_subpage *subpage; @@ -3224,17 +2570,11 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_page_read() call relying on page::private. - */ - if (atomic_read(&subpage->readers)) - return true; } return false; } -static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio) +static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -3288,14 +2628,14 @@ static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *f * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!folio_range_has_eb(fs_info, folio)) + if (!folio_range_has_eb(folio)) btrfs_detach_subpage(fs_info, folio); spin_unlock(&folio->mapping->i_private_lock); } -/* Release all pages attached to the extent buffer */ -static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +/* Release all folios attached to the extent buffer */ +static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) { ASSERT(!extent_buffer_under_io(eb)); @@ -3317,9 +2657,9 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_pages(eb); + btrfs_release_extent_buffer_folios(eb); btrfs_leak_debug_del_eb(eb); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static struct extent_buffer * @@ -3361,7 +2701,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = alloc_eb_folio_array(new, 0); + ret = alloc_eb_folio_array(new, false); if (ret) { btrfs_release_extent_buffer(new); return NULL; @@ -3369,7 +2709,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) for (int i = 0; i < num_folios; i++) { struct folio *folio = new->folios[i]; - int ret; ret = attach_extent_buffer_folio(new, folio, NULL); if (ret < 0) { @@ -3395,7 +2734,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return NULL; - ret = alloc_eb_folio_array(eb, 0); + ret = alloc_eb_folio_array(eb, false); if (ret) goto err; @@ -3415,10 +2754,10 @@ err: for (int i = 0; i < num_folios; i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); - __folio_put(eb->folios[i]); + folio_put(eb->folios[i]); } } - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return NULL; } @@ -3545,12 +2884,13 @@ free_eb: } #endif -static struct extent_buffer *grab_extent_buffer( - struct btrfs_fs_info *fs_info, struct page *page) +static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, + struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *exists; + lockdep_assert_held(&folio->mapping->i_private_lock); + /* * For subpage case, we completely rely on radix tree to ensure we * don't try to insert two ebs for the same bytenr. So here we always @@ -3564,7 +2904,7 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* - * We could have already allocated an eb for this page and attached one + * We could have already allocated an eb for this folio and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can * just overwrite folio private. @@ -3573,16 +2913,19 @@ static struct extent_buffer *grab_extent_buffer( if (atomic_inc_not_zero(&exists->refs)) return exists; - WARN_ON(PageDirty(page)); + WARN_ON(folio_test_dirty(folio)); folio_detach_private(folio); return NULL; } -static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +/* + * Validate alignment constraints of eb at logical address @start. + */ +static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { if (!IS_ALIGNED(start, fs_info->sectorsize)) { btrfs_err(fs_info, "bad tree block start %llu", start); - return -EINVAL; + return true; } if (fs_info->nodesize < PAGE_SIZE && @@ -3590,14 +2933,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) btrfs_err(fs_info, "tree block crosses page boundary, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (fs_info->nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (!IS_ALIGNED(start, fs_info->nodesize) && !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { @@ -3605,10 +2948,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", start, fs_info->nodesize); } - return 0; + return false; } - /* * Return 0 if eb->folios[i] is attached to btree inode successfully. * Return >0 if there is already another extent buffer for the range, @@ -3618,13 +2960,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. */ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct btrfs_subpage *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio; + struct folio *existing_folio = NULL; int ret; ASSERT(found_eb_ret); @@ -3636,37 +2979,38 @@ retry: ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) - return 0; + goto finish; existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) + if (IS_ERR(existing_folio)) { + existing_folio = NULL; goto retry; + } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); - if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + if (folio_size(existing_folio) != eb->folio_size) { folio_unlock(existing_folio); folio_put(existing_folio); return -EAGAIN; } - if (fs_info->nodesize < PAGE_SIZE) { - /* - * We're going to reuse the existing page, can drop our page - * and subpage structure now. - */ +finish: + spin_lock(&mapping->i_private_lock); + if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; - } else { + } else if (existing_folio) { struct extent_buffer *existing_eb; - existing_eb = grab_extent_buffer(fs_info, - folio_page(existing_folio, 0)); + existing_eb = grab_extent_buffer(fs_info, existing_folio); if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; + spin_unlock(&mapping->i_private_lock); folio_unlock(existing_folio); folio_put(existing_folio); return 1; @@ -3675,6 +3019,22 @@ retry: __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; } + eb->folio_size = folio_size(eb->folios[i]); + eb->folio_shift = folio_shift(eb->folios[i]); + /* Should not fail, as we have preallocated the memory. */ + ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); + ASSERT(!ret); + /* + * To inform we have an extra eb under allocation, so that + * detach_extent_buffer_page() won't release the folio private when the + * eb hasn't been inserted into radix tree yet. + * + * The ref will be decreased when the eb releases the page, in + * detach_extent_buffer_page(). Thus needs no special handling in the + * error path. + */ + btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -3686,7 +3046,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; @@ -3741,7 +3100,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, reallocate: /* Allocate all pages first. */ - ret = alloc_eb_folio_array(eb, __GFP_NOFAIL); + ret = alloc_eb_folio_array(eb, true); if (ret < 0) { btrfs_free_subpage(prealloc); goto out; @@ -3752,7 +3111,7 @@ reallocate: for (int i = 0; i < num_folios; i++) { struct folio *folio; - ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); + ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); if (ret > 0) { ASSERT(existing_eb); goto out; @@ -3789,22 +3148,6 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - spin_lock(&mapping->i_private_lock); - /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_folio(eb, folio, prealloc); - ASSERT(!ret); - /* - * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the folio private - * when the eb hasn't yet been inserted into radix tree. - * - * The ref will be decreased when the eb released the page, in - * detach_extent_buffer_page(). - * Thus needs no special handling in error path. - */ - btrfs_folio_inc_eb_refs(fs_info, folio); - spin_unlock(&mapping->i_private_lock); - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); /* @@ -3860,7 +3203,7 @@ again: * live buffer and won't free them prematurely. */ for (int i = 0; i < num_folios; i++) - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); return eb; out: @@ -3884,13 +3227,13 @@ out: for (int i = 0; i < attached; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); folio_put(eb->folios[i]); eb->folios[i] = NULL; } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utlizing page->mapping. + * so it can be cleaned up without utilizing page->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -3906,7 +3249,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) struct extent_buffer *eb = container_of(head, struct extent_buffer, rcu_head); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static int release_extent_buffer(struct extent_buffer *eb) @@ -3930,11 +3273,11 @@ static int release_extent_buffer(struct extent_buffer *eb) } btrfs_leak_debug_del_eb(eb); - /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_pages(eb); + /* Should be safe to release folios at this point. */ + btrfs_release_extent_buffer_folios(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return 1; } #endif @@ -4037,7 +3380,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, * The actual zeroout of the buffer will happen later in * btree_csum_one_bio. */ - if (btrfs_is_zoned(fs_info)) { + if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); return; } @@ -4076,6 +3419,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { bool subpage = eb->fs_info->nodesize < PAGE_SIZE; @@ -4092,12 +3436,12 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. */ if (subpage) - lock_page(folio_page(eb->folios[0], 0)); + folio_lock(eb->folios[0]); for (int i = 0; i < num_folios; i++) btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], eb->start, eb->len); if (subpage) - unlock_page(folio_page(eb->folios[0], 0)); + folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); @@ -4153,6 +3497,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } +static void clear_extent_buffer_reading(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); +} + static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; @@ -4161,6 +3512,13 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) struct folio_iter fi; u32 bio_offset = 0; + /* + * If the extent buffer is marked UPTODATE before the read operation + * completes, other calls to read_extent_buffer_pages() will return + * early without waiting for the read to finish, causing data races. + */ + WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); + eb->read_mirror = bbio->mirror_num; if (uptodate && @@ -4187,16 +3545,14 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) bio_offset += len; } - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_extent_buffer_reading(eb); free_extent_buffer(eb); bio_put(&bbio->bio); } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - struct btrfs_tree_parent_check *check) +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; bool ret; @@ -4214,7 +3570,18 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, /* Someone else is already reading the buffer, just wait for it. */ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) - goto done; + return 0; + + /* + * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above + * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have + * started and finished reading the same eb. In this case, UPTODATE + * will now be set, and we shouldn't read it in again. + */ + if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { + clear_extent_buffer_reading(eb); + return 0; + } clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; @@ -4238,19 +3605,26 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, for (int i = 0; i < num_folios; i++) { struct folio *folio = eb->folios[i]; - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); } } - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bbio(bbio, mirror_num); + return 0; +} -done: - if (wait == WAIT_COMPLETE) { - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - } +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) +{ + int ret; + ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); + if (ret < 0) + return ret; + + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return -EIO; return 0; } @@ -4258,7 +3632,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, unsigned long len) { btrfs_warn(eb->fs_info, - "access to eb bytenr %llu len %lu out of range start %lu len %lu", + "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); @@ -4287,7 +3661,7 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *dst = (char *)dstv; @@ -4327,7 +3701,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char __user *dst = (char __user *)dstv; @@ -4367,7 +3741,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4423,8 +3797,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) return; if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - + folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, eb->start, eb->len))) @@ -4438,11 +3811,11 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; - char *src = (char *)srcv; + const char *src = (const char *)srcv; unsigned long i = get_eb_folio_index(eb, start); /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -4487,7 +3860,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long cur = start; if (eb->addr) { @@ -4518,7 +3891,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - const int unit_size = folio_size(src->folios[0]); + const int unit_size = src->folio_size; unsigned long cur = 0; ASSERT(dst->len == src->len); @@ -4540,7 +3913,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; u64 dst_len = dst->len; size_t cur; size_t offset; @@ -4596,10 +3969,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; + offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; - *folio_index = offset >> folio_shift(eb->folios[0]); - *folio_offset = offset_in_folio(eb->folios[0], offset); + *folio_index = offset >> eb->folio_shift; + *folio_offset = offset_in_eb_folio(eb, offset); } /* @@ -4713,7 +4086,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || @@ -4799,17 +4172,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, #define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( - struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) + const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) { struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; - u64 page_start = page_offset(page); - u64 cur = page_start; + u64 folio_start = folio_pos(folio); + u64 cur = folio_start; - ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { + while (cur < folio_start + PAGE_SIZE) { int ret; int i; @@ -4821,7 +4194,7 @@ static struct extent_buffer *get_next_extent_buffer( goto out; for (i = 0; i < ret; i++) { /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) + if (gang[i]->start >= folio_start + PAGE_SIZE) goto out; /* Found one */ if (gang[i]->start >= bytenr) { @@ -4835,11 +4208,11 @@ out: return found; } -static int try_release_subpage_extent_buffer(struct page *page) +static int try_release_subpage_extent_buffer(struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - u64 cur = page_offset(page); - const u64 end = page_offset(page) + PAGE_SIZE; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + u64 cur = folio_pos(folio); + const u64 end = cur + PAGE_SIZE; int ret; while (cur < end) { @@ -4854,7 +4227,7 @@ static int try_release_subpage_extent_buffer(struct page *page) * with spinlock rather than RCU. */ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page, cur); + eb = get_next_extent_buffer(fs_info, folio, cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); @@ -4895,31 +4268,30 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. */ - spin_lock(&page->mapping->i_private_lock); - if (!folio_test_private(page_folio(page))) + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return ret; } -int try_release_extent_buffer(struct page *page) +int try_release_extent_buffer(struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return try_release_subpage_extent_buffer(folio); /* * We need to make sure nobody is changing folio private, as we rely on * folio private as the pointer to extent buffer. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 1; } @@ -4934,10 +4306,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, @@ -4968,7 +4340,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { struct btrfs_tree_parent_check check = { - .has_first_key = 0, .level = level, .transid = gen }; @@ -4984,7 +4355,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); + ret = read_extent_buffer_pages_nowait(eb, 0, &check); if (ret < 0) free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 46050500529b..6c5328bfabc2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,11 +7,33 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/slab.h> #include "compression.h" +#include "messages.h" #include "ulist.h" #include "misc.h" +struct page; +struct file; +struct folio; +struct inode; +struct fiemap_extent_info; +struct readahead_control; +struct address_space; +struct writeback_control; +struct extent_io_tree; +struct extent_map_tree; +struct extent_state; +struct btrfs_block_group; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, @@ -63,11 +85,6 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_root; -struct btrfs_inode; -struct btrfs_fs_info; -struct extent_io_tree; -struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); @@ -75,7 +92,8 @@ void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; - unsigned long len; + u32 len; + u32 folio_size; unsigned long bflags; struct btrfs_fs_info *fs_info; @@ -90,6 +108,7 @@ struct extent_buffer { int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + u8 folio_shift; struct rcu_head rcu_head; struct rw_semaphore lock; @@ -113,6 +132,13 @@ struct btrfs_eb_write_context { struct btrfs_block_group *zoned_bg; }; +static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb, + u64 start) +{ + ASSERT(eb->folio_size); + return start & (eb->folio_size - 1); +} + /* * Get the correct offset inside the page of extent buffer. * @@ -151,13 +177,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, * the folio_shift would be large enough to always make us * return 0 as index. * 1.2) Several page sized folios - * The folio_shift() would be PAGE_SHIFT, giving us the correct + * The folio_shift would be PAGE_SHIFT, giving us the correct * index. * * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case * The folio would only be page sized, and always give us 0 as index. */ - return offset >> folio_shift(eb->folios[0]); + return offset >> eb->folio_shift; } /* @@ -189,6 +215,11 @@ static inline struct extent_changeset *extent_changeset_alloc(void) return ret; } +static inline void extent_changeset_prealloc(struct extent_changeset *changeset, gfp_t gfp_mask) +{ + ulist_prealloc(&changeset->range_changed, gfp_mask); +} + static inline void extent_changeset_release(struct extent_changeset *changeset) { if (!changeset) @@ -205,24 +236,19 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -struct extent_map_tree; - -int try_release_extent_mapping(struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page); +bool try_release_extent_mapping(struct folio *folio, gfp_t mask); +int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); -void extent_write_locked_range(struct inode *inode, struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -int extent_writepages(struct address_space *mapping, - struct writeback_control *wbc); +int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len); -int set_page_extent_mapped(struct page *page); -void clear_page_extent_mapped(struct page *page); +void btrfs_readahead(struct readahead_control *rac); +int set_folio_extent_mapped(struct folio *folio); +void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); @@ -235,12 +261,17 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); -#define WAIT_NONE 0 -#define WAIT_COMPLETE 1 -#define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - struct btrfs_tree_parent_check *parent_check); -void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *parent_check); +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *parent_check); + +static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); +} + void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); @@ -326,9 +357,9 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - struct page *locked_page, + const struct folio *locked_folio, + struct extent_state **cached, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); @@ -336,11 +367,12 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, - gfp_t extra_gfp); + bool nofail); +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, + struct folio *locked_folio, u64 *start, u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b61099bf97a8..7f46abbd6311 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -5,10 +5,10 @@ #include <linux/spinlock.h> #include "messages.h" #include "ctree.h" -#include "volumes.h" #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" +#include "disk-io.h" static struct kmem_cache *extent_map_cache; @@ -16,8 +16,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", - sizeof(struct extent_map), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_map), 0, 0, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -34,7 +33,7 @@ void __cold extent_map_exit(void) */ void extent_map_tree_init(struct extent_map_tree *tree) { - tree->map = RB_ROOT_CACHED; + tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -78,27 +77,35 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static int tree_insert(struct rb_root_cached *root, struct extent_map *em) +static void remove_em(struct btrfs_inode *inode, struct extent_map *em) { - struct rb_node **p = &root->rb_root.rb_node; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + rb_erase(&em->rb_node, &inode->extent_tree.root); + RB_CLEAR_NODE(&em->rb_node); + + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) + percpu_counter_dec(&fs_info->evictable_extent_maps); +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) +{ + struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct extent_map *entry = NULL; struct rb_node *orig_parent = NULL; u64 end = range_end(em->start, em->len); - bool leftmost = true; while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - if (em->start < entry->start) { + if (em->start < entry->start) p = &(*p)->rb_left; - } else if (em->start >= extent_map_end(entry)) { + else if (em->start >= extent_map_end(entry)) p = &(*p)->rb_right; - leftmost = false; - } else { + else return -EEXIST; - } } orig_parent = parent; @@ -121,7 +128,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); - rb_insert_color_cached(&em->rb_node, root, leftmost); + rb_insert_color(&em->rb_node, root); return 0; } @@ -179,11 +186,22 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } +static inline u64 extent_map_block_len(const struct extent_map *em) +{ + if (extent_map_is_compressed(em)) + return em->disk_num_bytes; + return em->len; +} + static inline u64 extent_map_block_end(const struct extent_map *em) { - if (em->block_start + em->block_len < em->block_start) + const u64 block_start = extent_map_block_start(em); + const u64 block_end = block_start + extent_map_block_len(em); + + if (block_end < block_start) return (u64)-1; - return em->block_start + em->block_len; + + return block_end; } static bool can_merge_extent_map(const struct extent_map *em) @@ -215,18 +233,115 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma if (extent_map_end(prev) != next->start) return false; - if (prev->flags != next->flags) + /* + * The merged flag is not an on-disk flag, it just indicates we had the + * extent maps of 2 (or more) adjacent extents merged, so factor it out. + */ + if ((prev->flags & ~EXTENT_FLAG_MERGED) != + (next->flags & ~EXTENT_FLAG_MERGED)) return false; - if (next->block_start < EXTENT_MAP_LAST_BYTE - 1) - return next->block_start == extent_map_block_end(prev); + if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) + return extent_map_block_start(next) == extent_map_block_end(prev); /* HOLES and INLINE extents. */ - return next->block_start == prev->block_start; + return next->disk_bytenr == prev->disk_bytenr; +} + +/* + * Handle the on-disk data extents merge for @prev and @next. + * + * @prev: left extent to merge + * @next: right extent to merge + * @merged: the extent we will not discard after the merge; updated with new values + * + * After this, one of the two extents is the new merged extent and the other is + * removed from the tree and likely freed. Note that @merged is one of @prev/@next + * so there is const/non-const aliasing occurring here. + * + * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. + * For now only uncompressed regular extent can be merged. + */ +static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next, + struct extent_map *merged) +{ + u64 new_disk_bytenr; + u64 new_disk_num_bytes; + u64 new_offset; + + /* @prev and @next should not be compressed. */ + ASSERT(!extent_map_is_compressed(prev)); + ASSERT(!extent_map_is_compressed(next)); + + /* + * There are two different cases where @prev and @next can be merged. + * + * 1) They are referring to the same data extent: + * + * |<----- data extent A ----->| + * |<- prev ->|<- next ->| + * + * 2) They are referring to different data extents but still adjacent: + * + * |<-- data extent A -->|<-- data extent B -->| + * |<- prev ->|<- next ->| + * + * The calculation here always merges the data extents first, then updates + * @offset using the new data extents. + * + * For case 1), the merged data extent would be the same. + * For case 2), we just merge the two data extents into one. + */ + new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr); + new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes, + next->disk_bytenr + next->disk_num_bytes) - + new_disk_bytenr; + new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; + + merged->disk_bytenr = new_disk_bytenr; + merged->disk_num_bytes = new_disk_num_bytes; + merged->ram_bytes = new_disk_num_bytes; + merged->offset = new_offset; +} + +static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, + struct extent_map *em) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return; + btrfs_crit(fs_info, +"%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x", + prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes, + em->ram_bytes, em->offset, em->flags); + ASSERT(0); +} + +/* Internal sanity checks for btrfs debug builds. */ +static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return; + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + if (em->disk_num_bytes == 0) + dump_extent_map(fs_info, "zero disk_num_bytes", em); + if (em->offset + em->len > em->ram_bytes) + dump_extent_map(fs_info, "ram_bytes too small", em); + if (em->offset + em->len > em->disk_num_bytes && + !extent_map_is_compressed(em)) + dump_extent_map(fs_info, "disk_num_bytes too small", em); + if (!extent_map_is_compressed(em) && + em->ram_bytes != em->disk_num_bytes) + dump_extent_map(fs_info, + "ram_bytes mismatch with disk_num_bytes for non-compressed em", + em); + } else if (em->offset) { + dump_extent_map(fs_info, "non-zero offset for hole/inline", em); + } } -static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) +static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *merge = NULL; struct rb_node *rb; @@ -250,17 +365,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; - em->orig_start = merge->orig_start; em->len += merge->len; - em->block_len += merge->block_len; - em->block_start = merge->block_start; - em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; - em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); + + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + merge_ondisk_extents(merge, em, em); em->flags |= EXTENT_FLAG_MERGED; - rb_erase_cached(&merge->rb_node, &tree->map); - RB_CLEAR_NODE(&merge->rb_node); + validate_extent_map(fs_info, em); + remove_em(inode, merge); free_extent_map(merge); } } @@ -270,12 +383,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; - em->block_len += merge->block_len; - rb_erase_cached(&merge->rb_node, &tree->map); - RB_CLEAR_NODE(&merge->rb_node); - em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + merge_ondisk_extents(em, merge, em); + validate_extent_map(fs_info, em); em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; + remove_em(inode, merge); free_extent_map(merge); } } @@ -291,6 +404,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * Called after an extent has been written to disk properly. Set the generation * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). + * + * Returns: 0 on success + * -ENOENT when the extent is not found in the tree + * -EUCLEAN if the found extent does not match the expected start */ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { @@ -298,7 +415,6 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; - bool prealloc = false; write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); @@ -307,92 +423,89 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) btrfs_warn(fs_info, "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - start, len, gen); + start, start + len, gen); + ret = -ENOENT; goto out; } - if (WARN_ON(em->start != start)) + if (WARN_ON(em->start != start)) { btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - em->start, start, len, gen); + em->start, start, start + len, gen); + ret = -EUCLEAN; + goto out; + } em->generation = gen; em->flags &= ~EXTENT_FLAG_PINNED; - em->mod_start = em->start; - em->mod_len = em->len; - if (em->flags & EXTENT_FLAG_FILLING) { - prealloc = true; - em->flags &= ~EXTENT_FLAG_FILLING; - } - - try_merge_map(tree, em); - - if (prealloc) { - em->mod_start = em->start; - em->mod_len = em->len; - } + try_merge_map(inode, em); - free_extent_map(em); out: write_unlock(&tree->lock); + free_extent_map(em); return ret; } -void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { - lockdep_assert_held_write(&tree->lock); + lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; if (extent_map_in_tree(em)) - try_merge_map(tree, em); + try_merge_map(inode, em); } -static inline void setup_extent_mapping(struct extent_map_tree *tree, +static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, int modified) { refcount_inc(&em->refs); - em->mod_start = em->start; - em->mod_len = em->len; ASSERT(list_empty(&em->list)); if (modified) - list_add(&em->list, &tree->modified_extents); + list_add(&em->list, &inode->extent_tree.modified_extents); else - try_merge_map(tree, em); + try_merge_map(inode, em); } /* - * Add new extent map to the extent tree + * Add a new extent map to an inode's extent map tree. * - * @tree: tree to insert new map in + * @inode: the target inode * @em: map to insert * @modified: indicate whether the given @em should be added to the * modified list, which indicates the extent needs to be logged * - * Insert @em into @tree or perform a simple forward/backward merge with - * existing mappings. The extent_map struct passed in will be inserted - * into the tree directly, with an additional reference taken, or a - * reference dropped if the merge attempt was successful. + * Insert @em into the @inode's extent map tree or perform a simple + * forward/backward merge with existing mappings. The extent_map struct passed + * in will be inserted into the tree directly, with an additional reference + * taken, or a reference dropped if the merge attempt was successful. */ -static int add_extent_mapping(struct extent_map_tree *tree, +static int add_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, int modified) { - int ret = 0; + struct extent_map_tree *tree = &inode->extent_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; lockdep_assert_held_write(&tree->lock); - ret = tree_insert(&tree->map, em); + validate_extent_map(fs_info, em); + ret = tree_insert(&tree->root, em); if (ret) - goto out; + return ret; - setup_extent_mapping(tree, em, modified); -out: - return ret; + setup_extent_mapping(inode, em, modified); + + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) + percpu_counter_inc(&fs_info->evictable_extent_maps); + + return 0; } static struct extent_map * @@ -404,7 +517,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next); + rb_node = __tree_search(&tree->root, start, &prev_or_next); if (!rb_node) { if (prev_or_next) rb_node = prev_or_next; @@ -458,40 +571,47 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, } /* - * Remove an extent_map from the extent tree. + * Remove an extent_map from its inode's extent tree. * - * @tree: extent tree to remove from + * @inode: the inode the extent map belongs to * @em: extent map being removed * - * Remove @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use. + * Remove @em from the extent tree of @inode. No reference counts are dropped, + * and no checks are done to see if the range is in use. */ -void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); - rb_erase_cached(&em->rb_node, &tree->map); if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - RB_CLEAR_NODE(&em->rb_node); + + remove_em(inode, em); } -static void replace_extent_mapping(struct extent_map_tree *tree, +static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, int modified) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); + validate_extent_map(fs_info, new); + WARN_ON(cur->flags & EXTENT_FLAG_PINNED); ASSERT(extent_map_in_tree(cur)); if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); - rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); RB_CLEAR_NODE(&cur->rb_node); - setup_extent_mapping(tree, new, modified); + setup_extent_mapping(inode, new, modified); } static struct extent_map *next_extent_map(const struct extent_map *em) @@ -520,7 +640,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em) * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. */ -static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, +static noinline int merge_extent_mapping(struct btrfs_inode *inode, struct extent_map *existing, struct extent_map *em, u64 map_start) @@ -531,7 +651,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, u64 end; u64 start_diff; - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + if (map_start < em->start || map_start >= extent_map_end(em)) + return -EINVAL; if (existing->start > map_start) { next = existing; @@ -548,19 +669,15 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, start_diff = start - em->start; em->start = start; em->len = end - start; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - !extent_map_is_compressed(em)) { - em->block_start += start_diff; - em->block_len = em->len; - } - return add_extent_mapping(em_tree, em, 0); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + em->offset += start_diff; + return add_extent_mapping(inode, em, 0); } /* - * Add extent mapping into em_tree. + * Add extent mapping into an inode's extent map tree. * - * @fs_info: the filesystem - * @em_tree: extent tree into which we want to insert the extent mapping + * @inode: target inode * @em_in: extent we are inserting * @start: start of the logical range btrfs_get_extent() is requesting * @len: length of the logical range btrfs_get_extent() is requesting @@ -568,8 +685,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. * - * Insert @em_in into @em_tree. In case there is an overlapping range, handle - * the -EEXIST by either: + * Insert @em_in into the inode's extent map tree. In case there is an + * overlapping range, handle the -EEXIST by either: * a) Returning the existing extent in @em_in if @start is within the * existing em. * b) Merge the existing extent with @em_in passed in. @@ -577,21 +694,21 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Return 0 on success, otherwise -EEXIST. * */ -int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len) { int ret; struct extent_map *em = *em_in; + struct btrfs_fs_info *fs_info = inode->root->fs_info; /* * Tree-checker should have rejected any inline extent with non-zero * file offset. Here just do a sanity check. */ - if (em->block_start == EXTENT_MAP_INLINE) + if (em->disk_bytenr == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(em_tree, em, 0); + ret = add_extent_mapping(inode, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that * an overlapping map exists in the tree @@ -599,7 +716,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (ret == -EEXIST) { struct extent_map *existing; - existing = search_extent_mapping(em_tree, start, len); + existing = search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -620,15 +737,14 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * The existing extent map is the one nearest to * the [start, start + len) range which overlaps */ - ret = merge_extent_mapping(em_tree, existing, - em, start); - if (ret) { + ret = merge_extent_mapping(inode, existing, em, start); + if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; - WARN_ONCE(ret, -"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", - ret, existing->start, existing->len, - orig_start, orig_len); + btrfs_warn(fs_info, +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } @@ -643,19 +759,26 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * if needed. This avoids searching the tree, from the root down to the first * extent map, before each deletion. */ -static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +static void drop_all_extent_maps_fast(struct btrfs_inode *inode) { + struct extent_map_tree *tree = &inode->extent_tree; + struct rb_node *node; + write_lock(&tree->lock); - while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { + node = rb_first(&tree->root); + while (node) { struct extent_map *em; - struct rb_node *node; + struct rb_node *next = rb_next(node); - node = rb_first_cached(&tree->map); em = rb_entry(node, struct extent_map, rb_node); em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); - remove_extent_mapping(tree, em); + remove_extent_mapping(inode, em); free_extent_map(em); - cond_resched_rwlock_write(&tree->lock); + + if (cond_resched_rwlock_write(&tree->lock)) + node = rb_first(&tree->root); + else + node = next; } write_unlock(&tree->lock); } @@ -686,7 +809,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, WARN_ON(end < start); if (end == (u64)-1) { if (start == 0 && !skip_pinned) { - drop_all_extent_maps_fast(em_tree); + drop_all_extent_maps_fast(inode); return; } len = (u64)-1; @@ -716,7 +839,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, u64 gen; unsigned long flags; bool modified; - bool compressed; if (em_end < end) { next_em = next_extent_map(em); @@ -750,7 +872,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, goto remove_em; gen = em->generation; - compressed = extent_map_is_compressed(em); if (em->start < start) { if (!split) { @@ -762,28 +883,21 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->start = em->start; split->len = start - em->start; - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_start = em->orig_start; - split->block_start = em->block_start; - - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + split->disk_bytenr = em->disk_bytenr; + split->disk_num_bytes = em->disk_num_bytes; + split->offset = em->offset; split->ram_bytes = em->ram_bytes; } else { - split->orig_start = split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; + split->disk_bytenr = em->disk_bytenr; + split->disk_num_bytes = 0; + split->offset = 0; split->ram_bytes = split->len; } split->generation = gen; split->flags = flags; - replace_extent_mapping(em_tree, em, split, modified); + replace_extent_mapping(inode, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -797,40 +911,26 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } split->start = end; split->len = em_end - end; - split->block_start = em->block_start; + split->disk_bytenr = em->disk_bytenr; split->flags = flags; split->generation = gen; - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_block_len = max(em->block_len, - em->orig_block_len); - + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + split->disk_num_bytes = em->disk_num_bytes; + split->offset = em->offset + end - em->start; split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->orig_start = em->orig_start; - } else { - const u64 diff = start + len - em->start; - - split->block_len = split->len; - split->block_start += diff; - split->orig_start = em->orig_start; - } } else { + split->disk_num_bytes = 0; + split->offset = 0; split->ram_bytes = split->len; - split->orig_start = split->start; - split->block_len = 0; - split->orig_block_len = 0; } if (extent_map_in_tree(em)) { - replace_extent_mapping(em_tree, em, split, - modified); + replace_extent_mapping(inode, em, split, modified); } else { int ret; - ret = add_extent_mapping(em_tree, split, - modified); + ret = add_extent_mapping(inode, split, modified); /* Logic error, shouldn't happen. */ ASSERT(ret == 0); if (WARN_ON(ret != 0) && modified) @@ -865,7 +965,7 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(em_tree, em); + remove_extent_mapping(inode, em); } /* @@ -920,7 +1020,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, do { btrfs_drop_extent_map_range(inode, new_em->start, end, false); write_lock(&tree->lock); - ret = add_extent_mapping(tree, new_em, modified); + ret = add_extent_mapping(inode, new_em, modified); write_unlock(&tree->lock); } while (ret == -EEXIST); @@ -965,7 +1065,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, ASSERT(em->len == len); ASSERT(!extent_map_is_compressed(em)); - ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); ASSERT(em->flags & EXTENT_FLAG_PINNED); ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); ASSERT(!list_empty(&em->list)); @@ -976,15 +1076,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; split_pre->len = pre; - split_pre->orig_start = split_pre->start; - split_pre->block_start = new_logical; - split_pre->block_len = split_pre->len; - split_pre->orig_block_len = split_pre->block_len; + split_pre->disk_bytenr = new_logical; + split_pre->disk_num_bytes = split_pre->len; + split_pre->offset = 0; split_pre->ram_bytes = split_pre->len; split_pre->flags = flags; split_pre->generation = em->generation; - replace_extent_mapping(em_tree, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, 1); /* * Now we only have an extent_map at: @@ -994,14 +1093,13 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, /* Insert the middle extent_map. */ split_mid->start = em->start + pre; split_mid->len = em->len - pre; - split_mid->orig_start = split_mid->start; - split_mid->block_start = em->block_start + pre; - split_mid->block_len = split_mid->len; - split_mid->orig_block_len = split_mid->block_len; + split_mid->disk_bytenr = extent_map_block_start(em) + pre; + split_mid->disk_num_bytes = split_mid->len; + split_mid->offset = 0; split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); + add_extent_mapping(inode, split_mid, 1); /* Once for us */ free_extent_map(em); @@ -1016,3 +1114,270 @@ out_free_pre: free_extent_map(split_pre); return ret; } + +struct btrfs_em_shrink_ctx { + long nr_to_scan; + long scanned; +}; + +static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); + struct extent_map_tree *tree = &inode->extent_tree; + long nr_dropped = 0; + struct rb_node *node; + + lockdep_assert_held_write(&tree->lock); + + /* + * Take the mmap lock so that we serialize with the inode logging phase + * of fsync because we may need to set the full sync flag on the inode, + * in case we have to remove extent maps in the tree's list of modified + * extents. If we set the full sync flag in the inode while an fsync is + * in progress, we may risk missing new extents because before the flag + * is set, fsync decides to only wait for writeback to complete and then + * during inode logging it sees the flag set and uses the subvolume tree + * to find new extents, which may not be there yet because ordered + * extents haven't completed yet. + * + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. + */ + if (!down_read_trylock(&inode->i_mmap_lock)) + return 0; + + node = rb_first(&tree->root); + while (node) { + struct rb_node *next = rb_next(node); + struct extent_map *em; + + em = rb_entry(node, struct extent_map, rb_node); + ctx->scanned++; + + if (em->flags & EXTENT_FLAG_PINNED) + goto next; + + /* + * If the inode is in the list of modified extents (new) and its + * generation is the same (or is greater than) the current fs + * generation, it means it was not yet persisted so we have to + * set the full sync flag so that the next fsync will not miss + * it. + */ + if (!list_empty(&em->list) && em->generation >= cur_fs_gen) + btrfs_set_inode_full_sync(inode); + + remove_extent_mapping(inode, em); + trace_btrfs_extent_map_shrinker_remove_em(inode, em); + /* Drop the reference for the tree. */ + free_extent_map(em); + nr_dropped++; +next: + if (ctx->scanned >= ctx->nr_to_scan) + break; + + /* + * Stop if we need to reschedule or there's contention on the + * lock. This is to avoid slowing other tasks trying to take the + * lock. + */ + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) + break; + node = next; + } + up_read(&inode->i_mmap_lock); + + return nr_dropped; +} + +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. + */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + +static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_inode *inode; + long nr_dropped = 0; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; + + inode = find_first_inode_to_shrink(root, min_ino); + while (inode) { + nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); + + min_ino = btrfs_ino(inode) + 1; + fs_info->em_shrinker_last_ino = btrfs_ino(inode); + iput(&inode->vfs_inode); + + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) + break; + + cond_resched(); + + inode = find_first_inode_to_shrink(root, min_ino); + } + + if (inode) { + /* + * There are still inodes in this root or we happened to process + * the last one and reached the scan limit. In either case set + * the current root to this one, so we'll resume from the next + * inode if there is one or we will find out this was the last + * one and move to the next root. + */ + fs_info->em_shrinker_last_root = btrfs_root_id(root); + } else { + /* + * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so + * that when processing the next root we start from its first inode. + */ + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; + } + + return nr_dropped; +} + +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_em_shrink_ctx ctx; + u64 start_root_id; + u64 next_root_id; + bool cycled = false; + long nr_dropped = 0; + + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); + + ctx.scanned = 0; + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); + + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; + + if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); + } + + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { + struct btrfs_root *root; + unsigned long count; + + cond_resched(); + + spin_lock(&fs_info->fs_roots_radix_lock); + count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)&root, + (unsigned long)next_root_id, 1); + if (count == 0) { + spin_unlock(&fs_info->fs_roots_radix_lock); + if (start_root_id > 0 && !cycled) { + next_root_id = 0; + fs_info->em_shrinker_last_root = 0; + fs_info->em_shrinker_last_ino = 0; + cycled = true; + continue; + } + break; + } + next_root_id = btrfs_root_id(root) + 1; + root = btrfs_grab_root(root); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (!root) + continue; + + if (is_fstree(btrfs_root_id(root))) + nr_dropped += btrfs_scan_root(root, &ctx); + + btrfs_put_root(root); + } + + if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); + } + + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. + */ + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index e380fc08bbe4..cd123b266b64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,10 +3,17 @@ #ifndef BTRFS_EXTENT_MAP_H #define BTRFS_EXTENT_MAP_H +#include <linux/compiler_types.h> +#include <linux/spinlock_types.h> #include <linux/rbtree.h> +#include <linux/list.h> #include <linux/refcount.h> +#include "misc.h" #include "compression.h" +struct btrfs_inode; +struct btrfs_fs_info; + #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) @@ -22,29 +29,65 @@ enum { ENUM_BIT(EXTENT_FLAG_PREALLOC), /* Logging this extent */ ENUM_BIT(EXTENT_FLAG_LOGGING), - /* Filling in a preallocated extent */ - ENUM_BIT(EXTENT_FLAG_FILLING), /* This em is merged from two or more physically adjacent ems */ ENUM_BIT(EXTENT_FLAG_MERGED), }; /* + * This structure represents file extents and holes. + * + * Unlike on-disk file extent items, extent maps can be merged to save memory. + * This means members only match file extent items before any merging. + * * Keep this structure as compact as possible, as we can have really large * amounts of allocated extent maps at any time. */ struct extent_map { struct rb_node rb_node; - /* all of these are in bytes */ + /* All of these are in bytes. */ + + /* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */ u64 start; + + /* + * Length of the file extent. + * + * For non-inlined file extents it's btrfs_file_extent_item::num_bytes. + * For inline extents it's sectorsize, since inline data starts at + * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus + * btrfs_file_extent_item::num_bytes is not valid. + */ u64 len; - u64 mod_start; - u64 mod_len; - u64 orig_start; - u64 orig_block_len; + + /* + * The bytenr of the full on-disk extent. + * + * For regular extents it's btrfs_file_extent_item::disk_bytenr. + * For holes it's EXTENT_MAP_HOLE and for inline extents it's + * EXTENT_MAP_INLINE. + */ + u64 disk_bytenr; + + /* + * The full on-disk extent length, matching + * btrfs_file_extent_item::disk_num_bytes. + */ + u64 disk_num_bytes; + + /* + * Offset inside the decompressed extent. + * + * For regular extents it's btrfs_file_extent_item::offset. + * For holes and inline extents it's 0. + */ + u64 offset; + + /* + * The decompressed size of the whole on-disk extent, matching + * btrfs_file_extent_item::ram_bytes. + */ u64 ram_bytes; - u64 block_start; - u64 block_len; /* * Generation of the extent map, for merged em it's the highest @@ -58,7 +101,7 @@ struct extent_map { }; struct extent_map_tree { - struct rb_root_cached map; + struct rb_root root; struct list_head modified_extents; rwlock_t lock; }; @@ -106,6 +149,16 @@ static inline int extent_map_in_tree(const struct extent_map *em) return !RB_EMPTY_NODE(&em->rb_node); } +static inline u64 extent_map_block_start(const struct extent_map *em) +{ + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + if (extent_map_is_compressed(em)) + return em->disk_bytenr; + return em->disk_bytenr + em->offset; + } + return em->disk_bytenr; +} + static inline u64 extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) @@ -116,7 +169,7 @@ static inline u64 extent_map_end(const struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 new_logical); @@ -125,11 +178,10 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void __cold extent_map_exit(void); int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); -void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); +void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len); void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, @@ -137,5 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c new file mode 100644 index 000000000000..b80c07ad8c5e --- /dev/null +++ b/fs/btrfs/fiemap.c @@ -0,0 +1,930 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "backref.h" +#include "btrfs_inode.h" +#include "fiemap.h" +#include "file.h" +#include "file-item.h" + +struct btrfs_fiemap_entry { + u64 offset; + u64 phys; + u64 len; + u32 flags; +}; + +/* + * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file + * range from the inode's io tree, unlock the subvolume tree search path, flush + * the fiemap cache and relock the file range and research the subvolume tree. + * The value here is something negative that can't be confused with a valid + * errno value and different from 1 because that's also a return value from + * fiemap_fill_next_extent() and also it's often used to mean some btree search + * did not find a key, so make it some distinct negative value. + */ +#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) + +/* + * Used to: + * + * - Cache the next entry to be emitted to the fiemap buffer, so that we can + * merge extents that are contiguous and can be grouped as a single one; + * + * - Store extents ready to be written to the fiemap buffer in an intermediary + * buffer. This intermediary buffer is to ensure that in case the fiemap + * buffer is memory mapped to the fiemap target file, we don't deadlock + * during btrfs_page_mkwrite(). This is because during fiemap we are locking + * an extent range in order to prevent races with delalloc flushing and + * ordered extent completion, which is needed in order to reliably detect + * delalloc in holes and prealloc extents. And this can lead to a deadlock + * if the fiemap buffer is memory mapped to the file we are running fiemap + * against (a silly, useless in practice scenario, but possible) because + * btrfs_page_mkwrite() will try to lock the same extent range. + */ +struct fiemap_cache { + /* An array of ready fiemap entries. */ + struct btrfs_fiemap_entry *entries; + /* Number of entries in the entries array. */ + int entries_size; + /* Index of the next entry in the entries array to write to. */ + int entries_pos; + /* + * Once the entries array is full, this indicates what's the offset for + * the next file extent item we must search for in the inode's subvolume + * tree after unlocking the extent range in the inode's io tree and + * releasing the search path. + */ + u64 next_search_offset; + /* + * This matches struct fiemap_extent_info::fi_mapped_extents, we use it + * to count ourselves emitted extents and stop instead of relying on + * fiemap_fill_next_extent() because we buffer ready fiemap entries at + * the @entries array, and we want to stop as soon as we hit the max + * amount of extents to map, not just to save time but also to make the + * logic at extent_fiemap() simpler. + */ + unsigned int extents_mapped; + /* Fields for the cached extent (unsubmitted, not ready, extent). */ + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + for (int i = 0; i < cache->entries_pos; i++) { + struct btrfs_fiemap_entry *entry = &cache->entries[i]; + int ret; + + ret = fiemap_fill_next_extent(fieinfo, entry->offset, + entry->phys, entry->len, + entry->flags); + /* + * Ignore 1 (reached max entries) because we keep track of that + * ourselves in emit_fiemap_extent(). + */ + if (ret < 0) + return ret; + } + cache->entries_pos = 0; + + return 0; +} + +/* + * Helper to submit fiemap extent. + * + * Will try to merge current fiemap extent specified by @offset, @phys, + * @len and @flags with cached one. + * And only when we fails to merge, cached one will be submitted as + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + struct btrfs_fiemap_entry *entry; + u64 cache_end; + + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + + if (!cache->cached) + goto assign; + + /* + * When iterating the extents of the inode, at extent_fiemap(), we may + * find an extent that starts at an offset behind the end offset of the + * previous extent we processed. This happens if fiemap is called + * without FIEMAP_FLAG_SYNC and there are ordered extents completing + * after we had to unlock the file range, release the search path, emit + * the fiemap extents stored in the buffer (cache->entries array) and + * the lock the remainder of the range and re-search the btree. + * + * For example we are in leaf X processing its last item, which is the + * file extent item for file range [512K, 1M[, and after + * btrfs_next_leaf() releases the path, there's an ordered extent that + * completes for the file range [768K, 2M[, and that results in trimming + * the file extent item so that it now corresponds to the file range + * [512K, 768K[ and a new file extent item is inserted for the file + * range [768K, 2M[, which may end up as the last item of leaf X or as + * the first item of the next leaf - in either case btrfs_next_leaf() + * will leave us with a path pointing to the new extent item, for the + * file range [768K, 2M[, since that's the first key that follows the + * last one we processed. So in order not to report overlapping extents + * to user space, we trim the length of the previously cached extent and + * emit it. + * + * Upon calling btrfs_next_leaf() we may also find an extent with an + * offset smaller than or equals to cache->offset, and this happens + * when we had a hole or prealloc extent with several delalloc ranges in + * it, but after btrfs_next_leaf() released the path, delalloc was + * flushed and the resulting ordered extents were completed, so we can + * now have found a file extent item for an offset that is smaller than + * or equals to what we have in cache->offset. We deal with this as + * described below. + */ + cache_end = cache->offset + cache->len; + if (cache_end > offset) { + if (offset == cache->offset) { + /* + * We cached a dealloc range (found in the io tree) for + * a hole or prealloc extent and we have now found a + * file extent item for the same offset. What we have + * now is more recent and up to date, so discard what + * we had in the cache and use what we have just found. + */ + goto assign; + } else if (offset > cache->offset) { + /* + * The extent range we previously found ends after the + * offset of the file extent item we found and that + * offset falls somewhere in the middle of that previous + * extent range. So adjust the range we previously found + * to end at the offset of the file extent item we have + * just found, since this extent is more up to date. + * Emit that adjusted range and cache the file extent + * item we have just found. This corresponds to the case + * where a previously found file extent item was split + * due to an ordered extent completing. + */ + cache->len = offset - cache->offset; + goto emit; + } else { + const u64 range_end = offset + len; + + /* + * The offset of the file extent item we have just found + * is behind the cached offset. This means we were + * processing a hole or prealloc extent for which we + * have found delalloc ranges (in the io tree), so what + * we have in the cache is the last delalloc range we + * found while the file extent item we found can be + * either for a whole delalloc range we previously + * emitted or only a part of that range. + * + * We have two cases here: + * + * 1) The file extent item's range ends at or behind the + * cached extent's end. In this case just ignore the + * current file extent item because we don't want to + * overlap with previous ranges that may have been + * emitted already; + * + * 2) The file extent item starts behind the currently + * cached extent but its end offset goes beyond the + * end offset of the cached extent. We don't want to + * overlap with a previous range that may have been + * emitted already, so we emit the currently cached + * extent and then partially store the current file + * extent item's range in the cache, for the subrange + * going the cached extent's end to the end of the + * file extent item. + */ + if (range_end <= cache_end) + return 0; + + if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) + phys += cache_end - offset; + + offset = cache_end; + len = range_end - cache_end; + goto emit; + } + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + cache->flags == flags) { + cache->len += len; + return 0; + } + +emit: + /* Not mergeable, need to submit cached one */ + + if (cache->entries_pos == cache->entries_size) { + /* + * We will need to research for the end offset of the last + * stored extent and not from the current offset, because after + * unlocking the range and releasing the path, if there's a hole + * between that end offset and this current offset, a new extent + * may have been inserted due to a new write, so we don't want + * to miss it. + */ + entry = &cache->entries[cache->entries_size - 1]; + cache->next_search_offset = entry->offset + entry->len; + cache->cached = false; + + return BTRFS_FIEMAP_FLUSH_CACHE; + } + + entry = &cache->entries[cache->entries_pos]; + entry->offset = cache->offset; + entry->phys = cache->phys; + entry->len = cache->len; + entry->flags = cache->flags; + cache->entries_pos++; + cache->extents_mapped++; + + if (cache->extents_mapped == fieinfo->fi_extents_max) { + cache->cached = false; + return 1; + } +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; + + return 0; +} + +/* + * Emit last fiemap cache + * + * The last fiemap cache may still be cached in the following case: + * 0 4k 8k + * |<- Fiemap range ->| + * |<------------ First extent ----------->| + * + * In this case, the first extent range will be cached but not emitted. + * So we must emit it before ending extent_fiemap(). + */ +static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) +{ + struct extent_buffer *clone = path->nodes[0]; + struct btrfs_key key; + int slot; + int ret; + + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; + + /* + * Add a temporary extra ref to an already cloned extent buffer to + * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid + * the cost of allocating a new one. + */ + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); + atomic_inc(&clone->refs); + + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + goto out; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { + ret = 1; + goto out; + } + + /* + * Important to preserve the start field, for the optimizations when + * checking if extents are shared (see extent_fiemap()). + * + * We must set ->start before calling copy_extent_buffer_full(). If we + * are on sub-pagesize blocksize, we use ->start to determine the offset + * into the folio where our eb exists, and if we update ->start after + * the fact then any subsequent reads of the eb may read from a + * different offset in the folio than where we originally copied into. + */ + clone->start = path->nodes[0]->start; + /* See the comment at fiemap_search_slot() about why we clone. */ + copy_extent_buffer_full(clone, path->nodes[0]); + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; +out: + if (ret) + free_extent_buffer(clone); + + return ret; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. + */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + } + + /* + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we check if extents are shared, as backref walking may need to + * lock the same leaf we are processing. + */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. + */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct extent_state **delalloc_cached_state, + struct btrfs_backref_share_check_ctx *backref_ctx, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; + + /* + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. + */ + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; + + /* + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. + */ + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); + } + + /* + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. + */ + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + } + + return 0; +} + +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. + */ + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; + } + + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). + */ + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; + } + + /* + * Find the last file extent item that is not a hole (when NO_HOLES is + * not enabled). This should take at most 2 iterations in the worst + * case: we have one hole file extent item at slot 0 of a leaf and + * another hole file extent item as the last item in the previous leaf. + * This is because we merge file extent items that represent holes. + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + while (disk_bytenr == 0) { + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* No file extent items that are not holes. */ + *last_extent_end_ret = 0; + return 0; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + } + + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; +} + +static int extent_fiemap(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; + struct btrfs_path *path; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_share_check_ctx *backref_ctx; + u64 last_extent_end = 0; + u64 prev_extent_end; + u64 range_start; + u64 range_end; + const u64 sectorsize = inode->root->fs_info->sectorsize; + bool stopped = false; + int ret; + + cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry); + cache.entries = kmalloc_array(cache.entries_size, + sizeof(struct btrfs_fiemap_entry), + GFP_KERNEL); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + path = btrfs_alloc_path(); + if (!cache.entries || !backref_ctx || !path) { + ret = -ENOMEM; + goto out; + } + +restart: + range_start = round_down(start, sectorsize); + range_end = round_up(start + len, sectorsize); + prev_extent_end = range_start; + + lock_extent(&inode->io_tree, range_start, range_end, &cached_state); + + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) + goto out_unlock; + btrfs_release_path(path); + + path->reada = READA_FORWARD; + ret = fiemap_search_slot(inode, path, range_start); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* + * No file extent item found, but we may have delalloc between + * the current offset and i_size. So check for that. + */ + ret = 0; + goto check_eof_delalloc; + } + + while (prev_extent_end < range_end) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 extent_end; + u64 extent_len; + u64 extent_offset = 0; + u64 extent_gen; + u64 disk_bytenr = 0; + u64 flags = 0; + int extent_type; + u8 compression; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); + + /* + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. + */ + if (extent_end <= range_start) + goto next_item; + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* We have in implicit hole (NO_HOLES feature enabled). */ + if (prev_extent_end < key.offset) { + const u64 hole_end = min(key.offset, range_end) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + prev_extent_end, hole_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= range_end) { + stopped = true; + break; + } + } + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) + flags |= FIEMAP_EXTENT_ENCODED; + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, + disk_bytenr, extent_offset, + extent_gen, key.offset, + extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); + } + + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* emit_fiemap_extent() told us to stop. */ + stopped = true; + break; + } + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; + } + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. */ + break; + } + cond_resched(); + } + +check_eof_delalloc: + if (!stopped && prev_extent_end < range_end) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, + 0, 0, 0, prev_extent_end, range_end - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = range_end; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; + } + } + +out_unlock: + unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + + if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { + btrfs_release_path(path); + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + len -= cache.next_search_offset - start; + start = cache.next_search_offset; + goto restart; + } else if (ret < 0) { + goto out; + } + + /* + * Must free the path before emitting to the fiemap buffer because we + * may have a non-cloned leaf and if the fiemap buffer is memory mapped + * to a file, a write into it (through btrfs_page_mkwrite()) may trigger + * waiting for an ordered extent that in order to complete needs to + * modify that leaf, therefore leading to a deadlock. + */ + btrfs_free_path(path); + path = NULL; + + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + + ret = emit_last_fiemap_cache(fieinfo, &cache); +out: + free_extent_state(delalloc_cached_state); + kfree(cache.entries); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); + return ret; +} + +int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + int ret; + + ret = fiemap_prep(inode, fieinfo, start, &len, 0); + if (ret) + return ret; + + /* + * fiemap_prep() called filemap_write_and_wait() for the whole possible + * file range (0 to LLONG_MAX), but that is not enough if we have + * compression enabled. The first filemap_fdatawrite_range() only kicks + * in the compression of data (in an async thread) and will return + * before the compression is done and writeback is started. A second + * filemap_fdatawrite_range() is needed to wait for the compression to + * complete and writeback to start. We also need to wait for ordered + * extents to complete, because our fiemap implementation uses mainly + * file extent items to list the extents, searching for extent maps + * only for file ranges with holes or prealloc extents to figure out + * if we have delalloc in those ranges. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); + if (ret) + return ret; + } + + btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); + + /* + * We did an initial flush to avoid holding the inode's lock while + * triggering writeback and waiting for the completion of IO and ordered + * extents. Now after we locked the inode we do it again, because it's + * possible a new write may have happened in between those two steps. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); + if (ret) { + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + return ret; + } + } + + ret = extent_fiemap(btrfs_inode, fieinfo, start, len); + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + + return ret; +} diff --git a/fs/btrfs/fiemap.h b/fs/btrfs/fiemap.h new file mode 100644 index 000000000000..cfd74b35988f --- /dev/null +++ b/fs/btrfs/fiemap.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FIEMAP_H +#define BTRFS_FIEMAP_H + +#include <linux/fiemap.h> + +int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +#endif /* BTRFS_FIEMAP_H */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 81ac1d474bf1..d04a3b47b1fb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -10,17 +10,14 @@ #include <linux/sched/mm.h> #include <crypto/hash.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "bio.h" -#include "print-tree.h" #include "compression.h" #include "fs.h" #include "accessors.h" #include "file-item.h" -#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -48,13 +45,12 @@ */ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start, end, i_size; int ret; spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); - if (btrfs_fs_incompat(fs_info, NO_HOLES)) { + if (!inode->file_extent_tree) { inode->disk_i_size = i_size; goto out_unlock; } @@ -87,13 +83,14 @@ out_unlock: int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len) { + if (!inode->file_extent_tree) + return 0; + if (len == 0) return 0; ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); - if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) - return 0; return set_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -115,14 +112,15 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len) { + if (!inode->file_extent_tree) + return 0; + if (len == 0) return 0; ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || len == (u64)-1); - if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) - return 0; return clear_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -153,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) * Calculate the total size needed to allocate for an ordered sum structure * spanning @bytes in the file. */ -static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes) { return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } @@ -179,7 +177,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -193,8 +190,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -356,7 +351,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) + test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; /* @@ -434,8 +429,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) memset(csum_dst, 0, csum_size); count = 1; - if (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; set_extent_bit(&inode->io_tree, file_offset, @@ -454,9 +448,22 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) return ret; } +/* + * Search for checksums for a given logical range. + * + * @root: The root where to look for checksums. + * @start: Logical address of target checksum range. + * @end: End offset (inclusive) of the target checksum range. + * @list: List for adding each checksum that was found. + * Can be NULL in case the caller only wants to check if + * there any checksums for the range. + * @nowait: Indicate if the search must be non-blocking or not. + * + * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were + * found. + */ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait) + struct list_head *list, bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -464,8 +471,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; - LIST_HEAD(tmplist); int ret; + bool found_csums = false; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -475,11 +482,6 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, return -ENOMEM; path->nowait = nowait; - if (search_commit) { - path->skip_locking = 1; - path->reada = READA_FORWARD; - path->search_commit_root = 1; - } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.offset = start; @@ -487,7 +489,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto fail; + goto out; if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); @@ -522,7 +524,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto fail; + goto out; if (ret > 0) break; leaf = path->nodes[0]; @@ -544,6 +546,10 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, continue; } + found_csums = true; + if (!list) + goto out; + csum_end = min(csum_end, end + 1); item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); @@ -557,7 +563,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, GFP_NOFS); if (!sums) { ret = -ENOMEM; - goto fail; + goto out; } sums->logical = start; @@ -571,21 +577,24 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, bytes_to_csum_size(fs_info, size)); start += size; - list_add_tail(&sums->list, &tmplist); + list_add_tail(&sums->list, list); } path->slots[0]++; } - ret = 0; -fail: - while (ret < 0 && !list_empty(&tmplist)) { - sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); +out: + btrfs_free_path(path); + if (ret < 0) { + if (list) { + struct btrfs_ordered_sum *tmp_sums; + + list_for_each_entry_safe(sums, tmp_sums, list, list) + kfree(sums); + } + + return ret; } - list_splice_tail(&tmplist, list); - btrfs_free_path(path); - return ret; + return found_csums ? 1 : 0; } /* @@ -874,8 +883,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; - ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); path = btrfs_alloc_path(); if (!path) @@ -1175,7 +1184,7 @@ extend_csum: * search, etc, because log trees are temporary anyway and it * would only save a few bytes of leaf space. */ - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { if (path->slots[0] + 1 >= btrfs_header_nritems(path->nodes[0])) { ret = find_next_csum_offset(root, path, &next_offset); @@ -1229,8 +1238,6 @@ insert: ins_size); if (ret < 0) goto out; - if (WARN_ON(ret != 0)) - goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -1250,7 +1257,6 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); @@ -1263,7 +1269,7 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1271,55 +1277,56 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; struct btrfs_key key; - u64 extent_start, extent_end; - u64 bytenr; + u64 extent_start; u8 type = btrfs_file_extent_type(leaf, fi); int compress_type = btrfs_file_extent_compression(leaf, fi); btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; - extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + const u64 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + em->start = extent_start; - em->len = extent_end - extent_start; - em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, fi); - em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; + em->len = btrfs_file_extent_end(path) - extent_start; + if (disk_bytenr == 0) { + em->disk_bytenr = EXTENT_MAP_HOLE; + em->disk_num_bytes = 0; + em->offset = 0; return; } + em->disk_bytenr = disk_bytenr; + em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + em->offset = btrfs_file_extent_offset(leaf, fi); if (compress_type != BTRFS_COMPRESS_NONE) { extent_map_set_compression(em, compress_type); - em->block_start = bytenr; - em->block_len = em->orig_block_len; } else { - bytenr += btrfs_file_extent_offset(leaf, fi); - em->block_start = bytenr; - em->block_len = em->len; + /* + * Older kernels can create regular non-hole data + * extents with ram_bytes smaller than disk_num_bytes. + * Not a big deal, just always use disk_num_bytes + * for ram_bytes. + */ + em->ram_bytes = em->disk_num_bytes; if (type == BTRFS_FILE_EXTENT_PREALLOC) em->flags |= EXTENT_FLAG_PREALLOC; } } else if (type == BTRFS_FILE_EXTENT_INLINE) { - em->block_start = EXTENT_MAP_INLINE; - em->start = extent_start; - em->len = extent_end - extent_start; - /* - * Initialize orig_start and block_len with the same values - * as in inode.c:btrfs_get_extent(). - */ - em->orig_start = EXTENT_MAP_HOLE; - em->block_len = (u64)-1; + /* Tree-checker has ensured this. */ + ASSERT(extent_start == 0); + + em->disk_bytenr = EXTENT_MAP_INLINE; + em->start = 0; + em->len = fs_info->sectorsize; + em->offset = 0; extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " "root %llu", type, btrfs_ino(inode), extent_start, - root->root_key.objectid); + btrfs_root_id(root)); } } @@ -1340,12 +1347,10 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path) ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { - end = btrfs_file_extent_ram_bytes(leaf, fi); - end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); - } else { + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) + end = leaf->fs_info->sectorsize; + else end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - } return end; } diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 04bd2d34efb1..0e13661a71f3 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,21 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> #include "accessors.h" +struct extent_map; +struct btrfs_file_extent_item; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_bio; +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_ordered_sum; +struct btrfs_path; +struct btrfs_inode; + #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) @@ -55,14 +68,13 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); + struct list_head *list, bool nowait); int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, u64 start, u64 end, u8 *csum_buf, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 38dfcac47609..0b568c8d24cb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -17,15 +17,13 @@ #include <linux/uio.h> #include <linux/iversion.h> #include <linux/fsverity.h> -#include <linux/iomap.h> #include "ctree.h" +#include "direct-io.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "volumes.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" @@ -38,100 +36,42 @@ #include "ioctl.h" #include "file.h" #include "super.h" - -/* simple helper to fault in pages and copy. This should go away - * and be replaced with calls into generic code. - */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct page **prepared_pages, - struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int pg = 0; - int offset = offset_in_page(pos); - - while (write_bytes > 0) { - size_t count = min_t(size_t, - PAGE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[pg]; - /* - * Copy data from userspace to the current page - */ - copied = copy_page_from_iter_atomic(page, offset, count, i); - - /* Flush processor's dcache for this page */ - flush_dcache_page(page); - - /* - * if we get a partial write, we can end up with - * partially up to date pages. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (unlikely(copied < count)) { - if (!PageUptodate(page)) { - iov_iter_revert(i, copied); - copied = 0; - } - if (!copied) - break; - } - - write_bytes -= copied; - total_copied += copied; - offset += copied; - if (offset == PAGE_SIZE) { - pg++; - offset = 0; - } - } - return total_copied; -} +#include "print-tree.h" /* - * unlocks pages after btrfs_file_write is done with them + * Unlock folio after btrfs_file_write() is done with it. */ -static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, - struct page **pages, size_t num_pages, +static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, u64 pos, u64 copied) { - size_t i; u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); - for (i = 0; i < num_pages; i++) { - /* page checked is some magic around finding pages that - * have been modified without going through btrfs_set_page_dirty - * clear it here. There should be no need to mark the pages - * accessed as prepare_pages should have marked them accessed - * in prepare_pages via find_or_create_page() - */ - btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), - block_start, block_len); - unlock_page(pages[i]); - put_page(pages[i]); - } + /* + * Folio checked is some magic around finding folios that have been + * modified without going through btrfs_dirty_folio(). Clear it here. + * There should be no need to mark the pages accessed as + * prepare_one_folio() should have marked them accessed in + * prepare_one_folio() via find_or_create_page() + */ + btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); + folio_unlock(folio); + folio_put(folio); } /* - * After btrfs_copy_from_user(), update the following things for delalloc: - * - Mark newly dirtied pages as DELALLOC in the io tree. + * After copy_folio_from_iter_atomic(), update the following things for delalloc: + * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. - * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve) +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int err = 0; - int i; + int ret = 0; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -149,6 +89,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); + ASSERT(folio_pos(folio) <= pos && + folio_pos(folio) + folio_size(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -160,21 +102,14 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached); - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); - if (err) - return err; + if (ret) + return ret; - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - - btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), - start_pos, num_bytes); - } + btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated @@ -208,7 +143,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); @@ -245,10 +179,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; - update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -266,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } ret = btrfs_next_leaf(root, path); if (ret < 0) break; @@ -342,7 +280,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -372,18 +314,19 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, - disk_bytenr, num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - new_key.objectid, - args->start - extent_offset, - 0, false); + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, new_key.objectid, + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -416,7 +359,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -428,7 +370,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -436,7 +382,6 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -456,7 +401,11 @@ delete_extent_item: del_slot = path->slots[0]; del_nr = 1; } else { - BUG_ON(del_slot + del_nr != path->slots[0]); + if (WARN_ON(del_slot + del_nr != path->slots[0])) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } del_nr++; } @@ -466,15 +415,17 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_DROP_DELAYED_REF, - disk_bytenr, num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - key.objectid, - key.offset - extent_offset, 0, - false); + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, key.objectid, + key.offset - extent_offset, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -685,7 +636,6 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -714,7 +664,6 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -748,12 +697,14 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(trans, leaf); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, - num_bytes, 0, root->root_key.objectid); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset, 0, false); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -776,10 +727,14 @@ again: other_start = end; other_end = 0; - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, 0, root->root_key.objectid); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, - 0, false); + + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -820,7 +775,6 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -829,7 +783,6 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -843,53 +796,47 @@ out: } /* - * on error we return an unlocked page and the error value - * on success we return a locked page and 0 + * On error return an unlocked folio and the error value + * On success return a locked folio and 0 */ -static int prepare_uptodate_page(struct inode *inode, - struct page *page, u64 pos, - bool force_uptodate) +static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, + u64 len, bool force_uptodate) { - struct folio *folio = page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } - - /* - * Since btrfs_read_folio() will unlock the folio before it - * returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. - */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } - } - return 0; -} + if (folio_test_uptodate(folio)) + return 0; -static fgf_t get_prepare_fgp_flags(bool nowait) -{ - fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + if (!force_uptodate && + IS_ALIGNED(clamp_start, PAGE_SIZE) && + IS_ALIGNED(clamp_end, PAGE_SIZE)) + return 0; - if (nowait) - fgp_flags |= FGP_NOWAIT; + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } - return fgp_flags; + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private. + */ + if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; + } + return 0; } static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) @@ -906,89 +853,68 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) } /* - * this just gets pages into the page cache and locks them down. + * Get folio into the page cache and lock it. */ -static noinline int prepare_pages(struct inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate, - bool nowait) +static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, + loff_t pos, size_t write_bytes, + bool force_uptodate, bool nowait) { - int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = get_prepare_fgp_flags(nowait); - int err = 0; - int faili; + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + struct folio *folio; + int ret = 0; - for (i = 0; i < num_pages; i++) { again: - pages[i] = pagecache_get_page(inode->i_mapping, index + i, - fgp_flags, mask | __GFP_WRITE); - if (!pages[i]) { - faili = i - 1; - if (nowait) - err = -EAGAIN; - else - err = -ENOMEM; - goto fail; - } - - err = set_page_extent_mapped(pages[i]); - if (err < 0) { - faili = i; - goto fail; - } - - if (i == 0) - err = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!err && i == num_pages - 1) - err = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); - if (err) { - put_page(pages[i]); - if (!nowait && err == -EAGAIN) { - err = 0; - goto again; - } - faili = i - 1; - goto fail; + folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); + if (IS_ERR(folio)) { + if (nowait) + ret = -EAGAIN; + else + ret = PTR_ERR(folio); + return ret; + } + folio_wait_writeback(folio); + /* Only support page sized folio yet. */ + ASSERT(folio_order(folio) == 0); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + folio_unlock(folio); + folio_put(folio); + return ret; + } + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + if (ret) { + /* The folio is already unlocked. */ + folio_put(folio); + if (!nowait && ret == -EAGAIN) { + ret = 0; + goto again; } - wait_on_page_writeback(pages[i]); + return ret; } - + *folio_ret = folio; return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - put_page(pages[faili]); - faili--; - } - return err; - } /* - * This function locks the extent and properly waits for data=ordered extents - * to finish before allowing the pages to be modified if need. + * Locks the extent and properly waits for data=ordered extents to finish + * before allowing the folios to be modified if need. * - * The return value: + * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK - * -EAGAIN - need re-prepare the pages - * the other < 0 number - Something wrong happens + * -EAGAIN - need to prepare the folios again */ static noinline int -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, + loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; - int i; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); @@ -1000,12 +926,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (nowait) { if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - pages[i] = NULL; - } - + folio_unlock(folio); + folio_put(folio); return -EAGAIN; } } else { @@ -1019,10 +941,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered->file_offset <= last_pos) { unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; @@ -1036,11 +956,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * We should be called after prepare_pages() which should have locked + * We should be called after prepare_one_folio() which should have locked * all pages in the range. */ - for (i = 0; i < num_pages; i++) - WARN_ON(!PageLocked(pages[i])); + WARN_ON(!folio_test_locked(folio)); return ret; } @@ -1096,7 +1015,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL, nowait, false); + NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else @@ -1112,36 +1031,14 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now, ts; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - ts = inode_get_mtime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_mtime_to_ts(inode, now); - - ts = inode_get_ctime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_ctime_to_ts(inode, now); - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - -static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, - size_t count) +int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1163,11 +1060,13 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + inode_inc_iversion(inode); + } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); @@ -1179,27 +1078,23 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, return 0; } -static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, - struct iov_iter *i) +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct page **pages = NULL; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; size_t num_written = 0; - int nrptrs; ssize_t ret; - bool only_release_metadata = false; - bool force_page_uptodate = false; - loff_t old_isize = i_size_read(inode); + loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + bool only_release_metadata = false; if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1208,42 +1103,37 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (ret < 0) return ret; + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + ret = generic_write_checks(iocb, i); if (ret <= 0) goto out; - ret = btrfs_write_check(iocb, i, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; pos = iocb->ki_pos; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), - PAGE_SIZE / (sizeof(struct page *))); - nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); - nrptrs = max(nrptrs, 8); - pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out; - } - while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_SIZE - - offset); - size_t num_pages; + size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); size_t reserve_bytes; - size_t dirty_pages; size_t copied; size_t dirty_sectors; size_t num_sectors; + struct folio *folio = NULL; int extents_locked; + bool force_page_uptodate = false; /* - * Fault pages before locking them in prepare_pages + * Fault pages before locking them in prepare_one_folio() * to avoid recursive lock */ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { @@ -1282,8 +1172,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, only_release_metadata = true; } - num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); - WARN_ON(num_pages > nrptrs); reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); @@ -1311,23 +1199,17 @@ again: break; } - /* - * This is going to setup the pages array with the number of - * pages we want, so we don't really need to worry about the - * contents of pages from loop to loop - */ - ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, force_page_uptodate, false); + ret = prepare_one_folio(inode, &folio, pos, write_bytes, + force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); break; } - extents_locked = lock_and_cleanup_extent_if_need( - BTRFS_I(inode), pages, - num_pages, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); + extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), + folio, pos, write_bytes, &lockstart, + &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) goto again; @@ -1338,28 +1220,34 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + copied = copy_folio_from_iter_atomic(folio, + offset_in_folio(folio, pos), write_bytes, i); + flush_dcache_folio(folio); + + /* + * If we get a partial write, we can end up with partially + * uptodate page. Although if sector size < page size we can + * handle it, but if it's not sector aligned it can cause + * a lot of complexity, so make sure they don't happen by + * forcing retry this copy. + */ + if (unlikely(copied < write_bytes)) { + if (!folio_test_uptodate(folio)) { + iov_iter_revert(i, copied); + copied = 0; + } + } num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, fs_info->sectorsize); dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - /* - * if we have trouble faulting in the pages, fall - * back to one page at a time - */ - if (copied < write_bytes) - nrptrs = 1; - if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; - dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); } if (num_sectors > dirty_sectors) { @@ -1369,13 +1257,10 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); + u64 release_start = round_up(pos + copied, + fs_info->sectorsize); btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, + data_reserved, release_start, release_bytes, true); } } @@ -1383,15 +1268,14 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, + ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's * start offset is >= i_size, we might still have a non-NULL * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_pages(). Therefore free any + * as delalloc through btrfs_dirty_page(). Therefore free any * possible cached extent state to avoid a memory leak. */ if (extents_locked) @@ -1402,7 +1286,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); break; } @@ -1410,7 +1294,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); cond_resched(); @@ -1418,8 +1302,6 @@ again: num_written += copied; } - kfree(pages); - if (release_bytes) { if (only_release_metadata) { btrfs_check_nocow_unlock(BTRFS_I(inode)); @@ -1443,194 +1325,6 @@ out: return num_written ? num_written : ret; } -static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - const u32 blocksize_mask = fs_info->sectorsize - 1; - - if (offset & blocksize_mask) - return -EINVAL; - - if (iov_iter_alignment(iter) & blocksize_mask) - return -EINVAL; - - return 0; -} - -static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - loff_t pos; - ssize_t written = 0; - ssize_t written_buffered; - size_t prev_left = 0; - loff_t endbyte; - ssize_t err; - unsigned int ilock_flags = 0; - struct iomap_dio *dio; - - if (iocb->ki_flags & IOCB_NOWAIT) - ilock_flags |= BTRFS_ILOCK_TRY; - - /* - * If the write DIO is within EOF, use a shared lock and also only if - * security bits will likely not be dropped by file_remove_privs() called - * from btrfs_write_check(). Either will need to be rechecked after the - * lock was acquired. - */ - if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) - ilock_flags |= BTRFS_ILOCK_SHARED; - -relock: - err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (err < 0) - return err; - - /* Shared lock cannot be used with security bits set. */ - if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - ilock_flags &= ~BTRFS_ILOCK_SHARED; - goto relock; - } - - err = generic_write_checks(iocb, from); - if (err <= 0) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - return err; - } - - err = btrfs_write_check(iocb, from, err); - if (err < 0) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - goto out; - } - - pos = iocb->ki_pos; - /* - * Re-check since file size may have changed just before taking the - * lock or pos may have changed because of O_APPEND in generic_write_check() - */ - if ((ilock_flags & BTRFS_ILOCK_SHARED) && - pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - ilock_flags &= ~BTRFS_ILOCK_SHARED; - goto relock; - } - - if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - goto buffered; - } - - /* - * The iov_iter can be mapped to the same file range we are writing to. - * If that's the case, then we will deadlock in the iomap code, because - * it first calls our callback btrfs_dio_iomap_begin(), which will create - * an ordered extent, and after that it will fault in the pages that the - * iov_iter refers to. During the fault in we end up in the readahead - * pages code (starting at btrfs_readahead()), which will lock the range, - * find that ordered extent and then wait for it to complete (at - * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since - * obviously the ordered extent can never complete as we didn't submit - * yet the respective bio(s). This always happens when the buffer is - * memory mapped to the same file range, since the iomap DIO code always - * invalidates pages in the target file range (after starting and waiting - * for any writeback). - * - * So here we disable page faults in the iov_iter and then retry if we - * got -EFAULT, faulting in the pages before the retry. - */ - from->nofault = true; - dio = btrfs_dio_write(iocb, from, written); - from->nofault = false; - - /* - * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync - * iocb, and that needs to lock the inode. So unlock it before calling - * iomap_dio_complete() to avoid a deadlock. - */ - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - - if (IS_ERR_OR_NULL(dio)) - err = PTR_ERR_OR_ZERO(dio); - else - err = iomap_dio_complete(dio); - - /* No increment (+=) because iomap returns a cumulative value. */ - if (err > 0) - written = err; - - if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { - const size_t left = iov_iter_count(from); - /* - * We have more data left to write. Try to fault in as many as - * possible of the remainder pages and retry. We do this without - * releasing and locking again the inode, to prevent races with - * truncate. - * - * Also, in case the iov refers to pages in the file range of the - * file we want to write to (due to a mmap), we could enter an - * infinite loop if we retry after faulting the pages in, since - * iomap will invalidate any pages in the range early on, before - * it tries to fault in the pages of the iov. So we keep track of - * how much was left of iov in the previous EFAULT and fallback - * to buffered IO in case we haven't made any progress. - */ - if (left == prev_left) { - err = -ENOTBLK; - } else { - fault_in_iov_iter_readable(from, left); - prev_left = left; - goto relock; - } - } - - /* - * If 'err' is -ENOTBLK or we have not written all data, then it means - * we must fallback to buffered IO. - */ - if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) - goto out; - -buffered: - /* - * If we are in a NOWAIT context, then return -EAGAIN to signal the caller - * it must retry the operation in a context where blocking is acceptable, - * because even if we end up not blocking during the buffered IO attempt - * below, we will block when flushing and waiting for the IO. - */ - if (iocb->ki_flags & IOCB_NOWAIT) { - err = -EAGAIN; - goto out; - } - - pos = iocb->ki_pos; - written_buffered = btrfs_buffered_write(iocb, from); - if (written_buffered < 0) { - err = written_buffered; - goto out; - } - /* - * Ensure all data is persisted. We want the next direct IO read to be - * able to read what was just written. - */ - endbyte = pos + written_buffered - 1; - err = btrfs_fdatawrite_range(inode, pos, endbyte); - if (err) - goto out; - err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); - if (err) - goto out; - written += written_buffered; - iocb->ki_pos = pos + written_buffered; - invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); -out: - return err < 0 ? err : written; -} - static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { @@ -1652,7 +1346,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (ret || encoded->len == 0) goto out; - ret = btrfs_write_check(iocb, from, encoded->len); + ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; @@ -1730,7 +1424,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) return 0; } -static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end) { int ret; struct blk_plug plug; @@ -1750,7 +1444,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) { - struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_inode *inode = ctx->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && @@ -1786,14 +1480,21 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; u64 len; bool full_sync; + bool skip_ilock = false; + + if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { + skip_ilock = true; + current->journal_info = NULL; + btrfs_assert_inode_locked(inode); + } trace_btrfs_sync_file(file, datasync); @@ -1821,7 +1522,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + down_write(&inode->i_mmap_lock); + else + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -1845,7 +1549,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -1857,8 +1564,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * running delalloc the full sync flag may be set if we need to drop * extra extent map ranges due to temporary memory allocation failures. */ - full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We have to do this here to avoid the priority inversion of waiting on @@ -1877,15 +1583,29 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); + clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing * checksum lookups in the csum tree, and use instead the * checksums attached to the ordered extents. */ - btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), - &ctx.ordered_extents); - ret = filemap_fdatawait_range(inode->i_mapping, start, end); + btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end); + if (ret) + goto out_release_extents; + + /* + * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after + * starting and waiting for writeback, because for buffered IO + * it may have been set during the end IO callback + * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in + * case an error happened and we need to wait for ordered + * extents to complete so that any extent maps that point to + * unwritten locations are dropped and we don't log them. + */ + if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags)) + ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) @@ -1899,8 +1619,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ - clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * An ordered extent might have started before and completed * already with io errors, in which case the inode was not @@ -1908,10 +1627,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * for any errors that might have happened since we last * checked called fsync. */ - ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); + ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err); goto out_release_extents; } + btrfs_init_log_ctx_scratch_eb(&ctx); + /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for @@ -1931,6 +1652,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + /* + * Scratch eb no longer needed, release before syncing log or commit + * transaction, to avoid holding unnecessary memory during such long + * operations. + */ + if (ctx.scratch_eb) { + free_extent_buffer(ctx.scratch_eb); + ctx.scratch_eb = NULL; + } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -1947,7 +1677,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2006,6 +1739,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); @@ -2015,10 +1749,179 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct folio *folio = page_folio(page); + struct inode *inode = file_inode(vmf->vma->vm_file); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + unsigned long zero_start; + loff_t size; + vm_fault_t ret; + int ret2; + int reserved = 0; + u64 reserved_space; + u64 page_start; + u64 page_end; + u64 end; + + ASSERT(folio_order(folio) == 0); + + reserved_space = PAGE_SIZE; + + sb_start_pagefault(inode->i_sb); + page_start = folio_pos(folio); + page_end = page_start + folio_size(folio) - 1; + end = page_end; + + /* + * Reserving delalloc space after obtaining the page lock can lead to + * deadlock. For example, if a dirty page is locked by this function + * and the call to btrfs_delalloc_reserve_space() ends up triggering + * dirty page write out, then the btrfs_writepages() function could + * end up waiting indefinitely to get a lock on the page currently + * being processed by btrfs_page_mkwrite() function. + */ + ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (!ret2) { + ret2 = file_update_time(vmf->vma->vm_file); + reserved = 1; + } + if (ret2) { + ret = vmf_error(ret2); + if (reserved) + goto out; + goto out_noreserve; + } + + /* Make the VM retry the fault. */ + ret = VM_FAULT_NOPAGE; +again: + down_read(&BTRFS_I(inode)->i_mmap_lock); + folio_lock(folio); + size = i_size_read(inode); + + if ((folio->mapping != inode->i_mapping) || + (page_start >= size)) { + /* Page got truncated out from underneath us. */ + goto out_unlock; + } + folio_wait_writeback(folio); + + lock_extent(io_tree, page_start, page_end, &cached_state); + ret2 = set_folio_extent_mapped(folio); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } + + /* + * We can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish. + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + folio_unlock(folio); + up_read(&BTRFS_I(inode)->i_mmap_lock); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } + + if (folio->index == ((size - 1) >> PAGE_SHIFT)) { + reserved_space = round_up(size - page_start, fs_info->sectorsize); + if (reserved_space < PAGE_SIZE) { + end = page_start + reserved_space - 1; + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, page_start, + PAGE_SIZE - reserved_space, true); + } + } + + /* + * page_mkwrite gets called when the page is firstly dirtied after it's + * faulted in, but write(2) could also dirty a page and set delalloc + * bits, thus in this case for space account reason, we still need to + * clear any delalloc bits within this page range since we have to + * reserve data&meta space before lock_page() (see above comments). + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); + + ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + &cached_state); + if (ret2) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + + /* Page is wholly or partially inside EOF. */ + if (page_start + folio_size(folio) > size) + zero_start = offset_in_folio(folio, size); + else + zero_start = PAGE_SIZE; + + if (zero_start != PAGE_SIZE) + folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); + + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); + btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); + + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + + unlock_extent(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); + + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; + +out_unlock: + folio_unlock(folio); + up_read(&BTRFS_I(inode)->i_mmap_lock); +out: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, + reserved_space, (ret != 0)); +out_noreserve: + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return ret; +} + static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -2110,7 +2013,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2127,7 +2029,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2148,11 +2049,9 @@ out: hole_em->start = offset; hole_em->len = end - offset; hole_em->ram_bytes = hole_em->len; - hole_em->orig_start = offset; - hole_em->block_start = EXTENT_MAP_HOLE; - hole_em->block_len = 0; - hole_em->orig_block_len = 0; + hole_em->disk_bytenr = EXTENT_MAP_HOLE; + hole_em->disk_num_bytes = 0; hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -2176,14 +2075,14 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, + em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ - if (em->block_start == EXTENT_MAP_HOLE) { + if (em->disk_bytenr == EXTENT_MAP_HOLE) { ret = 1; *len = em->start + em->len > *start + *len ? 0 : *start + *len - em->start - em->len; @@ -2248,7 +2147,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; int slot; - struct btrfs_ref ref = { 0 }; int ret; if (replace_len == 0) @@ -2278,7 +2176,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, @@ -2304,15 +2201,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->qgroup_reserved, &key); } else { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = extent_info->disk_offset, + .num_bytes = extent_info->disk_len, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; u64 ref_offset; - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - extent_info->disk_offset, - extent_info->disk_len, 0, - root->root_key.objectid); ref_offset = extent_info->file_offset - extent_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset, 0, false); + btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2593,7 +2492,7 @@ out: static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; @@ -2611,7 +2510,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); - ret = btrfs_wait_ordered_range(inode, offset, len); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); if (ret) goto out_only_mutex; @@ -2835,11 +2734,11 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start == EXTENT_MAP_HOLE) + if (em->disk_bytenr == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; @@ -2866,7 +2765,7 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -2903,14 +2802,13 @@ static int btrfs_zero_range(struct inode *inode, ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; - alloc_hint = em->block_start + em->len; + alloc_hint = extent_map_block_start(em) + em->len; } free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, - sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -2922,7 +2820,7 @@ static int btrfs_zero_range(struct inode *inode, mode); goto out; } - if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { + if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { free_extent_map(em); ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, 0); @@ -3005,7 +2903,7 @@ reserve_space: } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, - i_blocksize(inode), + fs_info->sectorsize, offset + len, &alloc_hint); unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); @@ -3049,7 +2947,7 @@ static long btrfs_fallocate(struct file *file, int mode, int ret; /* Do not allow fallocate in ZONED mode */ - if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; alloc_start = round_down(offset, blocksize); @@ -3107,7 +3005,7 @@ static long btrfs_fallocate(struct file *file, int mode, * the file range and, due to the previous locking we did, we know there * can't be more delalloc or ordered extents in the range. */ - ret = btrfs_wait_ordered_range(inode, alloc_start, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start, alloc_end - alloc_start); if (ret) goto out; @@ -3126,7 +3024,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -3135,7 +3033,7 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || + if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !(em->flags & EXTENT_FLAG_PREALLOC))) { const u64 range_len = last_byte - cur_offset; @@ -3177,7 +3075,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, i_blocksize(inode), + range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even @@ -3460,7 +3358,7 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) { struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); - struct btrfs_file_private *private = file->private_data; + struct btrfs_file_private *private; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; struct extent_state **delalloc_cached_state; @@ -3488,7 +3386,19 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) inode_get_bytes(&inode->vfs_inode) == i_size) return i_size; - if (!private) { + spin_lock(&inode->lock); + private = file->private_data; + spin_unlock(&inode->lock); + + if (private && private->owner_task != current) { + /* + * Not allocated by us, don't use it as its cached state is used + * by the task that allocated it and we don't want neither to + * mess with it nor get incorrect results because it reflects an + * invalid state for the current task. + */ + private = NULL; + } else if (!private) { private = kzalloc(sizeof(*private), GFP_KERNEL); /* * No worries if memory allocation failed. @@ -3496,7 +3406,23 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, * so everything will still be correct. */ - file->private_data = private; + if (private) { + bool free = false; + + private->owner_task = current; + + spin_lock(&inode->lock); + if (file->private_data) + free = true; + else + file->private_data = private; + spin_unlock(&inode->lock); + + if (free) { + kfree(private); + private = NULL; + } + } } if (private) @@ -3710,8 +3636,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_CAN_ODIRECT; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) @@ -3719,97 +3644,6 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } -static int check_direct_read(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - int ret; - int i, seg; - - ret = check_direct_IO(fs_info, iter, offset); - if (ret < 0) - return ret; - - if (!iter_is_iovec(iter)) - return 0; - - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - const struct iovec *iov1 = iter_iov(iter) + seg; - const struct iovec *iov2 = iter_iov(iter) + i; - - if (iov1->iov_base == iov2->iov_base) - return -EINVAL; - } - } - return 0; -} - -static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) -{ - struct inode *inode = file_inode(iocb->ki_filp); - size_t prev_left = 0; - ssize_t read = 0; - ssize_t ret; - - if (fsverity_active(inode)) - return 0; - - if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) - return 0; - - btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); -again: - /* - * This is similar to what we do for direct IO writes, see the comment - * at btrfs_direct_write(), but we also disable page faults in addition - * to disabling them only at the iov_iter level. This is because when - * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), - * which can still trigger page fault ins despite having set ->nofault - * to true of our 'to' iov_iter. - * - * The difference to direct IO writes is that we deadlock when trying - * to lock the extent range in the inode's tree during he page reads - * triggered by the fault in (while for writes it is due to waiting for - * our own ordered extent). This is because for direct IO reads, - * btrfs_dio_iomap_begin() returns with the extent range locked, which - * is only unlocked in the endio callback (end_bio_extent_readpage()). - */ - pagefault_disable(); - to->nofault = true; - ret = btrfs_dio_read(iocb, to, read); - to->nofault = false; - pagefault_enable(); - - /* No increment (+=) because iomap returns a cumulative value. */ - if (ret > 0) - read = ret; - - if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { - const size_t left = iov_iter_count(to); - - if (left == prev_left) { - /* - * We didn't make any progress since the last attempt, - * fallback to a buffered read for the remainder of the - * range. This is just to avoid any possibility of looping - * for too long. - */ - ret = read; - } else { - /* - * We made some progress since the last retry or this is - * the first time we are retrying. Fault in as many pages - * as possible and retry. - */ - fault_in_iov_iter_writeable(to, left); - prev_left = left; - goto again; - } - } - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); - return ret < 0 ? ret : read; -} - static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; @@ -3841,10 +3675,13 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .uring_cmd = btrfs_uring_cmd, + .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end) { + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; /* @@ -3861,10 +3698,9 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) * know better and pull this out at some point in the future, it is * right and you are wrong. */ - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + ret = filemap_fdatawrite_range(mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags)) + ret = filemap_fdatawrite_range(mapping, start, end); return ret; } diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 82b34fbb295f..de89e644be29 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -3,6 +3,21 @@ #ifndef BTRFS_FILE_H #define BTRFS_FILE_H +#include <linux/types.h> + +struct file; +struct extent_state; +struct kiocb; +struct iov_iter; +struct page; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_drop_extents_args; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_path; +struct btrfs_replace_extent_info; +struct btrfs_trans_handle; + extern const struct file_operations btrfs_file_operations; int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); @@ -19,15 +34,16 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve); +int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); +int btrfs_write_check(struct kiocb *iocb, size_t count); +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d372c7ce0e6b..d42b6f882f57 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,7 +11,8 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> -#include "ctree.h" +#include <linux/string_choices.h> +#include "extent-tree.h" #include "fs.h" #include "messages.h" #include "misc.h" @@ -19,9 +20,7 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "volumes.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" #include "discard.h" #include "subpage.h" @@ -84,7 +83,6 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_key location; struct btrfs_disk_key disk_key; @@ -118,7 +116,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); + inode = btrfs_iget_path(location.objectid, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) @@ -140,7 +138,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, spin_lock(&block_group->lock); if (block_group->inode) - inode = igrab(block_group->inode); + inode = igrab(&block_group->inode->vfs_inode); spin_unlock(&block_group->lock); if (inode) return inode; @@ -159,7 +157,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, } if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) - block_group->inode = igrab(inode); + block_group->inode = BTRFS_I(igrab(inode)); spin_unlock(&block_group->lock); return inode; @@ -200,7 +198,6 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -218,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -399,7 +395,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return -ENOMEM; io_ctl->num_pages = num_pages; - io_ctl->fs_info = btrfs_sb(inode->i_sb); + io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; @@ -465,7 +461,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) return -ENOMEM; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(page_folio(page)); if (ret < 0) { unlock_page(page); put_page(page); @@ -860,6 +856,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -1190,7 +1187,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -1270,7 +1266,7 @@ static int flush_dirty_cache(struct inode *inode) { int ret; - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); @@ -1389,6 +1385,7 @@ static int __btrfs_write_out_cache(struct inode *inode, int bitmaps = 0; int ret; int must_iput = 0; + int i_size; if (!i_size_read(inode)) return -EIO; @@ -1459,11 +1456,16 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, - io_ctl->num_pages, 0, i_size_read(inode), - &cached_state, false); - if (ret) - goto out_nospc; + i_size = i_size_read(inode); + for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { + u64 dirty_start = i * PAGE_SIZE; + u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; + + ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), + dirty_start, dirty_len, &cached_state, false); + if (ret < 0) + goto out_nospc; + } if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -1485,7 +1487,7 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl->entries = entries; io_ctl->bitmaps = bitmaps; - ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); + ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1); if (ret) goto out; @@ -1913,9 +1915,9 @@ static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, ctl->free_space -= bytes; } -static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) +static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) { unsigned long start, count, end; int extent_delta = 1; @@ -2251,7 +2253,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bytes_to_set = min(end - offset, bytes); - bitmap_set_bits(ctl, info, offset, bytes_to_set); + btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); return bytes_to_set; @@ -2621,7 +2623,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_block_group *block_group, +static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { @@ -2699,15 +2701,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; - bool initial = (size == block_group->length); + bool initial; u64 reclaimable_unusable; - WARN_ON(!initial && offset + size > block_group->zone_capacity); + spin_lock(&block_group->lock); + initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + WARN_ON(!initial && offset + size > block_group->zone_capacity); if (!initial) bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); - spin_lock(&ctl->tree_lock); if (!used) to_free = size; else if (initial) @@ -2720,18 +2723,19 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, to_free = offset + size - block_group->alloc_offset; to_unusable = size - to_free; + spin_lock(&ctl->tree_lock); ctl->free_space += to_free; + spin_unlock(&ctl->tree_lock); /* * If the block group is read-only, we should account freed space into * bytes_readonly. */ - if (!block_group->ro) + if (!block_group->ro) { block_group->zone_unusable += to_unusable; - spin_unlock(&ctl->tree_lock); + WARN_ON(block_group->zone_unusable > block_group->length); + } if (!used) { - spin_lock(&block_group->lock); block_group->alloc_offset -= size; - spin_unlock(&block_group->lock); } reclaimable_unusable = block_group->zone_unusable - @@ -2745,6 +2749,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, btrfs_mark_bg_to_reclaim(block_group); } + spin_unlock(&block_group->lock); + return 0; } @@ -2934,12 +2940,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", - info->offset, info->bytes, - (info->bitmap) ? "yes" : "no"); + info->offset, info->bytes, str_yes_no(info->bitmap)); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", - list_empty(&block_group->cluster_list) ? "no" : "yes"); + str_no_yes(list_empty(&block_group->cluster_list))); btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes", count, bytes); @@ -3807,7 +3812,7 @@ next: if (async && *total_trimmed) break; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } @@ -3998,7 +4003,7 @@ next: } block_group->discard_cursor = start; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { if (start != offset) reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; @@ -4156,15 +4161,13 @@ out: int __init btrfs_free_space_init(void) { - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); if (!btrfs_free_space_cachep) return -ENOMEM; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); + 0, NULL); if (!btrfs_free_space_bitmap_cachep) { kmem_cache_destroy(btrfs_free_space_cachep); return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33b4da3271b1..9f1dbfdee8ca 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,6 +6,20 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/freezer.h> +#include "fs.h" + +struct inode; +struct page; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_trans_handle; +struct btrfs_trim_block_group; + /* * This is the trim state of an extent or bitmap. * @@ -43,6 +57,11 @@ static inline bool btrfs_free_space_trimming_bitmap( return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } +static inline bool btrfs_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + /* * Deltas are an effective way to populate global statistics. Give macro names * to make it clear what we're doing. An example is discard_extents in @@ -114,8 +133,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, - u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7b598b070700..cae540ec15ed 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -287,7 +286,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -324,7 +322,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -430,7 +427,6 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +491,6 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && @@ -1176,12 +1171,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } ret = btrfs_global_root_insert(free_space_root); if (ret) { btrfs_put_root(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } node = rb_first_cached(&fs_info->block_group_cache_tree); @@ -1189,8 +1188,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; + } node = rb_next(node); } @@ -1206,11 +1208,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: +out_clear: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); return ret; } @@ -1273,12 +1273,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } btrfs_global_root_delete(free_space_root); @@ -1289,17 +1295,16 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_tree_lock(free_space_root->node); btrfs_clear_buffer_dirty(trans, free_space_root->node); btrfs_tree_unlock(free_space_root->node); - btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), - free_space_root->node, 0, 1); - + ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), + free_space_root->node, 0, 1); btrfs_put_root(free_space_root); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } return btrfs_commit_transaction(trans); - -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1322,8 +1327,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { @@ -1332,8 +1340,17 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + if (btrfs_should_end_transaction(trans)) { + btrfs_end_transaction(trans); + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + } node = rb_next(node); } @@ -1344,10 +1361,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 6d5551d0ced8..e6c6d6f4f221 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,7 +6,13 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +#include <linux/bits.h> + struct btrfs_caching_control; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_block_group; +struct btrfs_trans_handle; /* * The default size for new free space bitmap items. The last bitmap in a block diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 31c1648bc0b4..09cfb43580cb 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -4,6 +4,136 @@ #include "ctree.h" #include "fs.h" #include "accessors.h" +#include "volumes.h" + +static const struct btrfs_csums { + u16 size; + const char name[10]; + const char driver[12]; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, +}; + +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + + /* csum type is validated at mount time. */ + return btrfs_csum_type_size(t); +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time. */ + return btrfs_csums[csum_type].name; +} + +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name. + */ +const char *btrfs_super_csum_driver(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].driver[0] ? + btrfs_csums[csum_type].driver : + btrfs_csums[csum_type].name; +} + +size_t __attribute_const__ btrfs_get_num_csums(void) +{ + return ARRAY_SIZE(btrfs_csums); +} + +/* + * Start exclusive operation @type, return true on success. + */ +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock() + * and btrfs_exclop_finish(). + * + * Compatibility: + * - the same type is already running + * - when trying to add a device and balance has been paused + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->super_lock); + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index f8bb73d6ab68..b572d6b9730b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -4,14 +4,49 @@ #define BTRFS_FS_H #include <linux/blkdev.h> -#include <linux/fs.h> -#include <linux/btrfs_tree.h> #include <linux/sizes.h> +#include <linux/time64.h> +#include <linux/compiler.h> +#include <linux/math.h> +#include <linux/atomic.h> +#include <linux/percpu_counter.h> +#include <linux/completion.h> +#include <linux/lockdep.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/semaphore.h> +#include <linux/list.h> +#include <linux/pagemap.h> +#include <linux/radix-tree.h> +#include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/wait_bit.h> +#include <linux/sched.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "extent-io-tree.h" -#include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" +struct inode; +struct super_block; +struct kobject; +struct reloc_control; +struct crypto_shash; +struct ulist; +struct btrfs_device; +struct btrfs_block_group; +struct btrfs_root; +struct btrfs_fs_devices; +struct btrfs_transaction; +struct btrfs_delayed_root; +struct btrfs_balance_control; +struct btrfs_subpage_info; +struct btrfs_stripe_hash_table; +struct btrfs_space_info; + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -63,7 +98,9 @@ enum { /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, - BTRFS_FS_STATE_NO_CSUMS, + /* Checksum errors are ignored. */ + BTRFS_FS_STATE_NO_DATA_CSUMS, + BTRFS_FS_STATE_SKIP_META_CSUMS, /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, @@ -158,37 +195,39 @@ enum { * Note: don't forget to add new options to btrfs_show_options() */ enum { - BTRFS_MOUNT_NODATASUM = (1UL << 0), - BTRFS_MOUNT_NODATACOW = (1UL << 1), - BTRFS_MOUNT_NOBARRIER = (1UL << 2), - BTRFS_MOUNT_SSD = (1UL << 3), - BTRFS_MOUNT_DEGRADED = (1UL << 4), - BTRFS_MOUNT_COMPRESS = (1UL << 5), - BTRFS_MOUNT_NOTREELOG = (1UL << 6), - BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), - BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), - BTRFS_MOUNT_NOSSD = (1UL << 9), - BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), - BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), - BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), - BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), - BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), - BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), - BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), - BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), - BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24), - BTRFS_MOUNT_REF_VERIFY = (1UL << 25), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28), - BTRFS_MOUNT_NODISCARD = (1UL << 29), - BTRFS_MOUNT_NOSPACECACHE = (1UL << 30), + BTRFS_MOUNT_NODATASUM = (1ULL << 0), + BTRFS_MOUNT_NODATACOW = (1ULL << 1), + BTRFS_MOUNT_NOBARRIER = (1ULL << 2), + BTRFS_MOUNT_SSD = (1ULL << 3), + BTRFS_MOUNT_DEGRADED = (1ULL << 4), + BTRFS_MOUNT_COMPRESS = (1ULL << 5), + BTRFS_MOUNT_NOTREELOG = (1ULL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1ULL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1ULL << 8), + BTRFS_MOUNT_NOSSD = (1ULL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1ULL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1ULL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1ULL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1ULL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1ULL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1ULL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1ULL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1ULL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1ULL << 18), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1ULL << 19), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1ULL << 20), + BTRFS_MOUNT_FRAGMENT_DATA = (1ULL << 21), + BTRFS_MOUNT_FRAGMENT_METADATA = (1ULL << 22), + BTRFS_MOUNT_FREE_SPACE_TREE = (1ULL << 23), + BTRFS_MOUNT_NOLOGREPLAY = (1ULL << 24), + BTRFS_MOUNT_REF_VERIFY = (1ULL << 25), + BTRFS_MOUNT_DISCARD_ASYNC = (1ULL << 26), + BTRFS_MOUNT_IGNOREBADROOTS = (1ULL << 27), + BTRFS_MOUNT_IGNOREDATACSUMS = (1ULL << 28), + BTRFS_MOUNT_NODISCARD = (1ULL << 29), + BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30), + BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31), + BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32), }; /* @@ -224,10 +263,10 @@ enum { BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under developmen like Extent tree v2 support is enabled - * only under CONFIG_BTRFS_DEBUG. + * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ @@ -278,6 +317,8 @@ struct btrfs_dev_replace { struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; }; /* @@ -442,7 +483,7 @@ struct btrfs_fs_info { * required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt; + unsigned long long mount_opt; unsigned long compress_type:4; unsigned int compress_level; @@ -586,6 +627,9 @@ struct btrfs_fs_info { struct kobject *qgroups_kobj; struct kobject *discard_kobj; + /* Track the number of blocks (sectors) read by the filesystem. */ + struct percpu_counter stats_read_blocks; + /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; @@ -593,6 +637,12 @@ struct btrfs_fs_info { s32 dirty_metadata_batch; s32 delalloc_batch; + struct percpu_counter evictable_extent_maps; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; + /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; @@ -659,8 +709,8 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; + u32 sectors_per_page; struct workqueue_struct *scrub_workers; - struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -732,10 +782,13 @@ struct btrfs_fs_info { /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; + /* Protected by unused_bgs_lock. */ struct list_head reclaim_bgs; int bg_reclaim_threshold; + /* Protects the lists unused_bgs and reclaim_bgs. */ spinlock_t unused_bgs_lock; + /* Protected by unused_bgs_lock. */ struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ @@ -829,6 +882,19 @@ struct btrfs_fs_info { #endif }; +#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ + struct folio *: (_folio))->mapping->host)) + +#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) + +#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ + struct inode *: (_inode)))->root->fs_info) + +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_constraint(mapping, ~__GFP_FS); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -895,6 +961,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits) + static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; @@ -903,7 +971,7 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) /* * Count how many fs_info->max_extent_size cover the @size */ -static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 size) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (!fs_info) @@ -922,6 +990,19 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation op); +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); + +u16 btrfs_csum_type_size(u16 type); +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +static inline bool btrfs_is_empty_uuid(const u8 *uuid) +{ + return uuid_is_null((const uuid_t *)uuid); +} + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -962,7 +1043,7 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { @@ -981,7 +1062,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) * since setting and checking for SB_RDONLY in the superblock's flags is not * atomic. */ -static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +static inline int btrfs_need_cleaner_sleep(const struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || btrfs_fs_closing(fs_info); @@ -998,11 +1079,19 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +/* + * We use folio flag owner_2 to indicate there is an ordered extent with + * unfinished IO. + */ +#define folio_test_ordered(folio) folio_test_owner_2(folio) +#define folio_set_ordered(folio) folio_set_owner_2(folio) +#define folio_clear_ordered(folio) folio_clear_owner_2(folio) + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define EXPORT_FOR_TESTS -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); } @@ -1013,7 +1102,7 @@ void btrfs_test_destroy_inode(struct inode *inode); #define EXPORT_FOR_TESTS static -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) { return 0; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 7d734830e514..448aa1a682d6 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -9,13 +9,12 @@ #include "inode-item.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "space-info.h" #include "accessors.h" #include "extent-tree.h" #include "file-item.h" -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name) { @@ -43,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, } struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; @@ -142,8 +141,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); if (!extref) { - btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); - ret = -EROFS; + btrfs_abort_transaction(trans, -ENOENT); + ret = -ENOENT; goto out; } @@ -299,8 +298,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); return ret; @@ -364,8 +361,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); @@ -424,9 +419,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } -static inline void btrfs_trace_truncate(struct btrfs_inode *inode, - struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, +static inline void btrfs_trace_truncate(const struct btrfs_inode *inode, + const struct extent_buffer *leaf, + const struct btrfs_file_extent_item *fi, u64 offset, int extent_type, int slot) { if (!inode) @@ -591,7 +586,6 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); @@ -671,16 +665,18 @@ delete: } if (del_item && extent_start != 0 && !control->skip_ref_updates) { - struct btrfs_ref ref = { 0 }; + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = extent_start, + .num_bytes = extent_num_bytes, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_header_owner(leaf), + }; bytes_deleted += extent_num_bytes; - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - control->ino, extent_offset, - root->root_key.objectid, false); + btrfs_init_data_ref(&ref, control->ino, extent_offset, + btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 4337bb26f419..c11b97fdccc4 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -6,14 +6,15 @@ #include <linux/types.h> #include <linux/crc32c.h> +struct fscrypt_str; +struct extent_buffer; struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; -struct extent_buffer; -struct fscrypt_str; +struct btrfs_truncate_control; /* * Return this if we need to call truncate_block for the last bit of the @@ -108,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4795738d5785..38756f8cef46 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,21 +32,19 @@ #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" -#include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -72,31 +70,13 @@ #include "orphan.h" #include "backref.h" #include "raid-stripe-tree.h" +#include "fiemap.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; -struct btrfs_dio_data { - ssize_t submitted; - struct extent_changeset *data_reserved; - struct btrfs_ordered_extent *ordered; - bool data_space_reserved; - bool nocow_done; -}; - -struct btrfs_dio_private { - /* Range of I/O */ - u64 file_offset; - u32 bytes; - - /* This must be last */ - struct btrfs_bio bbio; -}; - -static struct bio_set btrfs_dio_bioset; - struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; @@ -136,14 +116,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) @@ -256,7 +231,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, btrfs_ino(inode), file_off, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); @@ -266,7 +241,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -333,15 +308,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -349,7 +324,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -418,37 +393,16 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct page *locked_page, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; - struct page *page; - - if (locked_page) { - page_start = page_offset(locked_page); - page_end = page_start + PAGE_SIZE - 1; - } + struct folio *folio; while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. - */ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } - page = find_get_page(inode->vfs_inode.i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; - if (!page) + if (IS_ERR(folio)) continue; /* @@ -456,25 +410,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_folio_clamp_clear_ordered(inode->root->fs_info, - page_folio(page), offset, bytes); - put_page(page); - } - - if (locked_page) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; - } + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, + offset, bytes); + folio_put(folio); } return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); @@ -514,12 +452,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; + const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; @@ -527,10 +465,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; u64 i_size; - ASSERT((compressed_size > 0 && compressed_pages) || - (compressed_size == 0 && !compressed_pages)); + /* + * The decompressed size must still be no larger than a sector. Under + * heavy race, we can have size == 0 passed in, but that shouldn't be a + * big deal and we can continue the insertion. + */ + ASSERT(size <= sectorsize); - if (compressed_size && compressed_pages) + /* + * The compressed size also needs to be no larger than a sector. + * That's also why we only need one page as the parameter. + */ + if (compressed_folio) + ASSERT(compressed_size <= sectorsize); + else + ASSERT(compressed_size == 0); + + if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { @@ -558,32 +509,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_SIZE); - - kaddr = kmap_local_page(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_local(kaddr); + kaddr = kmap_local_folio(compressed_folio, 0); + write_extent_buffer(leaf, kaddr, ptr, compressed_size); + kunmap_local(kaddr); - i++; - ptr += cur_size; - compressed_size -= cur_size; - } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -613,17 +555,62 @@ fail: return ret; } +static bool can_cow_file_range_inline(struct btrfs_inode *inode, + u64 offset, u64 size, + size_t compressed_size) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 data_len = (compressed_size ?: size); + + /* Inline extents must start at offset 0. */ + if (offset != 0) + return false; + + /* + * Due to the page size limit, for subpage we can only trigger the + * writeback for the dirty sectors of page, that means data writeback + * is doing more writeback than what we want. + * + * This is especially unexpected for some call sites like fallocate, + * where we only increase i_size after everything is done. + * This means we can trigger inline extent even if we didn't want to. + * So here we skip inline extent creation completely. + */ + if (fs_info->sectorsize != PAGE_SIZE) + return false; + + /* Inline extents are limited to sectorsize. */ + if (size > fs_info->sectorsize) + return false; + + /* We cannot exceed the maximum inline data size. */ + if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + return false; + + /* We cannot exceed the user specified max_inline size. */ + if (data_len > fs_info->max_inline) + return false; + + /* Inline extents must be the entirety of the file. */ + if (size < i_size_read(&inode->vfs_inode)) + return false; + + return true; +} /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. + * + * If being used directly, you must have already checked we're allowed to cow + * the range by getting true from can_cow_file_range_inline(). */ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, - size_t compressed_size, - int compress_type, - struct page **compressed_pages, - bool update_i_size) +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, + u64 size, size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; @@ -633,18 +620,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, int ret; struct btrfs_path *path; - /* - * We can create an inline extent if it ends at or beyond the current - * i_size, is no larger than a sector (decompressed), and the (possibly - * compressed) data fits in a leaf and the configured maximum inline - * size. - */ - if (size < i_size_read(&inode->vfs_inode) || - size > fs_info->sectorsize || - data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - data_len > fs_info->max_inline) - return 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -670,7 +645,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_pages, update_i_size); + compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -703,19 +678,68 @@ out: return ret; } +static noinline int cow_file_range_inline(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 offset, u64 end, + size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) +{ + struct extent_state *cached = NULL; + unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; + u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); + int ret; + + if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) + return 1; + + lock_extent(&inode->io_tree, offset, end, &cached); + ret = __cow_file_range_inline(inode, size, compressed_size, + compress_type, compressed_folio, + update_i_size); + if (ret > 0) { + unlock_extent(&inode->io_tree, offset, end, &cached); + return ret; + } + + /* + * In the successful case (ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in extent_writepage(), ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. + * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. + */ + if (ret == 0) + locked_folio = NULL; + + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + return ret; +} + struct async_extent { u64 start; u64 ram_size; u64 compressed_size; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; int compress_type; struct list_head list; }; struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -733,19 +757,20 @@ struct async_cow { static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, - struct page **pages, - unsigned long nr_pages, + struct folio **folios, + unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); /* -ENOMEM */ + if (!async_extent) + return -ENOMEM; async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; + async_extent->folios = folios; + async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; @@ -767,32 +792,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. - * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: - * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. + * Only enable sector perfect compression for experimental builds. * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. + * This is a big feature change for subpage cases, and can hit + * different corner cases, so only limit this feature for + * experimental build for now. * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. - * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. - * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -810,7 +819,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) - return btrfs_compress_heuristic(&inode->vfs_inode, start, end); + return btrfs_compress_heuristic(inode, start, end); return 0; } @@ -820,7 +829,28 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); +} + +static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long end_index = end >> PAGE_SHIFT; + struct folio *folio; + int ret = 0; + + for (unsigned long index = start >> PAGE_SHIFT; + index <= end_index; index++) { + folio = filemap_get_folio(inode->i_mapping, index); + if (IS_ERR(folio)) { + if (!ret) + ret = PTR_ERR(folio); + continue; + } + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); + folio_put(folio); + } + return ret; } /* @@ -849,8 +879,8 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; @@ -864,7 +894,16 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* + * All the folios should have been locked thus no failure. + * + * And even if some folios are missing, btrfs_compress_folios() + * would handle them correctly, so here just do an ASSERT() check for + * early logic errors. + */ + ASSERT(ret == 0); /* * We need to save i_size before now because it could change in between @@ -880,9 +919,9 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - pages = NULL; - nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + folios = NULL; + nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -907,17 +946,6 @@ again: (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -931,8 +959,8 @@ again: if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. @@ -946,9 +974,9 @@ again: compress_type = inode->prop_compress; /* Compression level is applied here. */ - ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), - mapping, start, pages, &nr_pages, &total_in, - &total_compressed); + ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + mapping, start, folios, &nr_folios, &total_in, + &total_compressed); if (ret) goto mark_incompressible; @@ -958,7 +986,7 @@ again: */ poff = offset_in_page(total_compressed); if (poff) - memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. @@ -969,43 +997,16 @@ again: * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - if (total_in < actual_end) { - ret = cow_file_range_inline(inode, actual_end, 0, - BTRFS_COMPRESS_NONE, NULL, - false); - } else { - ret = cow_file_range_inline(inode, actual_end, - total_compressed, - compress_type, pages, - false); - } - if (ret <= 0) { - unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - - if (ret < 0) - mapping_set_error(mapping, -EIO); - - /* - * inline extent creation worked or returned error, - * we don't need to create any more async work items. - * Unlock and free up our temp pages. - * - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be done _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - NULL, - clear_flags, - PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - goto free_pages; - } + if (total_in < actual_end) + ret = cow_file_range_inline(inode, NULL, start, end, 0, + BTRFS_COMPRESS_NONE, NULL, false); + else + ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, + compress_type, folios[0], false); + if (ret <= 0) { + if (ret < 0) + mapping_set_error(mapping, -EIO); + goto free_pages; } /* @@ -1027,8 +1028,9 @@ again: * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ - add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, + nr_folios, compress_type); + BUG_ON(ret); if (start + total_in < end) { start += total_in; cond_resched(); @@ -1040,15 +1042,16 @@ mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); + ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + BUG_ON(ret); free_pages: - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - btrfs_free_compr_page(pages[i]); + if (folios) { + for (i = 0; i < nr_folios; i++) { + WARN_ON(folios[i]->mapping); + btrfs_free_compr_folio(folios[i]); } - kfree(pages); + kfree(folios); } } @@ -1056,21 +1059,21 @@ static void free_async_extent_pages(struct async_extent *async_extent) { int i; - if (!async_extent->pages) + if (!async_extent->folios) return; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - btrfs_free_compr_page(async_extent->pages[i]); + for (i = 0; i < async_extent->nr_folios; i++) { + WARN_ON(async_extent->folios[i]->mapping); + btrfs_free_compr_folio(async_extent->folios[i]); } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + kfree(async_extent->folios); + async_extent->nr_folios = 0; + async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1083,21 +1086,18 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, locked_folio, start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); - - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); - } + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1110,8 +1110,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; + struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; u64 start = async_extent->start; @@ -1121,20 +1123,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. */ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } - lock_extent(io_tree, start, end, NULL); if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1144,39 +1146,34 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, 0, *alloc_hint, &ins, 1, 1); if (ret) { /* - * Here we used to try again by going back to non-compressed - * path for ENOSPC. But we can't reserve space even for - * compressed size, how could it work for uncompressed size - * which requires larger size? So here we directly go error - * path. + * We can't reserve contiguous space for the compressed size. + * Unlikely, but it's possible that we could have enough + * non-contiguous space for the uncompressed size instead. So + * fall back to uncompressed. */ - goto out_free; + submit_uncompressed_range(inode, async_extent, locked_folio); + goto done; } + lock_extent(io_tree, start, end, &cached); + /* Here we're doing allocation and writeback of the compressed pages */ - em = create_io_em(inode, start, - async_extent->ram_size, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.ram_bytes = async_extent->ram_size; + file_extent.num_bytes = async_extent->ram_size; + file_extent.offset = 0; + file_extent.compression = async_extent->compress_type; + + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - async_extent->ram_size, /* ram_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* disk_num_bytes */ - 0, /* offset */ - 1 << BTRFS_ORDERED_COMPRESSED, - async_extent->compress_type); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1 << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1186,11 +1183,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, /* Clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, - async_extent->pages, /* compressed_pages */ - async_extent->nr_pages, + async_extent->folios, /* compressed_folios */ + async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: @@ -1202,10 +1199,10 @@ done: out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); -out_free: mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + NULL, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | @@ -1215,13 +1212,13 @@ out_free: kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - root->root_key.objectid, btrfs_ino(inode), start, + btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } -static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, - u64 num_bytes) +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1235,15 +1232,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; + if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + alloc_hint = extent_map_block_start(em); if (em) free_extent_map(em); } else { - alloc_hint = em->block_start; + alloc_hint = extent_map_block_start(em); free_extent_map(em); } } @@ -1258,21 +1255,21 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1281,16 +1278,16 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_state *cached = NULL; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1298,7 +1295,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1312,57 +1308,36 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { - u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), - end + 1); - + if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, actual_end, 0, + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); - if (ret == 0) { - /* - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be run _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + if (ret <= 0) { /* - * locked_page is locked by the caller of - * writepage_delalloc(), not locked by - * __process_pages_contig(). + * We succeeded, return 1 so the caller knows we're done + * with this page and already handled the IO. * - * We can't let __process_pages_contig() to unlock it, - * as it doesn't have any subpage::writers recorded. - * - * Here we manually unlock the page, since the caller - * can't determine if it's an inline extent or a - * compressed extent. + * If there was an error then cow_file_range_inline() has + * already done the cleanup. */ - unlock_page(locked_page); - ret = 1; + if (ret == 0) + ret = 1; goto done; - } else if (ret < 0) { - goto out_unlock; } } - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); + + /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; /* * Relocation relies on the relocated extents to have exactly the same @@ -1382,9 +1357,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1407,36 +1382,49 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - - ram_size = ins.offset; - em = create_io_em(inode, start, ins.offset, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - ram_size, /* ram_bytes */ - BTRFS_COMPRESS_NONE, /* compress_type */ - BTRFS_ORDERED_REGULAR /* type */); + + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = ins.offset; + file_extent.ram_bytes = ins.offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ + lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); + + em = btrfs_create_io_em(inode, start, &file_extent, + BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, - ram_size, ins.objectid, cur_alloc_size, - 0, 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { + unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1457,35 +1445,20 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered (Private2) bit so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1495,13 +1468,15 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1515,29 +1490,31 @@ out_unlock: * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { - if (!locked_page) + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, @@ -1548,13 +1525,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, - clear_bits, + locked_folio, &cached, clear_bits, page_ops); - start += cur_alloc_size; + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } /* @@ -1563,11 +1539,18 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); - } + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, + &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); + } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1589,10 +1572,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ u64 alloc_hint = 0; if (do_free) { - struct async_chunk *async_chunk; struct async_cow *async_cow; - async_chunk = container_of(work, struct async_chunk, work); btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1620,7 +1601,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1639,7 +1620,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, if (!ctx) return false; - unlock_extent(&inode->io_tree, start, end, NULL); set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; @@ -1661,15 +1641,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to @@ -1679,12 +1660,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_folio = locked_folio; + locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { @@ -1713,7 +1694,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1721,48 +1702,27 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, locked_folio, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } -static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes, bool nowait) -{ - struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); - struct btrfs_ordered_sum *sums; - int ret; - LIST_HEAD(list); - - ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, - &list, 0, nowait); - if (ret == 0 && list_empty(&list)) - return 0; - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); - } - if (ret < 0) - return ret; - return 1; -} - -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_state *cached_state = NULL; u64 range_start = start; u64 count; int ret; @@ -1799,6 +1759,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ + lock_extent(io_tree, start, end, &cached_state); count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { @@ -1810,20 +1771,22 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, NULL); } + unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -1836,20 +1799,17 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. */ bool free_path; - /* Output fields. Only set when can_nocow_file_extent() returns 1. */ - - u64 disk_bytenr; - u64 disk_num_bytes; - u64 extent_offset; - /* Number of bytes that can be written to in NOCOW mode. */ - u64 num_bytes; + /* + * Output fields. Only set when can_nocow_file_extent() returns 1. + * The expected file extent for the NOCOW write. + */ + struct btrfs_file_extent file_extent; }; /* @@ -1870,6 +1830,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; + struct btrfs_root *csum_root; + u64 io_start; u64 extent_end; u8 extent_type; int can_nocow = 0; @@ -1882,11 +1844,6 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; - /* Can't access these fields unless we know it's not an inline extent. */ - args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - args->extent_offset = btrfs_file_extent_offset(leaf, fi); - if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; @@ -1896,13 +1853,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; /* An explicit hole, must COW. */ - if (args->disk_bytenr == 0) + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ @@ -1913,6 +1869,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, extent_end = btrfs_file_extent_end(path); + args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); + args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); + /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid @@ -1920,9 +1882,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->extent_offset, - args->disk_bytenr, args->strict, path); + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1930,7 +1891,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (args->free_path) { /* * We don't need the path anymore, plus through the - * csum_exist_in_range() call below we will end up allocating + * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. */ @@ -1943,16 +1904,19 @@ static int can_nocow_file_extent(struct btrfs_path *path, atomic_read(&root->snapshot_force_cow)) goto out; - args->disk_bytenr += args->extent_offset; - args->disk_bytenr += args->start - key->offset; - args->num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.offset += args->start - key->offset; + io_start = args->file_extent.disk_bytenr + args->file_extent.offset; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ - ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, - nowait); + + csum_root = btrfs_csum_root(root->fs_info, io_start); + ret = btrfs_lookup_csums_list(csum_root, io_start, + io_start + args->file_extent.num_bytes - 1, + NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1966,6 +1930,53 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -1973,13 +1984,18 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. + */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2002,14 +2018,14 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.end = end; nocow_args.writeback_path = true; - while (1) { + while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct extent_state *cached_state = NULL; u64 extent_end; - u64 ram_bytes; u64 nocow_end; int extent_type; bool is_prealloc; @@ -2088,7 +2104,6 @@ next_slot: ret = -EUCLEAN; goto error; } - ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = btrfs_file_extent_end(path); /* @@ -2108,7 +2123,9 @@ next_slot: goto must_cow; ret = 0; - nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + nocow_bg = btrfs_inc_nocow_writers(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset); if (!nocow_bg) { must_cow: /* @@ -2135,29 +2152,29 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1); + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } } - nocow_end = cur_offset + nocow_args.num_bytes - 1; + nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; + lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); + is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; if (is_prealloc) { - u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, nocow_args.num_bytes, - orig_start, - nocow_args.disk_bytenr, /* block_start */ - nocow_args.num_bytes, /* block_len */ - nocow_args.disk_num_bytes, /* orig_block_len */ - ram_bytes, BTRFS_COMPRESS_NONE, - BTRFS_ORDERED_PREALLOC); + em = btrfs_create_io_em(inode, cur_offset, + &nocow_args.file_extent, + BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, cur_offset, + nocow_end, &cached_state); btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; @@ -2166,18 +2183,18 @@ must_cow: } ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, nocow_args.num_bytes, - nocow_args.disk_bytenr, nocow_args.num_bytes, 0, + &nocow_args.file_extent, is_prealloc ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW), - BTRFS_COMPRESS_NONE); + : (1 << BTRFS_ORDERED_NOCOW)); btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); } + unlock_extent(&inode->io_tree, cur_offset, + nocow_end, &cached_state); ret = PTR_ERR(ordered); goto error; } @@ -2192,8 +2209,8 @@ must_cow: btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | + locked_folio, &cached_state, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2206,8 +2223,6 @@ must_cow: */ if (ret) goto error; - if (cur_offset > end) - break; } btrfs_release_path(path); @@ -2215,11 +2230,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); + ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } } btrfs_free_path(path); @@ -2227,20 +2243,64 @@ must_cow: error: /* + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Only need to clear the dirty flag as they will never be submitted. + * Ordered extent and extent maps are handled by + * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * + * 2) Failed with error from fallback_to_cow() + * start cur_offset cow_end end + * |/////////////|-----------| | + * + * For range [start, cur_offset) it's the same as case 1). + * But for range [cur_offset, cow_end), the folios have dirty flag + * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * + * Thus we should not call extent_clear_unlock_delalloc() on range + * [cur_offset, cow_end), as the folios are already unlocked. + * + * So clear the folio dirty flags for [start, cur_offset) first. + */ + if (cur_offset > start) + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + + /* * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. + */ + if (cow_end) + cur_offset = cow_end + 1; + + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; - if (cur_offset < end) + if (cur_offset < end) { + struct extent_state *cached = NULL; + + lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DEFRAG | + locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2259,40 +2319,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); + ret = run_delalloc_nocow(inode, locked_folio, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2302,6 +2361,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; @@ -2340,6 +2401,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state u64 new_size, old_size; u32 num_extents; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; @@ -2387,55 +2450,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state spin_unlock(&inode->lock); } -static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct btrfs_inode *inode) +static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&inode->delalloc_inodes)) { - list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); - root->nr_delalloc_inodes++; - if (root->nr_delalloc_inodes == 1) { - spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(!list_empty(&root->delalloc_root)); - list_add_tail(&root->delalloc_root, - &fs_info->delalloc_roots); - spin_unlock(&fs_info->delalloc_root_lock); - } + ASSERT(list_empty(&inode->delalloc_inodes)); + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&fs_info->delalloc_root_lock); + ASSERT(list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) +void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + lockdep_assert_held(&root->delalloc_lock); + + /* + * We may be called after the inode was already deleted from the list, + * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), + * and then later through btrfs_clear_delalloc_extent() while the inode + * still has ->delalloc_bytes > 0. + */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(list_empty(&root->delalloc_root)); + ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) -{ - spin_lock(&root->delalloc_lock); - __btrfs_del_delalloc_inode(root, inode); - spin_unlock(&root->delalloc_lock); -} - /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. @@ -2445,6 +2503,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s { struct btrfs_fs_info *fs_info = inode->root->fs_info; + lockdep_assert_held(&inode->io_tree.lock); + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* @@ -2453,10 +2513,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; + u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); @@ -2469,13 +2528,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); + prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; - if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_clear_delalloc_extent(). + */ + if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) + btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && @@ -2497,6 +2563,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); + lockdep_assert_held(&inode->io_tree.lock); + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; @@ -2510,7 +2578,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; - bool do_list = !btrfs_is_free_space_inode(inode); + u64 new_delalloc_bytes; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); @@ -2523,14 +2591,15 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len, false); + btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) return; if (!btrfs_is_data_reloc_root(root) && - do_list && !(state->state & EXTENT_NORESERVE) && + !btrfs_is_free_space_inode(inode) && + !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2538,11 +2607,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; - if (do_list && inode->delalloc_bytes == 0 && - test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_del_delalloc_inode(root, inode); + new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_set_delalloc_extent(). + */ + if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { + spin_lock(&root->delalloc_lock); + btrfs_del_delalloc_inode(inode); + spin_unlock(&root->delalloc_lock); + } } if ((state->state & EXTENT_DELALLOC_NEW) && @@ -2556,44 +2634,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) -{ - u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_ordered_extent *new; - int ret; - - /* Must always be called for the beginning of an ordered extent. */ - if (WARN_ON_ONCE(start != ordered->disk_bytenr)) - return -EINVAL; - - /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) { - refcount_inc(&ordered->refs); - bbio->ordered = ordered; - return 0; - } - - /* - * Don't split the extent_map for NOCOW extents, as we're writing into - * a pre-existing one. - */ - if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); - if (ret) - return ret; - } - - new = btrfs_split_ordered_extent(ordered, len); - if (IS_ERR(new)) - return PTR_ERR(new); - bbio->ordered = new; - return 0; -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. @@ -2632,11 +2672,11 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start != EXTENT_MAP_HOLE) + if (em->disk_bytenr != EXTENT_MAP_HOLE) goto next; em_len = em->len; @@ -2686,7 +2726,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2698,50 +2738,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2749,7 +2790,7 @@ again: } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) @@ -2758,14 +2799,14 @@ again: lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2783,7 +2824,7 @@ again: * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); @@ -2797,14 +2838,14 @@ out_page: * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); - } - btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); + } + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* @@ -2817,33 +2858,34 @@ out_page: /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. */ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. * - * The extent_io writepage code will redirty the page if we send back + * The extent_io writepage code will redirty the foio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2853,14 +2895,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * page->mapping outside of the folio lock. */ ihold(inode); - btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); - get_page(page); + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = page; + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -2924,7 +2966,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -2992,10 +3033,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) num_bytes = oe->truncated_len; - ram_bytes = num_bytes; - } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3011,7 +3050,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); - return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + return insert_reserved_file_extent(trans, oe->inode, oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } @@ -3023,7 +3062,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { - struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; @@ -3068,29 +3107,19 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - goto out; + /* + * If it's a COW write we need to lock the extent range as we will be + * inserting/replacing file extent items and unpinning an extent map. + * This must be taken before joining a transaction, as it's a higher + * level lock (like the inode's VFS lock), otherwise we can run into an + * ABBA deadlock with other tasks (transactions work like a lock, + * depending on their current state). + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); } - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3104,8 +3133,28 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } + goto out; + } if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -3127,8 +3176,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3157,7 +3211,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_abort_transaction(trans, ret); goto out; } - ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, &cached_state); @@ -3176,9 +3229,8 @@ out: * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ - if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, - &ordered_extent->flags)) - mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (ret) + btrfs_mark_ordered_extent_error(ordered_extent); if (truncated) unwritten_start += logical_len; @@ -3232,7 +3284,7 @@ out: * Actually free the qgroup rsv which was released when * the ordered extent was created. */ - btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } @@ -3254,7 +3306,7 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + if (btrfs_is_zoned(ordered->inode->root->fs_info) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); @@ -3548,7 +3600,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, last_objectid, root); + inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; @@ -3733,14 +3785,69 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 1; } +static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (WARN_ON_ONCE(inode->file_extent_tree)) + return 0; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + if (!S_ISREG(inode->vfs_inode.i_mode)) + return 0; + if (btrfs_is_free_space_inode(inode)) + return 0; + + inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!inode->file_extent_tree) + return -ENOMEM; + + extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. */ + lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); + + return 0; +} + +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_path *path = in_path; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3752,23 +3859,27 @@ static int btrfs_read_locked_inode(struct inode *inode, bool filled = false; int first_xattr_slot; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + goto out; + ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } + ASSERT(path); - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. + */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3808,7 +3919,9 @@ static int btrfs_read_locked_inode(struct inode *inode, inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); @@ -3899,10 +4012,8 @@ cache_acl: btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -3929,7 +4040,15 @@ cache_acl: } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -3990,13 +4109,15 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; struct btrfs_path *path; struct extent_buffer *leaf; + struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); + btrfs_get_inode_key(inode, &key); + ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -4008,7 +4129,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -4158,6 +4278,7 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode(trans, dir); @@ -4258,9 +4379,9 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { - objectid = inode->root->root_key.objectid; + objectid = btrfs_root_id(inode->root); } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - objectid = inode->location.objectid; + objectid = inode->ref_root_id; } else { WARN_ON(1); fscrypt_free_filename(&fname); @@ -4301,11 +4422,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); - if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + if (IS_ERR(di)) { + ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } @@ -4316,7 +4434,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, - root->root_key.objectid, dir_ino, + btrfs_root_id(root), dir_ino, &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4366,7 +4484,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.objectid == root->root_key.objectid) { + if (key.objectid == btrfs_root_id(root)) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", @@ -4376,21 +4494,27 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_release_path(path); } - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = 0; if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) + if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } out: @@ -4402,75 +4526,38 @@ out: static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; + struct btrfs_inode *inode; + u64 min_ino = 0; if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(entry) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from the inode - * cache when its usage count hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; + inode = btrfs_find_first_inode(root, min_ino); + while (inode) { + if (atomic_read(&inode->vfs_inode.i_count) > 1) + d_prune_aliases(&inode->vfs_inode); - node = rb_next(node); + min_ino = btrfs_ino(inode) + 1; + /* + * btrfs_drop_inode() will have it removed from the inode + * cache when its usage count hits zero. + */ + iput(&inode->vfs_inode); + cond_resched(); + inode = btrfs_find_first_inode(root, min_ino); } - spin_unlock(&root->inode_lock); } int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; + u64 qgroup_reserved = 0; int ret; down_write(&fs_info->subvol_sem); @@ -4485,7 +4572,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", - dest->root_key.objectid); + btrfs_root_id(dest)); ret = -EPERM; goto out_up_write; } @@ -4493,7 +4580,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_up_write; } @@ -4515,12 +4602,15 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -4546,7 +4636,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4554,8 +4644,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - dest->root_key.objectid); + BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4564,7 +4653,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4579,7 +4668,9 @@ out_end_trans: ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(root, &block_rsv); + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_undead: if (ret) { spin_lock(&dest->root_item_lock); @@ -4603,7 +4694,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - int err = 0; + int ret = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; @@ -4619,33 +4710,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } - err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); - if (err) - return err; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_notrans; } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } - err = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) goto out; last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!err) { + if (!ret) { btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to @@ -4667,7 +4758,7 @@ out_notrans: btrfs_btree_balance_dirty(fs_info); fscrypt_free_filename(&fname); - return err; + return ret; } /* @@ -4695,7 +4786,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); - struct page *page; + struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; @@ -4727,8 +4818,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } again: - page = find_or_create_page(mapping, index, mask); - if (!page) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); @@ -4736,15 +4828,15 @@ again: goto out; } - if (!PageUptodate(page)) { - ret = btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto out_unlock; } @@ -4756,19 +4848,19 @@ again: * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -4789,15 +4881,16 @@ again: if (!len) len = blocksize - offset; if (front) - memzero_page(page, (block_start - page_offset(page)), - offset); + folio_zero_range(folio, block_start - folio_pos(folio), + offset); else - memzero_page(page, (block_start - page_offset(page)) + offset, - len); + folio_zero_range(folio, + (block_start - folio_pos(folio)) + offset, + len); } - btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); - btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); @@ -4814,8 +4907,8 @@ out_unlock: block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); @@ -4889,16 +4982,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err = 0; + int ret = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_block(inode, oldsize, 0, 0); - if (err) - return err; + ret = btrfs_truncate_block(inode, oldsize, 0, 0); + if (ret) + return ret; if (size <= hole_start) return 0; @@ -4907,10 +5000,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset); + em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { - err = PTR_ERR(em); + ret = PTR_ERR(em); em = NULL; break; } @@ -4921,13 +5013,13 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; - err = maybe_insert_hole(inode, cur_offset, hole_size); - if (err) + ret = maybe_insert_hole(inode, cur_offset, hole_size); + if (ret) break; - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; hole_em = alloc_extent_map(); @@ -4940,20 +5032,18 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) } hole_em->start = cur_offset; hole_em->len = hole_size; - hole_em->orig_start = cur_offset; - hole_em->block_start = EXTENT_MAP_HOLE; - hole_em->block_len = 0; - hole_em->orig_block_len = 0; + hole_em->disk_bytenr = EXTENT_MAP_HOLE; + hole_em->disk_num_bytes = 0; hole_em->ram_bytes = hole_size; hole_em->generation = btrfs_get_fs_generation(fs_info); - err = btrfs_replace_extent_map_range(inode, hole_em, true); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; } next: @@ -4965,7 +5055,7 @@ next: } free_extent_map(em); unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); - return err; + return ret; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) @@ -5019,10 +5109,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { - ret = btrfs_wait_ordered_range(inode, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) @@ -5052,7 +5142,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. */ - err = btrfs_wait_ordered_range(inode, 0, (u64)-1); + err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); @@ -5222,7 +5312,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, void btrfs_evict_inode(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv = NULL; @@ -5236,11 +5326,12 @@ void btrfs_evict_inode(struct inode *inode) return; } + fs_info = inode_to_fs_info(inode); evict_inode_truncate_pages(inode); if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto out; @@ -5252,7 +5343,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); goto out; } @@ -5424,7 +5515,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = dir->root->root_key.objectid; + key.objectid = btrfs_root_id(dir->root); key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5465,59 +5556,24 @@ out: return err; } -static void inode_tree_add(struct btrfs_inode *inode) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *entry; - struct rb_node **p; - struct rb_node *parent; - struct rb_node *new = &inode->rb_node; - u64 ino = btrfs_ino(inode); - - if (inode_unhashed(&inode->vfs_inode)) - return; - parent = NULL; - spin_lock(&root->inode_lock); - p = &root->inode_tree.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_inode, rb_node); - if (ino < btrfs_ino(entry)) - p = &parent->rb_left; - else if (ino > btrfs_ino(entry)) - p = &parent->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING))); - rb_replace_node(parent, new, &root->inode_tree); - RB_CLEAR_NODE(parent); - spin_unlock(&root->inode_lock); - return; - } - } - rb_link_node(new, parent, p); - rb_insert_color(new, &root->inode_tree); - spin_unlock(&root->inode_lock); -} -static void inode_tree_del(struct btrfs_inode *inode) +static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; - int empty = 0; + struct btrfs_inode *entry; + bool empty = false; - spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&inode->rb_node)) { - rb_erase(&inode->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&inode->rb_node); - empty = RB_EMPTY_ROOT(&root->inode_tree); - } - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + if (entry == inode) + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty && btrfs_root_refs(&root->root_item) == 0) { - spin_lock(&root->inode_lock); - empty = RB_EMPTY_ROOT(&root->inode_tree); - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty) btrfs_add_dead_root(root); } @@ -5528,12 +5584,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->location.objectid = args->ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; + btrfs_set_inode_number(BTRFS_I(inode), args->ino); BTRFS_I(inode)->root = btrfs_grab_root(args->root); - BUG_ON(args->root && !BTRFS_I(inode)->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) @@ -5546,12 +5598,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == BTRFS_I(inode)->location.objectid && + return args->ino == btrfs_ino(BTRFS_I(inode)) && args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, - struct btrfs_root *root) +static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5560,53 +5611,64 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, args.ino = ino; args.root = root; - inode = iget5_locked(s, hashval, btrfs_find_actor, + inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path) +struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { struct inode *inode; + int ret; - inode = btrfs_iget_locked(s, ino, root); + inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - int ret; + if (!(inode->i_state & I_NEW)) + return inode; - ret = btrfs_read_locked_inode(inode, path); - if (!ret) { - inode_tree_add(BTRFS_I(inode)); - unlock_new_inode(inode); - } else { - iget_failed(inode); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode, this means the inode item - * was not found. - */ - if (ret > 0) - ret = -ENOENT; - inode = ERR_PTR(ret); - } - } + ret = btrfs_read_locked_inode(inode, path); + if (ret) + return ERR_PTR(ret); + unlock_new_inode(inode); return inode; } -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) +/* + * Get an inode object given its inode number and corresponding root. + */ +struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(s, ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir, @@ -5620,10 +5682,11 @@ static struct inode *new_simple_dir(struct inode *dir, return ERR_PTR(-ENOMEM); BTRFS_I(inode)->root = btrfs_grab_root(root); - memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); - inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry @@ -5661,11 +5724,11 @@ static inline u8 btrfs_inode_type(struct inode *inode) struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; - struct btrfs_key location; + struct btrfs_key location = { 0 }; u8 di_type = 0; int ret = 0; @@ -5677,7 +5740,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, location.objectid, root); + inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) return inode; @@ -5701,7 +5764,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir, &location, root); } else { - inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); + inode = btrfs_iget(location.objectid, sub_root); btrfs_put_root(sub_root); if (IS_ERR(inode)) @@ -5921,7 +5984,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - put = btrfs_readdir_get_delayed_items(inode, private->last_index, + put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index, &ins_list, &del_list); again: @@ -5994,7 +6057,7 @@ again: * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. @@ -6011,7 +6074,7 @@ nopos: ret = 0; err: if (put) - btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); + btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6096,7 +6159,7 @@ static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; - args.ino = BTRFS_I(inode)->location.objectid; + args.ino = btrfs_ino(BTRFS_I(inode)); args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, @@ -6200,10 +6263,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; - struct btrfs_key *location; struct btrfs_path *path; u64 objectid; struct btrfs_inode_ref *ref; @@ -6212,6 +6274,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_item_batch batch; unsigned long ptr; int ret; + bool xa_reserved = false; path = btrfs_alloc_path(); if (!path) @@ -6221,10 +6284,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); root = BTRFS_I(inode)->root; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + goto out; + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; - inode->i_ino = objectid; + btrfs_set_inode_number(BTRFS_I(inode), objectid); + + ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); + if (ret) + goto out; + xa_reserved = true; if (args->orphan) { /* @@ -6239,8 +6311,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) goto out; } - /* index_cnt is ignored for everything but a dir. */ - BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -6267,11 +6341,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan) @@ -6354,7 +6423,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6370,8 +6438,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in. */ - parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - BTRFS_I(dir)->root); + parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { @@ -6384,8 +6451,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) { btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, - ret); + btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } /* @@ -6400,7 +6466,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(BTRFS_I(inode)); + ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); + if (WARN_ON(ret)) { + /* Shouldn't happen, we used xa_reserve() before. */ + btrfs_abort_transaction(trans, ret); + goto discard; + } trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6428,6 +6499,9 @@ discard: ihold(inode); discard_new_inode(inode); out: + if (xa_reserved) + xa_release(&root->inodes, objectid); + btrfs_free_path(path); return ret; } @@ -6458,7 +6532,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, index, name); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, @@ -6501,7 +6575,7 @@ fail_dir_item: u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, &local_index, name); if (err) btrfs_abort_transaction(trans, err); @@ -6522,7 +6596,7 @@ fail_dir_item: static int btrfs_create_common(struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_new_inode_args new_inode_args = { .dir = dir, @@ -6592,14 +6666,14 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; int err; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) + if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6685,7 +6759,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; @@ -6707,7 +6781,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6718,36 +6793,35 @@ static noinline int uncompress_inline(struct btrfs_path *path, */ if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + folio_zero_range(folio, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, folio, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); return 0; } @@ -6756,7 +6830,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * * @inode: file to search in * @page: page to read extent data into if the extent is inline - * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * @@ -6770,8 +6843,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6794,7 +6866,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->block_start == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) free_extent_map(em); else goto out; @@ -6805,9 +6877,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, goto out; } em->start = EXTENT_MAP_HOLE; - em->orig_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; em->len = (u64)-1; - em->block_len = (u64)-1; path = btrfs_alloc_path(); if (!path) { @@ -6897,9 +6968,8 @@ next: /* New extent overlaps with existing one */ em->start = start; - em->orig_start = start; em->len = found_key.offset - start; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; goto insert; } @@ -6914,7 +6984,6 @@ next: * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ - ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); @@ -6924,19 +6993,18 @@ next: * * Other members are not utilized for inline extents. */ - ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; } not_found: em->start = start; - em->orig_start = start; em->len = len; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); @@ -6949,7 +7017,7 @@ insert: } write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); @@ -6963,84 +7031,6 @@ out: return em; } -static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - const u64 start, - const u64 len, - const u64 orig_start, - const u64 block_start, - const u64 block_len, - const u64 orig_block_len, - const u64 ram_bytes, - const int type) -{ - struct extent_map *em = NULL; - struct btrfs_ordered_extent *ordered; - - if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, block_start, - block_len, orig_block_len, ram_bytes, - BTRFS_COMPRESS_NONE, /* compress_type */ - type); - if (IS_ERR(em)) - goto out; - } - ordered = btrfs_alloc_ordered_extent(inode, start, len, len, - block_start, block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (IS_ERR(ordered)) { - if (em) { - free_extent_map(em); - btrfs_drop_extent_map_range(inode, start, - start + len - 1, false); - } - em = ERR_CAST(ordered); - } else { - ASSERT(!dio_data->ordered); - dio_data->ordered = ordered; - } - out: - - return em; -} - -static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 len) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_map *em; - struct btrfs_key ins; - u64 alloc_hint; - int ret; - - alloc_hint = get_extent_allocation_hint(inode, start, len); -again: - ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); - if (ret == -EAGAIN) { - ASSERT(btrfs_is_zoned(fs_info)); - wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - goto again; - } - if (ret) - return ERR_PTR(ret); - - em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, - ins.objectid, ins.offset, ins.offset, - ins.offset, BTRFS_ORDERED_REGULAR); - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); - - return em; -} - static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; @@ -7063,8 +7053,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write @@ -7075,10 +7063,10 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict) + struct btrfs_file_extent *file_extent, + bool nowait) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; @@ -7126,12 +7114,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); - if (ram_bytes) - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); @@ -7145,14 +7130,16 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } ret = 0; - if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) + if (btrfs_extent_readonly(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset)) goto out; if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + nocow_args.num_bytes, + range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { @@ -7161,143 +7148,75 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } } - if (orig_start) - *orig_start = key.offset - nocow_args.extent_offset; - if (orig_block_len) - *orig_block_len = nocow_args.disk_num_bytes; + if (file_extent) + memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); - *len = nocow_args.num_bytes; + *len = nocow_args.file_extent.num_bytes; ret = 1; out: btrfs_free_path(path); return ret; } -static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, - unsigned int iomap_flags) -{ - const bool writing = (iomap_flags & IOMAP_WRITE); - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - int ret = 0; - - while (1) { - if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; - } else { - lock_extent(io_tree, lockstart, lockend, cached_state); - } - /* - * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure there's no ordered - * extents in this range. - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, - lockend - lockstart + 1); - - /* - * We need to make sure there are no buffered pages in this - * range either, we could have raced between the invalidate in - * generic_file_direct_write and locking the extent. The - * invalidate needs to happen so that reads after a write do not - * get stale data. - */ - if (!ordered && - (!writing || !filemap_range_has_page(inode->i_mapping, - lockstart, lockend))) - break; - - unlock_extent(io_tree, lockstart, lockend, cached_state); - - if (ordered) { - if (nowait) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - break; - } - /* - * If we are doing a DIO read and the ordered extent we - * found is for a buffered write, we can not wait for it - * to complete and retry, because if we do so we can - * deadlock with concurrent buffered writes on page - * locks. This happens only if our DIO read covers more - * than one extent map, if at this point has already - * created an ordered extent for a previous extent map - * and locked its range in the inode's io tree, and a - * concurrent write against that previous extent map's - * range and this range started (we unlock the ranges - * in the io tree only when the bios complete and - * buffered writes always lock pages before attempting - * to lock range in the io tree). - */ - if (writing || - test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered); - else - ret = nowait ? -EAGAIN : -ENOTBLK; - btrfs_put_ordered_extent(ordered); - } else { - /* - * We could trigger writeback for this range (and wait - * for it to complete) and then invalidate the pages for - * this range (through invalidate_inode_pages2_range()), - * but that can lead us to a deadlock with a concurrent - * call to readahead (a buffered read or a defrag call - * triggered a readahead) on a page lock due to an - * ordered dio extent we created before but did not have - * yet a corresponding bio submitted (whence it can not - * complete), which makes readahead wait for that - * ordered extent to complete while holding a lock on - * that page. - */ - ret = nowait ? -EAGAIN : -ENOTBLK; - } - - if (ret) - break; - - cond_resched(); - } - - return ret; -} - /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type) +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, + const struct btrfs_file_extent *file_extent, + int type) { struct extent_map *em; int ret; + /* + * Note the missing NOCOW type. + * + * For pure NOCOW writes, we should not create an io extent map, but + * just reusing the existing one. + * Only PREALLOC writes (NOCOW write into preallocated range) can + * create an io extent map. + */ ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || - type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); + switch (type) { + case BTRFS_ORDERED_PREALLOC: + /* We're only referring part of a larger preallocated extent. */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + case BTRFS_ORDERED_REGULAR: + /* COW results a new extent matching our file extent size. */ + ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); + ASSERT(file_extent->ram_bytes == file_extent->num_bytes); + + /* Since it's a new extent, we should not have any offset. */ + ASSERT(file_extent->offset == 0); + break; + case BTRFS_ORDERED_COMPRESSED: + /* Must be compressed. */ + ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE); + + /* + * Encoded write can make us to refer to part of the + * uncompressed extent. + */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + } + em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); em->start = start; - em->orig_start = orig_start; - em->len = len; - em->block_len = block_len; - em->block_start = block_start; - em->orig_block_len = orig_block_len; - em->ram_bytes = ram_bytes; + em->len = file_extent->num_bytes; + em->disk_bytenr = file_extent->disk_bytenr; + em->disk_num_bytes = file_extent->disk_num_bytes; + em->ram_bytes = file_extent->ram_bytes; em->generation = -1; + em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; - if (type == BTRFS_ORDERED_PREALLOC) - em->flags |= EXTENT_FLAG_FILLING; - else if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, compress_type); + if (type == BTRFS_ORDERED_COMPRESSED) + extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { @@ -7309,591 +7228,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, return em; } - -static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 *lenp, - unsigned int iomap_flags) -{ - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map *em = *map; - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - struct btrfs_block_group *bg; - bool can_nocow = false; - bool space_reserved = false; - u64 len = *lenp; - u64 prev_len; - int ret = 0; - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. - * - */ - if ((em->flags & EXTENT_FLAG_PREALLOC) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - if (em->flags & EXTENT_FLAG_PREALLOC) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false, false) == 1) { - bg = btrfs_inc_nocow_writers(fs_info, block_start); - if (bg) - can_nocow = true; - } - } - - prev_len = len; - if (can_nocow) { - struct extent_map *em2; - - /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - nowait); - if (ret < 0) { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - btrfs_dec_nocow_writers(bg); - if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) - ret = -EAGAIN; - goto out; - } - space_reserved = true; - - em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); - btrfs_dec_nocow_writers(bg); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em2; - em = em2; - } - - if (IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - - dio_data->nocow_done = true; - } else { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - - if (nowait) { - ret = -EAGAIN; - goto out; - } - - /* - * If we could not allocate data space before locking the file - * range and we can't do a NOCOW write, then we have to fail. - */ - if (!dio_data->data_space_reserved) { - ret = -ENOSPC; - goto out; - } - - /* - * We have to COW and we have already reserved data space before, - * so now we reserve only metadata. - */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - false); - if (ret < 0) - goto out; - space_reserved = true; - - em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - *map = em; - len = min(len, em->len - (start - em->start)); - if (len < prev_len) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - prev_len - len, true); - } - - /* - * We have created our ordered extent, so we can now release our reservation - * for an outstanding extent. - */ - btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); - - /* - * Need to update the i_size under the extent lock so buffered - * readers will get the updated i_size when we unlock. - */ - if (start + len > i_size_read(inode)) - i_size_write(inode, start + len); -out: - if (ret && space_reserved) { - btrfs_delalloc_release_extents(BTRFS_I(inode), len); - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } - *lenp = len; - return ret; -} - -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - loff_t length, unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map *em; - struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = iter->private; - u64 lockstart, lockend; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - u64 len = length; - const u64 data_alloc_len = length; - bool unlock_extents = false; - - /* - * We could potentially fault if we have a buffer > PAGE_SIZE, and if - * we're NOWAIT we may submit a bio for a partial range and return - * EIOCBQUEUED, which would result in an errant short read. - * - * The best way to handle this would be to allow for partial completions - * of iocb's, so we could submit the partial bio, return and fault in - * the rest of the pages, and then submit the io for the rest of the - * range. However we don't have that currently, so simply return - * -EAGAIN at this point so that the normal path is used. - */ - if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) - return -EAGAIN; - - /* - * Cap the size of reads to that usually seen in buffered I/O as we need - * to allocate a contiguous array for the checksums. - */ - if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); - - lockstart = start; - lockend = start + len - 1; - - /* - * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't - * enough if we've written compressed pages to this area, so we need to - * flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk - the first flush only starts - * compression on the data, while keeping the pages locked, so by the - * time the second flush returns we know bios for the compressed pages - * were submitted and finished, and the pages no longer under writeback. - * - * If we have a NOWAIT request and we have any pages in the range that - * are locked, likely due to compression still in progress, we don't want - * to block on page locks. We also don't want to block on pages marked as - * dirty or under writeback (same as for the non-compression case). - * iomap_dio_rw() did the same check, but after that and before we got - * here, mmap'ed writes may have happened or buffered reads started - * (readpage() and readahead(), which lock pages), as we haven't locked - * the file range yet. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - if (flags & IOMAP_NOWAIT) { - if (filemap_range_needs_writeback(inode->i_mapping, - lockstart, lockend)) - return -EAGAIN; - } else { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; - } - } - - memset(dio_data, 0, sizeof(*dio_data)); - - /* - * We always try to allocate data space and must do it before locking - * the file range, to avoid deadlocks with concurrent writes to the same - * range if the range has several extents and the writes don't expand the - * current i_size (the inode lock is taken in shared mode). If we fail to - * allocate data space here we continue and later, after locking the - * file range, we fail with ENOSPC only if we figure out we can not do a - * NOCOW write. - */ - if (write && !(flags & IOMAP_NOWAIT)) { - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, data_alloc_len, false); - if (!ret) - dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) - goto err; - } - - /* - * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered IO, or we are doing a - * NOWAIT read/write and we need to block. - */ - ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); - if (ret < 0) - goto err; - - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } - - /* - * Ok for INLINE and COMPRESSED extents we need to fallback on buffered - * io. INLINE is special, and we could probably kludge it in here, but - * it's still buffered so for safety lets just fall back to the generic - * buffered path. - * - * For COMPRESSED we _have_ to read the entire extent in so we can - * decompress it, so there will be buffering required no matter what we - * do, so go ahead and fallback to buffered. - * - * We return -ENOTBLK because that's what makes DIO go ahead and go back - * to buffered IO. Don't blame me, this is the price we pay for using - * the generic code. - */ - if (extent_map_is_compressed(em) || - em->block_start == EXTENT_MAP_INLINE) { - free_extent_map(em); - /* - * If we are in a NOWAIT context, return -EAGAIN in order to - * fallback to buffered IO. This is not only because we can - * block with buffered IO (no support for NOWAIT semantics at - * the moment) but also to avoid returning short reads to user - * space - this happens if we were able to read some data from - * previous non-compressed extents and then when we fallback to - * buffered IO, at btrfs_file_read_iter() by calling - * filemap_read(), we fail to fault in pages for the read buffer, - * in which case filemap_read() returns a short read (the number - * of bytes previously read is > 0, so it does not return -EFAULT). - */ - ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; - goto unlock_err; - } - - len = min(len, em->len - (start - em->start)); - - /* - * If we have a NOWAIT request and the range contains multiple extents - * (or a mix of extents and holes), then we return -EAGAIN to make the - * caller fallback to a context where it can do a blocking (without - * NOWAIT) request. This way we avoid doing partial IO and returning - * success to the caller, which is not optimal for writes and for reads - * it can result in unexpected behaviour for an application. - * - * When doing a read, because we use IOMAP_DIO_PARTIAL when calling - * iomap_dio_rw(), we can end up returning less data then what the caller - * asked for, resulting in an unexpected, and incorrect, short read. - * That is, the caller asked to read N bytes and we return less than that, - * which is wrong unless we are crossing EOF. This happens if we get a - * page fault error when trying to fault in pages for the buffer that is - * associated to the struct iov_iter passed to iomap_dio_rw(), and we - * have previously submitted bios for other extents in the range, in - * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of - * those bios have completed by the time we get the page fault error, - * which we return back to our caller - we should only return EIOCBQUEUED - * after we have submitted bios for all the extents in the range. - */ - if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); - ret = -EAGAIN; - goto unlock_err; - } - - if (write) { - ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, &len, flags); - if (ret < 0) - goto unlock_err; - unlock_extents = true; - /* Recalc len in case the new em is smaller than requested */ - len = min(len, em->len - (start - em->start)); - if (dio_data->data_space_reserved) { - u64 release_offset; - u64 release_len = 0; - - if (dio_data->nocow_done) { - release_offset = start; - release_len = data_alloc_len; - } else if (len < data_alloc_len) { - release_offset = start + len; - release_len = data_alloc_len - len; - } - - if (release_len > 0) - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - release_offset, - release_len); - } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; - } - - if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - else - free_extent_state(cached_state); - - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does - * that, since we have locked only the parts we are performing I/O in. - */ - if ((em->block_start == EXTENT_MAP_HOLE) || - ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - } else { - iomap->addr = em->block_start + (start - em->start); - iomap->type = IOMAP_MAPPED; - } - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_dev->bdev; - iomap->length = len; - free_extent_map(em); - - return 0; - -unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); -err: - if (dio_data->data_space_reserved) { - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - start, data_alloc_len); - extent_changeset_free(dio_data->data_reserved); - } - - return ret; -} - -static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - ssize_t written, unsigned int flags, struct iomap *iomap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_dio_data *dio_data = iter->private; - size_t submitted = dio_data->submitted; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - - if (!write && (iomap->type == IOMAP_HOLE)) { - /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); - return 0; - } - - if (submitted < length) { - pos += submitted; - length -= submitted; - if (write) - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - pos, length, false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); - ret = -ENOTBLK; - } - if (write) { - btrfs_put_ordered_extent(dio_data->ordered); - dio_data->ordered = NULL; - } - - if (write) - extent_changeset_free(dio_data->data_reserved); - return ret; -} - -static void btrfs_dio_end_io(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_inode *inode = bbio->inode; - struct bio *bio = &bbio->bio; - - if (bio->bi_status) { - btrfs_warn(inode->root->fs_info, - "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", - btrfs_ino(inode), bio->bi_opf, - dip->file_offset, dip->bytes, bio->bi_status); - } - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_finish_ordered_extent(bbio->ordered, NULL, - dip->file_offset, dip->bytes, - !bio->bi_status); - } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - bbio->bio.bi_private = bbio->private; - iomap_dio_bio_end_io(bio); -} - -static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, - loff_t file_offset) -{ - struct btrfs_bio *bbio = btrfs_bio(bio); - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_dio_data *dio_data = iter->private; - - btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, - btrfs_dio_end_io, bio->bi_private); - bbio->inode = BTRFS_I(iter->inode); - bbio->file_offset = file_offset; - - dip->file_offset = file_offset; - dip->bytes = bio->bi_iter.bi_size; - - dio_data->submitted += bio->bi_iter.bi_size; - - /* - * Check if we are doing a partial write. If we are, we need to split - * the ordered extent to match the submitted bio. Hang on to the - * remaining unfinishable ordered_extent in dio_data so that it can be - * cancelled in iomap_end to avoid a deadlock wherein faulting the - * remaining pages is blocked on the outstanding ordered extent. - */ - if (iter->flags & IOMAP_WRITE) { - int ret; - - ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); - if (ret) { - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - file_offset, dip->bytes, - !ret); - bio->bi_status = errno_to_blk_status(ret); - iomap_dio_bio_end_io(bio); - return; - } - } - - btrfs_submit_bio(bbio, 0); -} - -static const struct iomap_ops btrfs_dio_iomap_ops = { - .iomap_begin = btrfs_dio_iomap_begin, - .iomap_end = btrfs_dio_iomap_end, -}; - -static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_dio_submit_io, - .bio_set = &btrfs_dio_bioset, -}; - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - int ret; - - ret = fiemap_prep(inode, fieinfo, start, &len, 0); - if (ret) - return ret; - - /* - * fiemap_prep() called filemap_write_and_wait() for the whole possible - * file range (0 to LLONG_MAX), but that is not enough if we have - * compression enabled. The first filemap_fdatawrite_range() only kicks - * in the compression of data (in an async thread) and will return - * before the compression is done and writeback is started. A second - * filemap_fdatawrite_range() is needed to wait for the compression to - * complete and writeback to start. We also need to wait for ordered - * extents to complete, because our fiemap implementation uses mainly - * file extent items to list the extents, searching for extent maps - * only for file ranges with holes or prealloc extents to figure out - * if we have delalloc in those ranges. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) - return ret; - } - - btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); - - /* - * We did an initial flush to avoid holding the inode's lock while - * triggering writeback and waiting for the completion of IO and ordered - * extents. Now after we locked the inode we do it again, because it's - * possible a new write may have happened in between those two steps. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) { - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - return ret; - } - } - - ret = extent_fiemap(btrfs_inode, fieinfo, start, len); - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - - return ret; -} - -static int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return extent_writepages(mapping, wbc); -} - -static void btrfs_readahead(struct readahead_control *rac) -{ - extent_readahead(rac); -} - /* * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. @@ -7901,13 +7235,12 @@ static void btrfs_readahead(struct readahead_control *rac) * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page->mapping)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7928,15 +7261,20 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } -static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +static int btrfs_launder_folio(struct folio *folio) { - int ret = try_release_extent_mapping(&folio->page, gfp_flags); + return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), + PAGE_SIZE, NULL); +} - if (ret == 1) { - wait_subpage_spinlock(&folio->page); - clear_page_extent_mapped(&folio->page); +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +{ + if (try_release_extent_mapping(folio, gfp_flags)) { + wait_subpage_spinlock(folio); + clear_folio_extent_mapped(folio); + return true; } - return ret; + return false; } static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -7970,7 +7308,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, static void btrfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_inode *inode = folio_to_inode(folio); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; @@ -7985,7 +7323,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered - * (Private2) already cleared, so it's possible for endio and + * already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * @@ -7993,7 +7331,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like @@ -8051,7 +7389,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* - * If Ordered (Private2) is cleared, it means endio has + * If Ordered is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. @@ -8124,181 +7462,14 @@ next: } /* * We have iterated through all ordered extents of the page, the page - * should not have Ordered (Private2) anymore, or the above iteration + * should not have Ordered anymore, or the above iteration * did something wrong. */ ASSERT(!folio_test_ordered(folio)); btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); -} - -/* - * btrfs_page_mkwrite() is not allowed to change the file size as it gets - * called from a page fault handler when a page is first dirtied. Hence we must - * be careful to check for EOF conditions here. We set the page up correctly - * for a written page which means we get ENOSPC checking when writing into - * holes and correct delalloc and unwritten extent mapping on filesystems that - * support these features. - * - * We are not allowed to take the i_mutex here so we have to play games to - * protect against truncate races as the page could now be beyond EOF. Because - * truncate_setsize() writes the inode size before removing pages, once we have - * the page lock we can determine safely if the page is beyond EOF. If it is not - * beyond EOF, then the page is guaranteed safe against truncation until we - * unlock the page. - */ -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct folio *folio = page_folio(page); - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_changeset *data_reserved = NULL; - unsigned long zero_start; - loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; - u64 reserved_space; - u64 page_start; - u64 page_end; - u64 end; - - ASSERT(folio_order(folio) == 0); - - reserved_space = PAGE_SIZE; - - sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - end = page_end; - - /* - * Reserving delalloc space after obtaining the page lock can lead to - * deadlock. For example, if a dirty page is locked by this function - * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepages() function could - * end up waiting indefinitely to get a lock on the page currently - * being processed by btrfs_page_mkwrite() function. - */ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; - goto out_noreserve; - } - - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ -again: - down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); - size = i_size_read(inode); - - if ((page->mapping != inode->i_mapping) || - (page_start >= size)) { - /* page got truncated out from underneath us */ - goto out_unlock; - } - wait_on_page_writeback(page); - - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); - goto out_unlock; - } - - /* - * we can't set the delalloc bits if there are pending ordered - * extents. Drop our locks and wait for them to finish - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } - - if (page->index == ((size - 1) >> PAGE_SHIFT)) { - reserved_space = round_up(size - page_start, - fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { - end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); - } - } - - /* - * page_mkwrite gets called when the page is firstly dirtied after it's - * faulted in, but write(2) could also dirty a page and set delalloc - * bits, thus in this case for space account reason, we still need to - * clear any delalloc bits within this page range since we have to - * reserve data&meta space before lock_page() (see above comments). - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); - - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; - goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); - else - zero_start = PAGE_SIZE; - - if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); - - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); - btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); - btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); - - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - - unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); - - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - -out_unlock: - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); -out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); -out_noreserve: - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) @@ -8318,7 +7489,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(&inode->vfs_inode, + ret = btrfs_wait_ordered_range(inode, inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) @@ -8519,20 +7690,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; - struct extent_io_tree *file_extent_tree = NULL; - - /* Self tests may pass a NULL fs_info. */ - if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) { - file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); - if (!file_extent_tree) - return NULL; - } ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); - if (!ei) { - kfree(file_extent_tree); + if (!ei) return NULL; - } ei->root = NULL; ei->generation = 0; @@ -8545,8 +7706,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->disk_i_size = 0; ei->flags = 0; ei->ro_flags = 0; + /* + * ->index_cnt will be properly initialized later when creating a new + * inode (btrfs_create_new_inode()) or when reading an existing inode + * from disk (btrfs_read_locked_inode()). + */ ei->csum_bytes = 0; - ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_reflink_trans = 0; @@ -8573,20 +7738,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; - ei->file_extent_tree = file_extent_tree; - if (file_extent_tree) { - extent_io_tree_init(fs_info, ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); - /* Lockdep class is set only for the file extent tree. */ - lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class); - } + ei->file_extent_tree = NULL; + mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); - RB_CLEAR_NODE(&ei->rb_node); init_rwsem(&ei->i_mmap_lock); return inode; @@ -8622,9 +7781,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!S_ISDIR(vfs_inode->i_mode)) { WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); } - WARN_ON(inode->csum_bytes); - WARN_ON(inode->defrag_bytes); + if (!root || !btrfs_is_data_reloc_root(root)) + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8658,7 +7818,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) } } btrfs_qgroup_check_reserved_leak(inode); - inode_tree_del(inode); + btrfs_del_inode_from_root(inode); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); @@ -8692,7 +7852,6 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. */ rcu_barrier(); - bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); } @@ -8700,20 +7859,12 @@ int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) - goto fail; - - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bbio.bio), - BIOSET_NEED_BVECS)) - goto fail; + return -ENOMEM; return 0; -fail: - btrfs_destroy_cachep(); - return -ENOMEM; } static int btrfs_getattr(struct mnt_idmap *idmap, @@ -8723,7 +7874,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u64 delalloc_bytes; u64 inode_bytes; struct inode *inode = d_inode(path->dentry); - u32 blocksize = inode->i_sb->s_blocksize; + u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize; u32 bi_flags = BTRFS_I(inode)->flags; u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; @@ -8749,6 +7900,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; + stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->result_mask |= STATX_SUBVOL; + spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; inode_bytes = inode_get_bytes(inode); @@ -8763,7 +7917,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -8912,31 +8066,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -9015,7 +8183,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_new_inode_args whiteout_args = { .dir = old_dir, .dentry = old_dentry, @@ -9172,16 +8340,23 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -9189,18 +8364,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } @@ -9457,7 +8641,7 @@ out: static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; @@ -9540,7 +8724,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -9628,7 +8811,7 @@ free_qgroup: * or we leak qgroup data reservation. */ btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, qgroup_released, + btrfs_root_id(inode->root), qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } @@ -9638,7 +8821,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -9703,11 +8886,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } em->start = cur_offset; - em->orig_start = cur_offset; em->len = ins.offset; - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; + em->disk_bytenr = ins.objectid; + em->offset = 0; + em->disk_num_bytes = ins.offset; em->ram_bytes = ins.offset; em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; @@ -9790,7 +8972,7 @@ static int btrfs_permission(struct mnt_idmap *idmap, static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode; @@ -9848,28 +9030,6 @@ out_inode: return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(page_folio(page)) == 0); - btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); - put_page(page); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { @@ -9914,12 +9074,16 @@ static ssize_t btrfs_encoded_read_inline( unsigned long ptr; void *tmp; ssize_t ret; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } + + path->nowait = nowait; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { @@ -9982,8 +9146,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; - atomic_t pending; + struct completion done; + void *uring_ctx; + refcount_t pending_refs; blk_status_t status; }; @@ -10002,26 +9167,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); + if (refcount_dec_and_test(&priv->pending_refs)) { + int err = blk_status_to_errno(READ_ONCE(priv->status)); + + if (priv->uring_ctx) { + btrfs_uring_read_extent_endio(priv->uring_ctx, err); + kfree(priv); + } else { + complete(&priv->done); + } + } bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; - init_waitqueue_head(&priv.wait); + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); + priv->status = 0; + priv->uring_ctx = uring_ctx; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -10029,11 +9208,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -10044,22 +9223,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. */ - return blk_status_to_errno(READ_ONCE(priv.status)); + if (uring_ctx) { + if (refcount_dec_and_test(&priv->pending_refs)) { + ret = blk_status_to_errno(READ_ONCE(priv->status)); + btrfs_uring_read_extent_endio(uring_ctx, ret); + kfree(priv); + return ret; + } + + return -EIOCBQUEUED; + } else { + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); + /* See btrfs_encoded_read_endio() for ordering. */ + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; + } } -static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, - struct iov_iter *iter, - u64 start, u64 lockend, - struct extent_state **cached_state, - u64 disk_bytenr, u64 disk_io_size, - size_t count, bool compressed, - bool *unlocked) +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; @@ -10073,14 +9263,14 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - ret = btrfs_alloc_page_array(nr_pages, pages, 0); + ret = btrfs_alloc_page_array(nr_pages, pages, false); if (ret) { ret = -ENOMEM; goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, - disk_io_size, pages); + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, NULL); if (ret) goto out; @@ -10120,21 +9310,26 @@ out: } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded) + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); - u64 start, lockend, disk_bytenr, disk_io_size; - struct extent_state *cached_state = NULL; + u64 start, lockend; struct extent_map *em; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); bool unlocked = false; file_accessed(iocb->ki_filp); - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = btrfs_inode_lock(inode, + BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); + if (ret) + return ret; if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -10147,30 +9342,55 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - for (;;) { + if (nowait) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, - lockend - start + 1); - if (ret) + if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, + start, lockend)) { + ret = -EAGAIN; + goto out_unlock_inode; + } + + if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + ret = -EAGAIN; goto out_unlock_inode; - lock_extent(io_tree, start, lockend, &cached_state); + } + ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); - if (!ordered) - break; - btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, &cached_state); - cond_resched(); + if (ordered) { + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + ret = -EAGAIN; + goto out_unlock_inode; + } + } else { + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + + lock_extent(io_tree, start, lockend, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + cond_resched(); + } } - em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; } - if (em->block_start == EXTENT_MAP_INLINE) { + if (em->disk_bytenr == EXTENT_MAP_INLINE) { u64 extent_start = em->start; /* @@ -10180,9 +9400,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, - &cached_state, extent_start, + cached_state, extent_start, count, encoded, &unlocked); - goto out; + goto out_unlock_extent; } /* @@ -10191,73 +9411,68 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; - if (em->block_start == EXTENT_MAP_HOLE || + if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { - disk_bytenr = EXTENT_MAP_HOLE; + *disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; } else if (extent_map_is_compressed(em)) { - disk_bytenr = em->block_start; + *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. */ - if (em->block_len > count) { + if (em->disk_num_bytes > count) { ret = -ENOBUFS; goto out_em; } - disk_io_size = em->block_len; - count = em->block_len; + *disk_io_size = em->disk_num_bytes; + count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; - encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - disk_bytenr = em->block_start + (start - em->start); + *disk_bytenr = extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. */ - disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; - count = start + disk_io_size - iocb->ki_pos; + *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + *disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; - disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; - if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, &cached_state); + if (*disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { - ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, - &cached_state, disk_bytenr, - disk_io_size, count, - encoded->compression, - &unlocked); + ret = -EIOCBQUEUED; + goto out_unlock_extent; } -out: - if (ret >= 0) - iocb->ki_pos += encoded->len; out_em: free_extent_map(em); out_unlock_extent: - if (!unlocked) - unlock_extent(io_tree, start, lockend, &cached_state); + /* Leave inode and extent locked if we need to do a read. */ + if (!unlocked && ret != -EIOCBQUEUED) + unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: - if (!unlocked) + if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -10272,12 +9487,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; int compression; size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_pages, i; - struct page **pages; + unsigned long nr_folios, i; + struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -10366,24 +9582,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); - if (!pages) + nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); + if (!folios) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; - pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pages[i]) { + folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!folios[i]) { ret = -ENOMEM; - goto out_pages; + goto out_folios; } - kaddr = kmap_local_page(pages[i]); + kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; - goto out_pages; + goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); @@ -10393,14 +9609,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, for (;;) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) - goto out_pages; + goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_pages; + goto out_folios; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -10428,10 +9644,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_qgroup_free_data; /* Try an inline extent first. */ - if (start == 0 && encoded->unencoded_len == encoded->len && - encoded->unencoded_offset == 0) { - ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages, true); + if (encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0 && + can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { + ret = __cow_file_range_inline(inode, encoded->len, + orig_count, compression, folios[0], + true); if (ret <= 0) { if (ret == 0) ret = orig_count; @@ -10445,22 +9663,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_delalloc_release; extent_reserved = true; - em = create_io_em(inode, start, num_bytes, - start - encoded->unencoded_offset, ins.objectid, - ins.offset, ins.offset, ram_bytes, compression, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = num_bytes; + file_extent.ram_bytes = ram_bytes; + file_extent.offset = encoded->unencoded_offset; + file_extent.compression = compression; + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserved; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, - ins.objectid, ins.offset, - encoded->unencoded_offset, + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED), - compression); + (1 << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -10475,7 +9693,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; @@ -10497,12 +9715,12 @@ out_free_data_space: btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); -out_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - __free_page(pages[i]); +out_folios: + for (i = 0; i < nr_folios; i++) { + if (folios[i]) + folio_put(folios[i]); } - kvfree(pages); + kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; @@ -10649,39 +9867,59 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care. */ - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -10696,7 +9934,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -10710,7 +9949,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -10726,11 +9966,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", - root->root_key.objectid); - return -EPERM; + btrfs_root_id(root)); + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -10738,24 +9980,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->block_start == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). + */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->block_start == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -10767,23 +10024,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = em->block_start + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -10818,7 +10097,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10859,20 +10137,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10885,6 +10166,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; @@ -10893,7 +10178,6 @@ out: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else @@ -10955,7 +10239,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en if (ordered) { btrfs_err(root->fs_info, "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", - start, end, btrfs_ino(inode), root->root_key.objectid, + start, end, btrfs_ino(inode), btrfs_root_id(root), ordered->file_offset, ordered->file_offset + ordered->num_bytes - 1); btrfs_put_ordered_extent(ordered); @@ -10964,6 +10248,36 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en ASSERT(ordered == NULL); } +/* + * Find the first inode with a minimum number. + * + * @root: The root to search for. + * @min_ino: The minimum inode number. + * + * Find the first inode in the @root with a number >= @min_ino and return it. + * Returns NULL if no such inode found. + */ +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + if (igrab(&inode->vfs_inode)) + break; + + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -11016,6 +10330,7 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, + .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9d1eac15e09e..6c18bad53cd3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,16 +29,15 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> +#include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" -#include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" @@ -47,9 +46,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" -#include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -231,6 +228,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) +{ + if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + +static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2) +{ + if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. @@ -247,7 +258,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; @@ -365,15 +376,15 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, - strlen(comp), 0); + ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, - 0, 0); + ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,86 +403,6 @@ update_flags: return ret; } -/* - * Start exclusive operation @type, return true on success - */ -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - bool ret = false; - - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { - fs_info->exclusive_operation = type; - ret = true; - } - spin_unlock(&fs_info->super_lock); - - return ret; -} - -/* - * Conditionally allow to enter the exclusive operation in case it's compatible - * with the running one. This must be paired with btrfs_exclop_start_unlock and - * btrfs_exclop_finish. - * - * Compatibility: - * - the same type is already running - * - when trying to add a device and balance has been paused - * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller - * must check the condition first that would allow none -> @type - */ -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type || - (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && - type == BTRFS_EXCLOP_DEV_ADD)) - return true; - - spin_unlock(&fs_info->super_lock); - return false; -} - -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) -{ - spin_unlock(&fs_info->super_lock); -} - -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) -{ - spin_lock(&fs_info->super_lock); - WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); - spin_unlock(&fs_info->super_lock); - sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); -} - -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op) -{ - switch (op) { - case BTRFS_EXCLOP_BALANCE_PAUSED: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || - fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || - fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; - spin_unlock(&fs_info->super_lock); - break; - case BTRFS_EXCLOP_BALANCE: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; - spin_unlock(&fs_info->super_lock); - break; - default: - btrfs_warn(fs_info, - "invalid exclop balance operation %d requested", op); - } -} - static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); @@ -528,29 +459,16 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * block group is in the logical address space, which can be any * sectorsize aligned bytenr in the range [0, U64_MAX]. */ - if (range.len < fs_info->sb->s_blocksize) + if (range.len < fs_info->sectorsize) return -EINVAL; range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); - if (ret < 0) - return ret; if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; - return 0; -} - -int __pure btrfs_is_empty_uuid(u8 *uuid) -{ - int i; - - for (i = 0; i < BTRFS_UUID_SIZE; i++) { - if (uuid[i]) - return 0; - } - return 1; + return ret; } /* @@ -584,7 +502,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; @@ -603,6 +521,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, int ret; dev_t anon_dev; u64 objectid; + u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -640,19 +559,19 @@ static noinline int create_subvol(struct mnt_idmap *idmap, trans_num_items, false); if (ret) goto out_new_inode_args; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv); - goto out_new_inode_args; + goto out_release_rsv; } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - /* Tree log can't currently deal with an inode which is a new root. */ - btrfs_set_log_full_commit(trans); - ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit); if (ret) goto out; @@ -703,6 +622,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { + int ret2; + /* * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the @@ -713,7 +634,9 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_tree_lock(leaf); btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); - btrfs_free_tree_block(trans, objectid, leaf, 0, 1); + ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1); + if (ret2 < 0) + btrfs_abort_transaction(trans, ret2); free_extent_buffer(leaf); goto out; } @@ -751,15 +674,19 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; out: trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv); - btrfs_end_transaction(trans); +out_release_rsv: + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -776,11 +703,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv *block_rsv; + u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. */ @@ -817,24 +746,24 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, - BTRFS_BLOCK_RSV_TEMP); + block_rsv = &pending_snapshot->block_rsv; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; - ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; + qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; - pending_snapshot->dir = dir; + pending_snapshot->dir = BTRFS_I(dir); pending_snapshot->inherit = inherit; trans = btrfs_start_transaction(root, 0); @@ -842,6 +771,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(trans); goto fail; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) { + btrfs_end_transaction(trans); + goto fail; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; @@ -871,7 +807,9 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); @@ -910,7 +848,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(d_inode(victim->d_parent) != dir); + /* The @victim is not inside @dir. */ + if (d_inode(victim->d_parent) != dir) + return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); @@ -962,7 +902,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; @@ -1018,7 +958,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { int ret; - bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is @@ -1037,15 +976,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, * creation. */ atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } @@ -1097,7 +1034,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 new_size; u64 old_size; u64 devid = 1; @@ -1128,7 +1065,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = PTR_ERR(vol_args); goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); @@ -1275,14 +1215,14 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, readonly, inherit); } else { - struct fd src = fdget(fd); + CLASS(fd, src)(fd); struct inode *src_inode; - if (!src.file) { + if (fd_empty(src)) { ret = -EINVAL; goto out_drop_write; } - src_inode = file_inode(src.file); + src_inode = file_inode(fd_file(src)); if (src_inode->i_sb != file_inode(file)->i_sb) { btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); @@ -1308,7 +1248,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, BTRFS_I(src_inode)->root, readonly, inherit); } - fdput(src); } out_drop_write: mnt_drop_write_file(file); @@ -1328,12 +1267,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); +out: kfree(vol_args); return ret; } @@ -1352,7 +1294,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto free_args; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; @@ -1362,7 +1306,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - u64 nums; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE) { @@ -1375,19 +1319,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (inherit->num_qgroups > PAGE_SIZE || - inherit->num_ref_copies > PAGE_SIZE || - inherit->num_excl_copies > PAGE_SIZE) { - ret = -EINVAL; - goto free_inherit; - } - - nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + - 2 * inherit->num_excl_copies; - if (vol_args->size != struct_size(inherit, qgroups, nums)) { - ret = -EINVAL; + ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size); + if (ret < 0) goto free_inherit; - } } ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), @@ -1405,7 +1339,7 @@ free_args: static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1428,7 +1362,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1481,7 +1415,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, spin_unlock(&root->root_item_lock); btrfs_warn(fs_info, "Attempt to set subvolume %llu read-write during send", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_drop_sem; } @@ -1675,7 +1609,7 @@ static noinline int search_ioctl(struct inode *inode, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *info = inode_to_fs_info(inode); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; @@ -1888,9 +1822,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_ioctl_ino_lookup_user_args *args) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct super_block *sb = inode->i_sb; - struct btrfs_key upper_limit = BTRFS_I(inode)->location; - u64 treeid = BTRFS_I(inode)->root->root_key.objectid; + u64 upper_limit = btrfs_ino(BTRFS_I(inode)); + u64 treeid = btrfs_root_id(BTRFS_I(inode)->root); u64 dirid = args->dirid; unsigned long item_off; unsigned long item_len; @@ -1915,7 +1848,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * If the bottom subvolume does not exist directly under upper_limit, * construct the path in from the bottom up. */ - if (dirid != upper_limit.objectid) { + if (dirid != upper_limit) { ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); @@ -1977,7 +1910,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * btree and lock the same leaf. */ btrfs_release_path(path); - temp_inode = btrfs_iget(sb, key2.objectid, root); + temp_inode = btrfs_iget(key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; @@ -1990,7 +1923,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - if (key.offset == upper_limit.objectid) + if (key.offset == upper_limit) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; @@ -2062,7 +1995,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) - args->treeid = root->root_key.objectid; + args->treeid = btrfs_root_id(root); if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2111,7 +2044,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) inode = file_inode(file); if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && - BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { /* * The subvolume does not exist under fd with which this is * called @@ -2158,7 +2091,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ - key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.objectid = btrfs_root_id(BTRFS_I(inode)->root); root = btrfs_get_fs_root(fs_info, key.objectid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -2273,7 +2206,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, return PTR_ERR(rootrefs); } - objectid = root->root_key.objectid; + objectid = btrfs_root_id(root); key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; @@ -2346,9 +2279,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2357,7 +2290,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; - int err = 0; + int ret = 0; bool destroy_parent = false; /* We don't support snapshots with extent tree v2 yet. */ @@ -2373,7 +2306,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, return PTR_ERR(vol_args2); if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto out; } @@ -2382,29 +2315,31 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { - vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); + if (ret < 0) + goto out; subvol_name = vol_args2->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } else { struct inode *old_dir; if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out; } - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, vol_args2->subvolid, 0); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_drop_write; } @@ -2424,7 +2359,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ dput(dentry); if (IS_ERR(parent)) { - err = PTR_ERR(parent); + ret = PTR_ERR(parent); goto out_drop_write; } old_dir = dir; @@ -2448,14 +2383,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * to delete without an idmapped mount. */ if (old_dir != dir && idmap != &nop_mnt_idmap) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto free_parent; } subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { - err = PTR_ERR(subvol_name_ptr); + ret = PTR_ERR(subvol_name_ptr); goto free_parent; } /* subvol_name_ptr is already nul terminated */ @@ -2466,11 +2401,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; + subvol_name = vol_args->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } @@ -2478,26 +2416,26 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (strchr(subvol_name, '/') || strncmp(subvol_name, "..", subvol_namelen) == 0) { - err = -EINVAL; + ret = -EINVAL; goto free_subvol_name; } if (!S_ISDIR(dir->i_mode)) { - err = -ENOTDIR; + ret = -ENOTDIR; goto free_subvol_name; } - err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (err == -EINTR) + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) goto free_subvol_name; dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_unlock_dir; } if (d_really_is_negative(dentry)) { - err = -ENOENT; + ret = -ENOENT; goto out_dput; } @@ -2517,7 +2455,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * Users who want to delete empty subvols should try * rmdir(2). */ - err = -EPERM; + ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; @@ -2528,29 +2466,29 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * of the subvol, not a random directory contained * within it. */ - err = -EINVAL; + ret = -EINVAL; if (root == dest) goto out_dput; - err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); - if (err) + ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); + if (ret) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(idmap, dir, dentry, 1); - if (err) + ret = btrfs_may_delete(idmap, dir, dentry, 1); + if (ret) goto out_dput; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out_dput; } btrfs_inode_lock(BTRFS_I(inode), 0); - err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); + ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); - if (!err) + if (!ret) d_delete_notify(dir, dentry); out_dput: @@ -2567,7 +2505,7 @@ out_drop_write: out: kfree(vol_args2); kfree(vol_args); - return err; + return ret; } static int btrfs_ioctl_defrag(struct file *file, void __user *argp) @@ -2606,6 +2544,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* + * Don't allow defrag on pre-content watched files, as it could + * populate the page cache with 0's via readahead. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + ret = -EINVAL; + goto out; + } + if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; @@ -2677,12 +2624,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) goto out; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); +out_free: kfree(vol_args); out: if (restore_op) @@ -2696,9 +2647,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2714,7 +2665,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto out; + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { @@ -2735,7 +2689,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); @@ -2749,8 +2703,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2761,9 +2715,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2774,7 +2728,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { @@ -2790,17 +2747,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); +out_free: kfree(vol_args); return ret; } @@ -2904,7 +2862,7 @@ out: static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -2936,7 +2894,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->root_key.objectid)) { + if (!is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } @@ -2967,7 +2925,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -3178,7 +3135,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); struct btrfs_ioctl_scrub_args *sa; int ret; @@ -3696,7 +3653,7 @@ out: static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_ctl_args *sa; int ret; @@ -3713,15 +3670,43 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) goto drop_write; } - down_write(&fs_info->subvol_sem); - switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + down_write(&fs_info->subvol_sem); ret = btrfs_quota_enable(fs_info, sa); + up_write(&fs_info->subvol_sem); break; case BTRFS_QUOTA_CTL_DISABLE: + /* + * Lock the cleaner mutex to prevent races with concurrent + * relocation, because relocation may be building backrefs for + * blocks of the quota root while we are deleting the root. This + * is like dropping fs roots of deleted snapshots/subvolumes, we + * need the same protection. + * + * This also prevents races between concurrent tasks trying to + * disable quotas, because we will unlock and relock + * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * + * We take this here because we have the dependency of + * + * inode_lock -> subvol_sem + * + * because of rename. With relocation we can prealloc extents, + * so that makes the dependency chain + * + * cleaner_mutex -> inode_lock -> subvol_sem + * + * so we must take the cleaner_mutex here before we take the + * subvol_sem. The deadlock can't actually happen, but this + * quiets lockdep. + */ + mutex_lock(&fs_info->cleaner_mutex); + down_write(&fs_info->subvol_sem); ret = btrfs_quota_disable(fs_info); + up_write(&fs_info->subvol_sem); + mutex_unlock(&fs_info->cleaner_mutex); break; default: ret = -EINVAL; @@ -3729,18 +3714,34 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) } kfree(sa); - up_write(&fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; } +/* + * Quick check for ioctl handlers if quotas are enabled. Proper locking must be + * done before any operations. + */ +static bool qgroup_enabled(struct btrfs_fs_info *fs_info) +{ + bool ret = true; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + ret = false; + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + return ret; +} + static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_qgroup_list *prealloc = NULL; struct btrfs_trans_handle *trans; int ret; int err; @@ -3748,6 +3749,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3758,14 +3762,27 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto drop_write; } + if (sa->assign) { + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { + ret = -ENOMEM; + goto drop_write; + } + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } + /* + * Prealloc ownership is moved to the relation handler, there it's used + * or freed on error. + */ if (sa->assign) { - ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); + ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc); + prealloc = NULL; } else { ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); } @@ -3775,13 +3792,15 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) err = btrfs_run_qgroups(trans); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) - btrfs_handle_fs_error(fs_info, err, - "failed to update qgroup status and info"); + btrfs_warn(fs_info, + "qgroup status update failed after %s relation, marked as inconsistent", + sa->assign ? "adding" : "deleting"); err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: + kfree(prealloc); kfree(sa); drop_write: mnt_drop_write_file(file); @@ -3800,6 +3819,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3856,6 +3878,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3875,7 +3900,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = sa->qgroupid; if (!qgroupid) { /* take the current subvol as qgroup */ - qgroupid = root->root_key.objectid; + qgroupid = btrfs_root_id(root); } ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); @@ -3894,13 +3919,16 @@ drop_write: static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3944,8 +3972,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, return 0; } -static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, - void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3958,7 +3985,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -4006,7 +4033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, !btrfs_is_empty_uuid(root_item->received_uuid)) { ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4030,7 +4057,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4146,7 +4173,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; @@ -4289,7 +4316,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; @@ -4357,7 +4384,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4400,12 +4427,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; + u64 disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4458,7 +4490,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; - ret = btrfs_encoded_read(&kiocb, &iter, &args); + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + + if (ret == -EIOCBQUEUED) { + bool unlocked = false; + u64 start, lockend, count; + + start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + if (args.compression) + count = disk_io_size; + else + count = args.len; + + ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + args.compression, &unlocked); + + if (!unlocked) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + } + if (ret >= 0) { fsnotify_access(file); if (copy_to_user(argp + copy_end, @@ -4555,7 +4612,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_iov; init_sync_kiocb(&kiocb, file); - ret = kiocb_set_rw_flags(&kiocb, 0); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; @@ -4576,11 +4633,590 @@ out_acct: return ret; } +/* + * Context that's attached to an encoded read io_uring command, in cmd->pdu. It + * contains the fields in btrfs_uring_read_extent that are necessary to finish + * off and cleanup the I/O in btrfs_uring_read_finished. + */ +struct btrfs_uring_priv { + struct io_uring_cmd *cmd; + struct page **pages; + unsigned long nr_pages; + struct kiocb iocb; + struct iovec *iov; + struct iov_iter iter; + struct extent_state *cached_state; + u64 count; + u64 start; + u64 lockend; + int err; + bool compressed; +}; + +struct io_btrfs_cmd { + struct btrfs_uring_priv *priv; +}; + +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_priv *priv = bc->priv; + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + unsigned long index; + u64 cur; + size_t page_offset; + ssize_t ret; + + /* The inode lock has already been acquired in btrfs_uring_read_extent. */ + btrfs_lockdep_inode_acquire(inode, i_rwsem); + + if (priv->err) { + ret = priv->err; + goto out; + } + + if (priv->compressed) { + index = 0; + page_offset = 0; + } else { + index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); + } + cur = 0; + while (cur < priv->count) { + size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); + + if (copy_page_to_iter(priv->pages[index], page_offset, bytes, + &priv->iter) != bytes) { + ret = -EFAULT; + goto out; + } + + index++; + cur += bytes; + page_offset = 0; + } + ret = priv->count; + +out: + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + add_rchar(current, ret); + + for (index = 0; index < priv->nr_pages; index++) + __free_page(priv->pages[index]); + + kfree(priv->pages); + kfree(priv->iov); + kfree(priv); +} + +void btrfs_uring_read_extent_endio(void *ctx, int err) +{ + struct btrfs_uring_priv *priv = ctx; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); + + priv->err = err; + bc->priv = priv; + + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); +} + +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state *cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + struct iovec *iov, struct io_uring_cmd *cmd) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + struct btrfs_uring_priv *priv = NULL; + unsigned long nr_pages; + int ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto out_fail; + } + + priv = kmalloc(sizeof(*priv), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto out_fail; + } + + priv->iocb = *iocb; + priv->iov = iov; + priv->iter = *iter; + priv->count = count; + priv->cmd = cmd; + priv->cached_state = cached_state; + priv->compressed = compressed; + priv->nr_pages = nr_pages; + priv->pages = pages; + priv->start = start; + priv->lockend = lockend; + priv->err = 0; + + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, priv); + if (ret && ret != -EIOCBQUEUED) + goto out_fail; + + /* + * If we return -EIOCBQUEUED, we're deferring the cleanup to + * btrfs_uring_read_finished(), which will handle unlocking the extent + * and inode and freeing the allocations. + */ + + /* + * We're returning to userspace with the inode lock held, and that's + * okay - it'll get unlocked in a worker thread. Call + * btrfs_lockdep_inode_release() to avoid confusing lockdep. + */ + btrfs_lockdep_inode_release(inode, i_rwsem); + + return -EIOCBQUEUED; + +out_fail: + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + kfree(priv); + return ret; +} + +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); + size_t copy_end; + int ret; + u64 disk_bytenr, disk_io_size; + struct file *file; + struct btrfs_inode *inode; + struct btrfs_fs_info *fs_info; + struct extent_io_tree *io_tree; + loff_t pos; + struct kiocb kiocb; + struct extent_state *cached_state = NULL; + u64 start, lockend; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + file = cmd->file; + inode = BTRFS_I(file->f_inode); + fs_info = inode->root->fs_info; + io_tree = &inode->io_tree; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } + } + + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); + if (ret < 0) + goto out_free; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + if (issue_flags & IO_URING_F_NONBLOCK) + kiocb.ki_flags |= IOCB_NOWAIT; + + start = ALIGN_DOWN(pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, + &disk_bytenr, &disk_io_size); + if (ret < 0 && ret != -EIOCBQUEUED) + goto out_free; + + file_accessed(file); + + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { + if (ret == -EIOCBQUEUED) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + ret = -EFAULT; + goto out_free; + } + + if (ret == -EIOCBQUEUED) { + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); + + /* Match ioctl by not returning past EOF if uncompressed. */ + if (!data->args.compression) + count = min_t(u64, count, data->args.len); + + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); + + goto out_acct; + } + +out_free: + kfree(data->iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + return ret; +} + +static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + loff_t pos; + struct kiocb kiocb; + struct file *file; + ssize_t ret; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + file = cmd->file; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; + data->args.len = args32.len; + data->args.unencoded_len = args32.unencoded_len; + data->args.unencoded_offset = args32.unencoded_offset; + data->args.compression = args32.compression; + data->args.encryption = args32.encryption; + memcpy(data->args.reserved, args32.reserved, + sizeof(data->args.reserved)); +#else + ret = -ENOTTY; + goto out_acct; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (data->args.flags != 0) + goto out_acct; + if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) + goto out_acct; + if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (data->args.unencoded_offset > data->args.unencoded_len) + goto out_acct; + if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) + goto out_acct; + + data->iov = data->iovstack; + ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_iov; + } + } + + if (issue_flags & IO_URING_F_NONBLOCK) { + ret = -EAGAIN; + goto out_acct; + } + + pos = data->args.offset; + ret = rw_verify_area(WRITE, file, &pos, data->args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); + if (ret) + goto out_iov; + kiocb.ki_pos = pos; + + file_start_write(file); + + ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); + if (ret > 0) + fsnotify_modify(file); + + file_end_write(file); +out_iov: + kfree(data->iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + return btrfs_uring_encoded_read(cmd, issue_flags); + + case BTRFS_IOC_ENCODED_WRITE: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_WRITE_32: +#endif + return btrfs_uring_encoded_write(cmd, issue_flags); + } + + return -EINVAL; +} + +static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) +{ + struct btrfs_root *root; + struct btrfs_ioctl_subvol_wait args = { 0 }; + signed long sched_ret; + int refs; + u64 root_flags; + bool wait_for_deletion = false; + bool found = false; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + switch (args.mode) { + case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: + /* + * Wait for the first one deleted that waits until all previous + * are cleaned. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + found = true; + } + spin_unlock(&fs_info->trans_lock); + if (!found) + return -ENOENT; + + fallthrough; + case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: + if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || + BTRFS_LAST_FREE_OBJECTID < args.subvolid) + return -EINVAL; + break; + case BTRFS_SUBVOL_SYNC_COUNT: + spin_lock(&fs_info->trans_lock); + args.count = list_count_nodes(&fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_FIRST: + spin_lock(&fs_info->trans_lock); + /* Last in the list was deleted first. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_LAST: + spin_lock(&fs_info->trans_lock); + /* First in the list was deleted last. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } + + /* 32bit limitation: fs_roots_radix key is not wide enough. */ + if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) + return -EOVERFLOW; + + while (1) { + /* Wait for the specific one. */ + if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) + return -EINTR; + refs = -1; + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)args.subvolid); + if (root) { + spin_lock(&root->root_item_lock); + refs = btrfs_root_refs(&root->root_item); + root_flags = btrfs_root_flags(&root->root_item); + spin_unlock(&root->root_item_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + up_read(&fs_info->subvol_sem); + + /* Subvolume does not exist. */ + if (!root) + return -ENOENT; + + /* Subvolume not deleted at all. */ + if (refs > 0) + return -EEXIST; + /* We've waited and now the subvolume is gone. */ + if (wait_for_deletion && refs == -1) { + /* Return the one we waited for as the last one. */ + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + } + + /* Subvolume not found on the first try (deleted or never existed). */ + if (refs == -1) + return -ENOENT; + + wait_for_deletion = true; + ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); + sched_ret = schedule_timeout_interruptible(HZ); + /* Early wake up or error. */ + if (sched_ret != 0) + return -EINTR; + } + + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; @@ -4649,11 +5285,10 @@ long btrfs_ioctl(struct file *file, unsigned int return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* - * The transaction thread may want to do more work, - * namely it pokes the cleaner kthread that will start - * processing uncleaned subvols. + * There may be work for the cleaner kthread to do (subvolume + * deletion, delayed iputs, defrag inodes, etc), so wake it up. */ - wake_up_process(fs_info->transaction_kthread); + wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: @@ -4679,10 +5314,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(inode, argp, false); + return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(inode, argp, true); + return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -4699,7 +5334,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: @@ -4718,6 +5353,8 @@ long btrfs_ioctl(struct file *file, unsigned int return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case FS_IOC_READ_VERITY_METADATA: + return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: @@ -4728,6 +5365,8 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif + case BTRFS_IOC_SUBVOL_SYNC_WAIT: + return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index d51b9a2f2f6e..ce915fcda43b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -3,6 +3,15 @@ #ifndef BTRFS_IOCTL_H #define BTRFS_IOCTL_H +#include <linux/types.h> + +struct file; +struct dentry; +struct mnt_idmap; +struct fileattr; +struct btrfs_fs_info; +struct btrfs_ioctl_balance_args; + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); @@ -10,8 +19,9 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void btrfs_uring_read_extent_endio(void *ctx, int err); #endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 74d8e2003f58..9a7a7b723305 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -13,7 +13,6 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" -#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given @@ -85,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int { struct btrfs_lockdep_keyset *ks; - BUG_ON(level >= ARRAY_SIZE(ks->keys)); + ASSERT(level < ARRAY_SIZE(ks->keys)); /* Find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++) @@ -98,7 +97,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) { if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) - btrfs_set_buffer_lockdep_class(root->root_key.objectid, + btrfs_set_buffer_lockdep_class(btrfs_root_id(root), eb, btrfs_header_level(eb)); } @@ -130,14 +129,14 @@ static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { } */ /* - * __btrfs_tree_read_lock - lock extent buffer for read + * btrfs_tree_read_lock_nested - lock extent buffer for read * @eb: the eb to be locked * @nest: the nesting level to be used for lockdep * * This takes the read lock on the extent buffer, using the specified nesting * level for lockdep purposes. */ -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) +void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest) { u64 start_ns = 0; @@ -148,11 +147,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne trace_btrfs_tree_read_lock(eb, start_ns); } -void btrfs_tree_read_lock(struct extent_buffer *eb) -{ - __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL); -} - /* * Try-lock for read. * @@ -168,21 +162,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) } /* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - -/* * Release read lock. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) @@ -199,7 +178,7 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) * * Returns with the eb->lock write locked. */ -void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) +void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -212,11 +191,6 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) trace_btrfs_tree_lock(eb, start_ns); } -void btrfs_tree_lock(struct extent_buffer *eb) -{ - __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); -} - /* * Release the write lock. */ @@ -375,8 +349,12 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) { - atomic_dec(&lock->writers); - cond_wake_up(&lock->pending_readers); + /* + * atomic_dec_and_test() implies a full barrier, so woken up readers are + * guaranteed to see the decrement. + */ + if (atomic_dec_and_test(&lock->writers)) + wake_up(&lock->pending_readers); } void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7d6ee1e609bf..c69e57ff804b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -8,9 +8,14 @@ #include <linux/atomic.h> #include <linux/wait.h> +#include <linux/lockdep.h> #include <linux/percpu_counter.h> #include "extent_io.h" +struct extent_buffer; +struct btrfs_path; +struct btrfs_root; + #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 @@ -124,6 +129,16 @@ enum btrfs_lockdep_trans_states { rwsem_release(&owner->lock##_map, _THIS_IP_) /* + * Used to account for the fact that when doing io_uring encoded I/O, we can + * return to userspace with the inode lock still held. + */ +#define btrfs_lockdep_inode_acquire(owner, lock) \ + rwsem_acquire_read(&owner->vfs_inode.lock.dep_map, 0, 0, _THIS_IP_) + +#define btrfs_lockdep_inode_release(owner, lock) \ + rwsem_release(&owner->vfs_inode.lock.dep_map, _THIS_IP_) + +/* * Macros for the transaction states wait events, similar to the generic wait * event macros. */ @@ -157,17 +172,24 @@ enum btrfs_lockdep_trans_states { static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -struct btrfs_path; +void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); + +static inline void btrfs_tree_lock(struct extent_buffer *eb) +{ + btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL); +} -void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); -void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); -void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); + +static inline void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL); +} + void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); @@ -177,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { lockdep_assert_held_write(&eb->lock); } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_read(&eb->lock); +} #else static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index 00328c856be6..07f1bb1c6aa3 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -3,6 +3,7 @@ #ifndef BTRFS_LRU_CACHE_H #define BTRFS_LRU_CACHE_H +#include <linux/types.h> #include <linux/maple_tree.h> #include <linux/list.h> @@ -50,11 +51,6 @@ struct btrfs_lru_cache { #define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) -static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) -{ - return cache->size; -} - static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( struct btrfs_lru_cache *cache) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index e43bc0fdc74e..a45bc11f8665 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; @@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf) */ static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, - struct page **out_pages, - unsigned long max_nr_page, + struct folio **out_folios, + unsigned long max_nr_folio, u32 *cur_out, const u32 sectorsize) { u32 sector_bytes_left; u32 orig_out; - struct page *cur_page; + struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; /* @@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -209,15 +209,15 @@ out: return 0; } -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; - struct page *page_in = NULL; + const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; + struct folio *folio_in = NULL; char *sizes_ptr; - const unsigned long max_nr_page = *out_pages; + const unsigned long max_nr_folio = *out_folios; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -225,8 +225,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; - ASSERT(max_nr_page > 0); - *out_pages = 0; + ASSERT(max_nr_folio > 0); + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -243,28 +243,29 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, size_t out_len; /* Get the input page first */ - if (!page_in) { - page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); - ASSERT(page_in); + if (!folio_in) { + ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); + if (ret < 0) + goto out; } /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_page(page_in); + data_in = kmap_local_folio(folio_in, 0); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); - if (ret < 0) { - pr_debug("BTRFS: lzo in loop returned %d\n", ret); + if (unlikely(ret < 0)) { + /* lzo1x_1_compress never fails. */ ret = -EIO; goto out; } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, max_nr_page, + folios, max_nr_folio, &cur_out, sectorsize); if (ret < 0) goto out; @@ -282,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we have reached page boundary */ if (PAGE_ALIGNED(cur_in)) { - put_page(page_in); - page_in = NULL; + folio_put(folio_in); + folio_in = NULL; } } /* Store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_page(pages[0]); + sizes_ptr = kmap_local_folio(folios[0], 0); write_compress_length(sizes_ptr, cur_out); kunmap_local(sizes_ptr); @@ -296,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = cur_out; *total_in = cur_in - start; out: - if (page_in) - put_page(page_in); - *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); + if (folio_in) + folio_put(folio_in); + *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -313,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct page *cur_page; + struct folio *cur_folio; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); ASSERT(copy_len); - cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; - memcpy_from_page(dest + *cur_in - orig_in, cur_page, - offset_in_page(*cur_in), copy_len); + memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, + offset_in_folio(cur_folio, *cur_in), copy_len); *cur_in += copy_len; } @@ -341,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap_local_page(cb->compressed_pages[0]); + kaddr = kmap_local_folio(cb->compressed_folios[0], 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -353,17 +354,20 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) * and all sectors should be used. * If this happens, it means the compressed extent is corrupted. */ - if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || - round_up(len_in, sectorsize) < cb->compressed_len) { + if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || + round_up(len_in, sectorsize) < cb->compressed_len)) { + struct btrfs_inode *inode = cb->bbio.inode; + btrfs_err(fs_info, - "invalid lzo header, lzo len %u compressed len %u", - len_in, cb->compressed_len); +"lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u", + btrfs_root_id(inode->root), btrfs_ino(inode), + cb->start, len_in, cb->compressed_len); return -EUCLEAN; } /* Go through each lzo segment */ while (cur_in < len_in) { - struct page *cur_page; + struct folio *cur_folio; /* Length of the compressed segment */ u32 seg_len; u32 sector_bytes_left; @@ -375,20 +379,24 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; - ASSERT(cur_page); - kaddr = kmap_local_page(cur_page); + cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + ASSERT(cur_folio); + kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; - if (seg_len > WORKSPACE_CBUF_LENGTH) { + if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) { + struct btrfs_inode *inode = cb->bbio.inode; + /* * seg_len shouldn't be larger than we have allocated * for workspace->cbuf */ - btrfs_err(fs_info, "unexpectedly large lzo segment len %u", - seg_len); + btrfs_err(fs_info, + "lzo segment too big, root %llu inode %llu offset %llu len %u", + btrfs_root_id(inode->root), btrfs_ino(inode), + cb->start, seg_len); return -EIO; } @@ -398,8 +406,13 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Decompress the data */ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, workspace->buf, &out_len); - if (ret != LZO_E_OK) { - btrfs_err(fs_info, "failed to decompress"); + if (unlikely(ret != LZO_E_OK)) { + struct btrfs_inode *inode = cb->bbio.inode; + + btrfs_err(fs_info, + "lzo decompression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + cb->start); return -EIO; } @@ -425,11 +438,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; @@ -453,18 +466,23 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); - if (ret != LZO_E_OK) { - pr_warn("BTRFS: decompress failed!\n"); + if (unlikely(ret != LZO_E_OK)) { + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(fs_info, + "lzo decompression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(dest_folio)); ret = -EIO; goto out; } ASSERT(out_len <= sectorsize); - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); + folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); } out: return ret; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index cdada4865837..363fd28c0268 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -3,13 +3,11 @@ #include "fs.h" #include "messages.h" #include "discard.h" -#include "transaction.h" -#include "space-info.h" #include "super.h" #ifdef CONFIG_PRINTK -#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_PREFACE " state " #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* @@ -22,7 +20,8 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', [BTRFS_FS_STATE_DEV_REPLACING] = 'R', [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, - [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', + [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S', [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', }; @@ -240,7 +239,8 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, vaf.fmt = fmt; vaf.va = &args; - if (__ratelimit(ratelimit)) { + /* Do not ratelimit if CONFIG_BTRFS_DEBUG is enabled. */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG) || __ratelimit(ratelimit)) { if (fs_info) { char statestr[STATE_STRING_BUF_LEN]; diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 40f2d9f1a17a..0d599fd847c9 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -3,6 +3,8 @@ #ifndef BTRFS_MISC_H #define BTRFS_MISC_H +#include <linux/types.h> +#include <linux/bitmap.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/math64.h> @@ -64,7 +66,7 @@ struct rb_simple_node { u64 bytenr; }; -static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) +static inline struct rb_node *rb_simple_search(const struct rb_root *root, u64 bytenr) { struct rb_node *node = root->rb_node; struct rb_simple_node *entry; @@ -91,7 +93,7 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) * Return the rb_node that start at or after @bytenr. If there is no entry at * or after @bytner return NULL. */ -static inline struct rb_node *rb_simple_search_first(struct rb_root *root, +static inline struct rb_node *rb_simple_search_first(const struct rb_root *root, u64 bytenr) { struct rb_node *node = root->rb_node, *ret = NULL; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 59850dc17b22..4aca7475fd82 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,7 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" +#include "block-group.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -111,8 +111,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } -static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, - u64 len) +static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) { if (file_offset + len <= entry->file_offset || entry->file_offset + entry->num_bytes <= file_offset) @@ -180,7 +180,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->disk_num_bytes = disk_num_bytes; entry->offset = offset; entry->bytes_left = num_bytes; - entry->inode = igrab(&inode->vfs_inode); + entry->inode = BTRFS_I(igrab(&inode->vfs_inode)); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = qgroup_rsv; @@ -208,7 +208,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( static void insert_ordered_extent(struct btrfs_ordered_extent *entry) { - struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_inode *inode = entry->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -224,7 +224,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) spin_lock_irq(&inode->ordered_tree_lock); node = tree_insert(&inode->ordered_tree, entry->file_offset, &entry->rb_node); - if (node) + if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); @@ -264,17 +264,39 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) */ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) + const struct btrfs_file_extent *file_extent, unsigned long flags) { struct btrfs_ordered_extent *entry; ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); - entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes, - disk_bytenr, disk_num_bytes, offset, flags, - compress_type); + /* + * For regular writes, we just use the members in @file_extent. + * + * For NOCOW, we don't really care about the numbers except @start and + * file_extent->num_bytes, as we won't insert a file extent item at all. + * + * For PREALLOC, we do not use ordered extent members, but + * btrfs_mark_extent_written() handles everything. + * + * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents, + * or btrfs_split_ordered_extent() cannot handle it correctly. + */ + if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))) + entry = alloc_ordered_extent(inode, file_offset, + file_extent->num_bytes, + file_extent->num_bytes, + file_extent->disk_bytenr + file_extent->offset, + file_extent->num_bytes, 0, flags, + file_extent->compression); + else + entry = alloc_ordered_extent(inode, file_offset, + file_extent->num_bytes, + file_extent->ram_bytes, + file_extent->disk_bytenr, + file_extent->disk_num_bytes, + file_extent->offset, flags, + file_extent->compression); if (!IS_ERR(entry)) insert_ordered_extent(entry); return entry; @@ -288,13 +310,19 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { - struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_inode *inode = entry->inode; spin_lock_irq(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); spin_unlock_irq(&inode->ordered_tree_lock); } +void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) +{ + if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO); +} + static void finish_ordered_fn(struct btrfs_work *work) { struct btrfs_ordered_extent *ordered_extent; @@ -304,36 +332,35 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; lockdep_assert_held(&inode->ordered_tree_lock); - if (page) { - ASSERT(page->mapping); - ASSERT(page_offset(page) <= file_offset); - ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + if (folio) { + ASSERT(folio->mapping); + ASSERT(folio_pos(folio) <= file_offset); + ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* - * Ordered (Private2) bit indicates whether we still have + * Ordered flag indicates whether we still have * pending io unfinished for the ordered extent. * - * If there's no such bit, we need to skip to next range. + * If it's not set, we need to skip to next range. */ - if (!btrfs_folio_test_ordered(fs_info, page_folio(page), - file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; - btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); + btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); } /* Now we're fine to update the accounting. */ if (WARN_ON_ONCE(len > ordered->bytes_left)) { btrfs_crit(fs_info, "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", - inode->root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(inode->root), btrfs_ino(inode), ordered->file_offset, ordered->num_bytes, len, ordered->bytes_left); ordered->bytes_left = 0; @@ -360,7 +387,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? fs_info->endio_freespace_worker : fs_info->endio_write_workers; @@ -369,30 +396,61 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) btrfs_queue_work(wq, &ordered->work); } -bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, +void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; unsigned long flags; bool ret; trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); spin_lock_irqsave(&inode->ordered_tree_lock, flags); - ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + ret = can_finish_ordered_extent(ordered, folio, file_offset, len, + uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + /* + * If this is a COW write it means we created new extent maps for the + * range and they point to unwritten locations if we got an error either + * before submitting a bio or during IO. + * + * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we + * are queuing its completion below. During completion, at + * btrfs_finish_one_ordered(), we will drop the extent maps for the + * unwritten extents. + * + * However because completion runs in a work queue we can end up having + * a fast fsync running before that. In the case of direct IO, once we + * unlock the inode the fsync might start, and we queue the completion + * before unlocking the inode. In the case of buffered IO when writeback + * finishes (end_bbio_data_write()) we queue the completion, so if the + * writeback was triggered by a fast fsync, the fsync might start + * logging before ordered extent completion runs in the work queue. + * + * The fast fsync will log file extent items based on the extent maps it + * finds, so if by the time it collects extent maps the ordered extent + * completion didn't happen yet, it will log file extent items that + * point to unwritten extents, resulting in a corruption if a crash + * happens and the log tree is replayed. Note that a fast fsync does not + * wait for completion of ordered extents in order to reduce latency. + * + * Set a flag in the inode so that the next fast fsync will wait for + * ordered extents to complete before starting to log. + */ + if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); + if (ret) btrfs_queue_ordered_fn(ordered); - return ret; } /* * Mark all ordered extents io inside the specified range finished. * - * @page: The involved page for the operation. - * For uncompressed buffered IO, the page status also needs to be + * @folio: The involved folio for the operation. + * For uncompressed buffered IO, the folio status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the @@ -402,7 +460,7 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 num_bytes, bool uptodate) { struct rb_node *node; @@ -466,7 +524,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { + if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&inode->ordered_tree_lock, flags); @@ -552,14 +610,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); + trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(BTRFS_I(entry->inode)); + btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -590,7 +648,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, freespace_inode = btrfs_is_free_space_inode(btrfs_inode); btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); - /* This is paired with btrfs_alloc_ordered_extent. */ + /* This is paired with alloc_ordered_extent(). */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); @@ -676,11 +734,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) } /* - * wait for all the ordered extents in a root. This is done when balancing - * space between drives. + * Wait for all the ordered extents in a root. Use @bg as range or do whole + * range if it's NULL. */ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, - const u64 range_start, const u64 range_len) + const struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(splice); @@ -688,7 +746,17 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; u64 count = 0; - const u64 range_end = range_start + range_len; + u64 range_start, range_len; + u64 range_end; + + if (bg) { + range_start = bg->start; + range_len = bg->length; + } else { + range_start = 0; + range_len = U64_MAX; + } + range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -715,10 +783,10 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); cond_resched(); - spin_lock(&root->ordered_extent_lock); if (nr != U64_MAX) nr--; count++; + spin_lock(&root->ordered_extent_lock); } list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); @@ -735,8 +803,12 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, return count; } +/* + * Wait for @nr ordered extents that intersect the @bg, or the whole range of + * the filesystem if @bg is NULL. + */ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, - const u64 range_start, const u64 range_len) + const struct btrfs_block_group *bg) { struct btrfs_root *root; LIST_HEAD(splice); @@ -754,14 +826,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr, - range_start, range_len); + done = btrfs_wait_ordered_extents(root, nr, bg); btrfs_put_root(root); - spin_lock(&fs_info->ordered_root_lock); - if (nr != U64_MAX) { + if (nr != U64_MAX) nr -= done; - } + + spin_lock(&fs_info->ordered_root_lock); } list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); @@ -778,7 +849,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; - struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_inode *inode = entry->inode; bool freespace_inode; trace_btrfs_ordered_extent_start(inode, entry); @@ -805,7 +876,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) /* * Used to wait on ordered extents across a large range of bytes. */ -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len) { int ret = 0; int ret_wb = 0; @@ -835,11 +906,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) * before the ordered extents complete - to avoid failures (-EEXIST) * when adding the new ordered extents to the ordered tree. */ - ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end); end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); + ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -914,7 +985,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( while (1) { entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (range_overlaps(entry, file_offset, len)) + if (btrfs_range_overlaps(entry, file_offset, len)) break; if (entry->file_offset >= file_offset + len) { @@ -944,7 +1015,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, { struct rb_node *n; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); spin_lock_irq(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { @@ -1043,12 +1114,12 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( } if (prev) { entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); - if (range_overlaps(entry, file_offset, len)) + if (btrfs_range_overlaps(entry, file_offset, len)) goto out; } if (next) { entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); - if (range_overlaps(entry, file_offset, len)) + if (btrfs_range_overlaps(entry, file_offset, len)) goto out; } /* No ordered extent in the range */ @@ -1137,7 +1208,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; @@ -1158,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); @@ -1176,19 +1259,37 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( /* One ref for the tree. */ refcount_inc(&new->refs); + /* + * Take the root's ordered_extent_lock to avoid a race with + * btrfs_wait_ordered_extents() when updating the disk_bytenr and + * disk_num_bytes fields of the ordered extent below. And we disable + * IRQs because the inode's ordered_tree_lock is used in IRQ context + * elsewhere. + * + * There's no concern about a previous caller of + * btrfs_wait_ordered_extents() getting the trimmed ordered extent + * before we insert the new one, because even if it gets the ordered + * extent before it's trimmed and the new one inserted, right before it + * uses it or during its use, the ordered extent might have been + * trimmed in the meanwhile, and it missed the new ordered extent. + * There's no way around this and it's harmless for current use cases, + * so we take the root's ordered_extent_lock to fix that race during + * trimming and silence tools like KCSAN. + */ spin_lock_irq(&root->ordered_extent_lock); spin_lock(&inode->ordered_tree_lock); - /* Remove from tree once */ - node = &ordered->rb_node; - rb_erase(node, &inode->ordered_tree); - RB_CLEAR_NODE(node); - if (inode->ordered_tree_last == node) - inode->ordered_tree_last = NULL; + /* + * We don't have overlapping ordered extents (that would imply double + * allocation of extents) and we checked above that the split length + * does not cross the ordered extent's num_bytes field, so there's + * no need to remove it and re-insert it in the tree. + */ ordered->file_offset += len; ordered->disk_bytenr += len; ordered->num_bytes -= len; ordered->disk_num_bytes -= len; + ordered->ram_bytes -= len; if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { ASSERT(ordered->bytes_left == 0); @@ -1213,18 +1314,10 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( offset += sum->len; } - /* Re-insert the node */ - node = tree_insert(&inode->ordered_tree, ordered->file_offset, - &ordered->rb_node); - if (node) - btrfs_panic(fs_info, -EEXIST, - "zoned: inconsistency in ordered tree at offset %llu", - ordered->file_offset); - node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node); - if (node) + if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, - "zoned: inconsistency in ordered tree at offset %llu", + "inconsistency in ordered tree at offset %llu after split", new->file_offset); spin_unlock(&inode->ordered_tree_lock); @@ -1236,10 +1329,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( int __init ordered_data_init(void) { - btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", - sizeof(struct btrfs_ordered_extent), 0, - SLAB_MEM_SPREAD, - NULL); + btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 127ef8bf0ffd..4e152736d06c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,6 +6,21 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H +#include <linux/types.h> +#include <linux/list.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <linux/wait.h> +#include "async-thread.h" + +struct inode; +struct page; +struct extent_state; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_fs_info; + struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by @@ -115,7 +130,7 @@ struct btrfs_ordered_extent { refcount_t refs; /* the inode we belong to */ - struct inode *inode; + struct btrfs_inode *inode; /* list of checksums for insertion when the extent io is done */ struct list_head list; @@ -147,26 +162,37 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, +void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct folio *folio, u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, bool uptodate); + struct folio *folio, u64 file_offset, + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); + +/* + * This represents details about the target file extent item of a write operation. + */ +struct btrfs_file_extent { + u64 disk_bytenr; + u64 disk_num_bytes; + u64 num_bytes; + u64 ram_bytes; + u64 offset; + u8 compression; +}; + struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type); + const struct btrfs_file_extent *file_extent, unsigned long flags); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( @@ -178,9 +204,9 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, - const u64 range_start, const u64 range_len); + const struct btrfs_block_group *bg); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, - const u64 range_start, const u64 range_len); + const struct btrfs_block_group *bg); void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); @@ -188,6 +214,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len); +void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 7a1b021b5669..9f3ad124104f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -4,15 +4,13 @@ */ #include "ctree.h" -#include "disk-io.h" #include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; - int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -22,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_free_path(path); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, 0); } int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -45,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - if (ret) { /* JDM: Really? */ - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); + return ret; + if (ret) + return -ENOENT; -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h index 3faab5cbb59a..aa54a88a60de 100644 --- a/fs/btrfs/orphan.h +++ b/fs/btrfs/orphan.h @@ -3,6 +3,11 @@ #ifndef BTRFS_ORPHAN_H #define BTRFS_ORPHAN_H +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_root; + int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 7e46aa8a0444..fc821aa446f0 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -14,7 +14,7 @@ struct root_name_map { u64 id; - char name[16]; + const char *name; }; static const struct root_name_map root_map[] = { @@ -109,7 +109,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type btrfs_err(eb->fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); - btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL); + return; } ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); @@ -208,11 +208,6 @@ static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size, struct btrfs_stripe_extent *stripe) { const int num_stripes = btrfs_num_raid_stripes(item_size); - const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe); - - pr_info("\t\t\tencoding: %s\n", - (encoding && encoding < BTRFS_NR_RAID_TYPES) ? - btrfs_raid_array[encoding].raid_name : "unknown"); for (int i = 0; i < num_stripes; i++) pr_info("\t\t\tstride %d devid %llu physical %llu\n", @@ -310,6 +305,9 @@ void btrfs_print_leaf(const struct extent_buffer *l) case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); + pr_info("\t\tgeneration %llu type %hhu\n", + btrfs_file_extent_generation(l, fi), + btrfs_file_extent_type(l, fi)); if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { pr_info("\t\tinline extent data size %llu\n", diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index c42bc666d5ee..8504bf1702c7 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,6 +9,9 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 +struct extent_buffer; +struct btrfs_key; + void btrfs_print_leaf(const struct extent_buffer *l); void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f9bf591a0718..b8fa34e16abb 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include <linux/hashtable.h> +#include <linux/xattr.h> #include "messages.h" #include "props.h" #include "btrfs_inode.h" @@ -26,7 +27,7 @@ struct prop_handler { int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(struct inode *inode); + const char *(*extract)(const struct inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -103,7 +104,7 @@ bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) return handler->ignore(inode); } -int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, const char *name, const char *value, size_t value_len, int flags) { @@ -115,29 +116,29 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, return -EINVAL; if (value_len == 0) { - ret = btrfs_setxattr(trans, inode, handler->xattr_name, + ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); if (ret) return ret; - ret = handler->apply(inode, NULL, 0); + ret = handler->apply(&inode->vfs_inode, NULL, 0); ASSERT(ret == 0); return ret; } - ret = btrfs_setxattr(trans, inode, handler->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, value, value_len, flags); if (ret) return ret; - ret = handler->apply(inode, value, value_len); + ret = handler->apply(&inode->vfs_inode, value, value_len); if (ret) { - btrfs_setxattr(trans, inode, handler->xattr_name, NULL, + btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); return ret; } - set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); return 0; } @@ -267,7 +268,7 @@ static void inode_prop_iterator(void *ctx, btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", handler->xattr_name, btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_root_id(root), ret); else set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } @@ -302,7 +303,7 @@ static int prop_compression_validate(const struct btrfs_inode *inode, static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int type; /* Reset to defaults */ @@ -358,7 +359,7 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(struct inode *inode) +static const char *prop_compression_extract(const struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { case BTRFS_COMPRESS_ZLIB: @@ -384,7 +385,7 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *parent) + struct inode *inode, const struct inode *parent) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 6e283196e38a..63546d0a9444 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,11 +6,16 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H -#include "ctree.h" +#include <linux/compiler_types.h> + +struct inode; +struct btrfs_inode; +struct btrfs_path; +struct btrfs_trans_handle; int __init btrfs_props_init(void); -int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, const char *name, const char *value, size_t value_len, int flags); int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, @@ -21,6 +26,6 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, - struct inode *dir); + const struct inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5470e1cdf10c..f9d3766c809b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,7 +30,7 @@ #include "root-tree.h" #include "tree-checker.h" -enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) +enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info) { if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return BTRFS_QGROUP_MODE_DISABLED; @@ -39,12 +39,12 @@ enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) return BTRFS_QGROUP_MODE_FULL; } -bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) +bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info) { return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; } -bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) +bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info) { return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; } @@ -107,7 +107,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, - struct btrfs_qgroup *src) + const struct btrfs_qgroup *src) { int i; @@ -117,7 +117,7 @@ static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, - struct btrfs_qgroup *src) + const struct btrfs_qgroup *src) { int i; @@ -141,37 +141,27 @@ static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, qg->new_refcnt += mod; } -static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq) { if (qg->old_refcnt < seq) return 0; return qg->old_refcnt - seq; } -static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq) { if (qg->new_refcnt < seq) return 0; return qg->new_refcnt - seq; } -/* - * glue structure to represent the relations between qgroups. - */ -struct btrfs_qgroup_list { - struct list_head next_group; - struct list_head next_member; - struct btrfs_qgroup *group; - struct btrfs_qgroup *member; -}; - static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); /* must be called with qgroup_ioctl_lock held */ -static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, +static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info, u64 qgroupid) { struct rb_node *n = fs_info->qgroup_tree.rb_node; @@ -236,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; @@ -268,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); return 0; } @@ -346,7 +335,7 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info, } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, +int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl) { struct btrfs_qgroup *qgroup; @@ -468,6 +457,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } if (!qgroup) { struct btrfs_qgroup *prealloc; + struct btrfs_root *tree_root = fs_info->tree_root; prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { @@ -475,6 +465,25 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); + /* + * If a qgroup exists for a subvolume ID, it is possible + * that subvolume has been deleted, in which case + * reusing that ID would lead to incorrect accounting. + * + * Ensure that we skip any such subvol ids. + * + * We don't need to lock because this is only called + * during mount before we start doing things like creating + * subvolumes. + */ + if (is_fstree(qgroup->qgroupid) && + qgroup->qgroupid > tree_root->free_objectid) + /* + * Don't need to check against BTRFS_LAST_FREE_OBJECTID, + * as it will get checked on the next call to + * btrfs_get_free_objectid. + */ + tree_root->free_objectid = qgroup->qgroupid + 1; } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) @@ -588,7 +597,7 @@ out: * Return false if no reserved space is left. * Return true if some reserved space is leaked. */ -bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) +bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info) { struct rb_node *node; bool ret = false; @@ -633,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } @@ -664,9 +673,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - btrfs_free_path(path); return ret; } @@ -743,8 +749,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(trans, leaf); - btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; @@ -762,8 +766,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(trans, leaf); - ret = 0; out: btrfs_free_path(path); @@ -850,9 +852,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -896,9 +895,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -938,9 +934,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -1112,6 +1105,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1120,8 +1114,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(trans, leaf); - key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; key.offset = 0; @@ -1245,8 +1237,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ @@ -1314,24 +1304,19 @@ out: */ static int flush_reservations(struct btrfs_fs_info *fs_info) { - struct btrfs_trans_handle *trans; int ret; ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - trans = btrfs_join_transaction(fs_info->tree_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_commit_transaction(trans); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); - return ret; + return btrfs_commit_current_transaction(fs_info->tree_root); } int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { - struct btrfs_root *quota_root; + struct btrfs_root *quota_root = NULL; struct btrfs_trans_handle *trans = NULL; int ret = 0; @@ -1342,16 +1327,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) lockdep_assert_held_write(&fs_info->subvol_sem); /* - * Lock the cleaner mutex to prevent races with concurrent relocation, - * because relocation may be building backrefs for blocks of the quota - * root while we are deleting the root. This is like dropping fs roots - * of deleted snapshots/subvolumes, we need the same protection. - * - * This also prevents races between concurrent tasks trying to disable - * quotas, because we will unlock and relock qgroup_ioctl_lock across - * BTRFS_FS_QUOTA_ENABLED changes. + * Relocation will mess with backrefs, so make sure we have the + * cleaner_mutex held to protect us from relocate. */ - mutex_lock(&fs_info->cleaner_mutex); + lockdep_assert_held(&fs_info->cleaner_mutex); mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) @@ -1373,9 +1352,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); + /* + * We have nothing held here and no trans handle, just return the error + * if there is one. + */ ret = flush_reservations(fs_info); if (ret) - goto out_unlock_cleaner; + return ret; /* * 1 For the root item @@ -1404,7 +1387,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -1428,20 +1411,19 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_tree_lock(quota_root->node); btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); - btrfs_free_tree_block(trans, btrfs_root_id(quota_root), - quota_root->node, 0, 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root), + quota_root->node, 0, 1); - btrfs_put_root(quota_root); + if (ret < 0) + btrfs_abort_transaction(trans, ret); out: + btrfs_put_root(quota_root); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_commit_transaction(trans); -out_unlock_cleaner: - mutex_unlock(&fs_info->cleaner_mutex); - return ret; } @@ -1541,18 +1523,15 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, { struct btrfs_qgroup *qgroup; int ret = 1; - int err = 0; qgroup = find_qgroup_rb(fs_info, src); if (!qgroup) goto out; if (qgroup->excl == qgroup->rfer) { - ret = 0; - err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); - if (err < 0) { - ret = err; + ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); + if (ret < 0) goto out; - } + ret = 0; } out: if (ret) @@ -1560,15 +1539,21 @@ out: return ret; } -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) +/* + * Add relation between @src and @dst qgroup. The @prealloc is allocated by the + * callers and transferred here (either used or freed on error). + */ +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, + struct btrfs_qgroup_list *prealloc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct btrfs_qgroup_list *prealloc = NULL; int ret = 0; + ASSERT(prealloc); + /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; @@ -1593,11 +1578,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst } } - prealloc = kzalloc(sizeof(*list), GFP_NOFS); - if (!prealloc) { - ret = -ENOMEM; - goto out; - } ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out; @@ -1736,13 +1716,55 @@ out: return ret; } -static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) +/* + * Return 0 if we can not delete the qgroup (not empty or has children etc). + * Return >0 if we can delete the qgroup. + * Return <0 for other errors during tree search. + */ +static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { - return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || - qgroup->excl > 0 || qgroup->excl_cmpr > 0 || - qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + /* + * Squota would never be inconsistent, but there can still be case + * where a dropped subvolume still has qgroup numbers, and squota + * relies on such qgroup for future accounting. + * + * So for squota, do not allow dropping any non-zero qgroup. + */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && + (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr)) + return 0; + + /* For higher level qgroup, we can only delete it if it has no child. */ + if (btrfs_qgroup_level(qgroup->qgroupid)) { + if (!list_empty(&qgroup->members)) + return 0; + return 1; + } + + /* + * For level-0 qgroups, we can only delete it if it has no subvolume + * for it. + * This means even a subvolume is unlinked but not yet fully dropped, + * we can not delete the qgroup. + */ + key.objectid = qgroup->qgroupid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = -1ULL; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); + btrfs_free_path(path); + /* + * The @ret from btrfs_find_root() exactly matches our definition for + * the return value, thus can be returned directly. + */ + return ret; } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) @@ -1764,7 +1786,10 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } - if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { + ret = can_delete_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + if (ret == 0) { ret = -EBUSY; goto out; } @@ -1789,6 +1814,45 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) } spin_lock(&fs_info->qgroup_lock); + /* + * Warn on reserved space. The subvolume should has no child nor + * corresponding subvolume. + * Thus its reserved space should all be zero, no matter if qgroup + * is consistent or the mode. + */ + if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + + } + /* + * The same for rfer/excl numbers, but that's only if our qgroup is + * consistent and if it's in regular qgroup mode. + * For simple mode it's not as accurate thus we can hit non-zero values + * very frequently. + */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && + !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { + if (qgroup->rfer || qgroup->excl || + qgroup->rfer_cmpr || qgroup->excl_cmpr) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rfer, qgroup->rfer_cmpr, + qgroup->excl, qgroup->excl_cmpr); + qgroup_mark_inconsistent(fs_info); + } + } del_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); @@ -1804,6 +1868,40 @@ out: return ret; } +int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid) +{ + struct btrfs_trans_handle *trans; + int ret; + + if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root) + return 0; + + /* + * Commit current transaction to make sure all the rfer/excl numbers + * get updated. + */ + ret = btrfs_commit_current_transaction(fs_info->quota_root); + if (ret < 0) + return ret; + + /* Start new trans to delete the qgroup info and limit items. */ + trans = btrfs_start_transaction(fs_info->quota_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_remove_qgroup(trans, subvolid); + btrfs_end_transaction(trans); + /* + * It's squota and the subvolume still has numbers needed for future + * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. + */ + if (ret == -EBUSY || ret == -ENOENT) + ret = 0; + return ret; +} + int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { @@ -1890,43 +1988,49 @@ out: * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. - * Error is not possible + * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record, + u64 bytenr) { - struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_qgroup_extent_record *entry; - u64 bytenr = record->bytenr; + struct btrfs_qgroup_extent_record *existing, *ret; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; - lockdep_assert_held(&delayed_refs->lock); - trace_btrfs_qgroup_trace_extent(fs_info, record); +#if BITS_PER_LONG == 32 + if (bytenr >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, +"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", + bytenr); + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; + } +#endif - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, - node); - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - } else { - if (record->data_rsv && !entry->data_rsv) { - entry->data_rsv = record->data_rsv; - entry->data_rsv_refroot = - record->data_rsv_refroot; - } - return 1; + trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); + + xa_lock(&delayed_refs->dirty_extents); + existing = xa_load(&delayed_refs->dirty_extents, index); + if (existing) { + if (record->data_rsv && !existing->data_rsv) { + existing->data_rsv = record->data_rsv; + existing->data_rsv_refroot = record->data_rsv_refroot; } + xa_unlock(&delayed_refs->dirty_extents); + return 1; + } + + ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); + xa_unlock(&delayed_refs->dirty_extents); + if (xa_is_err(ret)) { + qgroup_mark_inconsistent(fs_info); + return xa_err(ret); } - rb_link_node(&record->node, parent_node, p); - rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); return 0; } @@ -1952,12 +2056,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord) + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr) { - struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_backref_walk_ctx ctx = { + .bytenr = bytenr, + .fs_info = fs_info, + }; int ret; - if (!btrfs_qgroup_full_accounting(trans->fs_info)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a @@ -1980,16 +2089,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); - if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = qrecord->bytenr; - ctx.fs_info = trans->fs_info; - ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(trans->fs_info); - btrfs_warn(trans->fs_info, + qgroup_mark_inconsistent(fs_info); + btrfs_warn(fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; @@ -2024,7 +2130,8 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) @@ -2033,19 +2140,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - delayed_refs = &trans->transaction->delayed_refs; - record->bytenr = bytenr; + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { + kfree(record); + return -ENOMEM; + } + record->num_bytes = num_bytes; - record->old_roots = NULL; - spin_lock(&delayed_refs->lock); - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); - spin_unlock(&delayed_refs->lock); - if (ret > 0) { + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* @@ -2505,8 +2614,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; - BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); - BUG_ON(root_eb == NULL); + ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); + ASSERT(root_eb != NULL); if (!btrfs_qgroup_full_accounting(fs_info)) return 0; @@ -2530,7 +2639,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { - .has_first_key = false, .transid = root_gen, .level = root_level }; @@ -2861,8 +2969,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; - BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); @@ -2912,7 +3018,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; - struct rb_node *node; + unsigned long index; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -2922,18 +3028,17 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; - while ((node = rb_first(&delayed_refs->dirty_extent_root))) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); + xa_for_each(&delayed_refs->dirty_extents, index, record) { + const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); num_dirty_extents++; - trace_btrfs_qgroup_account_extents(fs_info, record); + trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; - ctx.bytenr = record->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* @@ -2959,11 +3064,6 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ctx.roots = NULL; } - /* Free the reserved data space */ - btrfs_qgroup_free_refroot(fs_info, - record->data_rsv_refroot, - record->data_rsv, - BTRFS_QGROUP_RSV_DATA); /* * Use BTRFS_SEQ_LAST as time_seq to do special search, * which doesn't lock tree or delayed_refs and search @@ -2980,18 +3080,23 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, record->bytenr, + ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); record->old_roots = NULL; new_roots = NULL; } + /* Free the reserved data space */ + btrfs_qgroup_free_refroot(fs_info, + record->data_rsv_refroot, + record->data_rsv, + BTRFS_QGROUP_RSV_DATA); cleanup: ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; - rb_erase(node, &delayed_refs->dirty_extent_root); + xa_erase(&delayed_refs->dirty_extents, index); kfree(record); } @@ -3048,6 +3153,62 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) return ret; } +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size) +{ + if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) + return -EOPNOTSUPP; + if (size < sizeof(*inherit) || size > PAGE_SIZE) + return -EINVAL; + + /* + * In the past we allowed btrfs_qgroup_inherit to specify to copy + * rfer/excl numbers directly from other qgroups. This behavior has + * been disabled in userspace for a very long time, but here we should + * also disable it in kernel, as this behavior is known to mark qgroup + * inconsistent, and a rescan would wipe out the changes anyway. + * + * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. + */ + if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) + return -EINVAL; + + if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) + return -EINVAL; + + /* + * Skip the inherit source qgroups check if qgroup is not enabled. + * Qgroup can still be later enabled causing problems, but in that case + * btrfs_qgroup_inherit() would just ignore those invalid ones. + */ + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + + /* + * Now check all the remaining qgroups, they should all: + * + * - Exist + * - Be higher level qgroups. + */ + for (int i = 0; i < inherit->num_qgroups; i++) { + struct btrfs_qgroup *qgroup; + u64 qgroupid = inherit->qgroups[i]; + + if (btrfs_qgroup_level(qgroupid) == 0) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + spin_unlock(&fs_info->qgroup_lock); + return -ENOENT; + } + spin_unlock(&fs_info->qgroup_lock); + } + return 0; +} + static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, u64 inode_rootid, struct btrfs_qgroup_inherit **inherit) @@ -3083,13 +3244,69 @@ static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, qgids = res->qgroups; list_for_each_entry(qg_list, &inode_qg->groups, next_group) - qgids[i] = qg_list->group->qgroupid; + qgids[i++] = qg_list->group->qgroupid; *inherit = res; return 0; } /* + * Check if we can skip rescan when inheriting qgroups. If @src has a single + * @parent, and that @parent is owning all its bytes exclusively, we can skip + * the full rescan, by just adding nodesize to the @parent's excl/rfer. + * + * Return <0 for fatal errors (like srcid/parentid has no qgroup). + * Return 0 if a quick inherit is done. + * Return >0 if a quick inherit is not possible, and a full rescan is needed. + */ +static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, + u64 srcid, u64 parentid) +{ + struct btrfs_qgroup *src; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + int nr_parents = 0; + + src = find_qgroup_rb(fs_info, srcid); + if (!src) + return -ENOENT; + parent = find_qgroup_rb(fs_info, parentid); + if (!parent) + return -ENOENT; + + /* + * Source has no parent qgroup, but our new qgroup would have one. + * Qgroup numbers would become inconsistent. + */ + if (list_empty(&src->groups)) + return 1; + + list_for_each_entry(list, &src->groups, next_group) { + /* The parent is not the same, quick update is not possible. */ + if (list->group->qgroupid != parentid) + return 1; + nr_parents++; + /* + * More than one parent qgroup, we can't be sure about accounting + * consistency. + */ + if (nr_parents > 1) + return 1; + } + + /* + * The parent is not exclusively owning all its bytes. We're not sure + * if the source has any bytes not fully owned by the parent. + */ + if (parent->excl != parent->rfer) + return 1; + + parent->excl += fs_info->nodesize; + parent->rfer += fs_info->nodesize; + return 0; +} + +/* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error @@ -3100,7 +3317,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_qgroup_inherit *inherit) { int ret = 0; - int i; u64 *i_qgroups; bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; @@ -3157,7 +3373,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, i_qgroups = (u64 *)(inherit + 1); nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2 * inherit->num_excl_copies; - for (i = 0; i < nums; ++i) { + for (int i = 0; i < nums; i++) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); /* @@ -3184,7 +3400,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { + for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) { if (*i_qgroups == 0) continue; ret = add_qgroup_relation_item(trans, objectid, @@ -3257,13 +3473,20 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); + + /* + * If the source qgroup has parent but the new one doesn't, + * we need a full rescan. + */ + if (!inherit && !list_empty(&srcgroup->groups)) + need_rescan = true; } if (!inherit) goto unlock; i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i) { + for (int i = 0; i < inherit->num_qgroups; i++) { if (*i_qgroups) { ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, *i_qgroups); @@ -3271,17 +3494,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (ret) goto unlock; } + if (srcid) { + /* Check if we can do a quick inherit. */ + ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); + if (ret < 0) + goto unlock; + if (ret > 0) + need_rescan = true; + ret = 0; + } ++i_qgroups; - - /* - * If we're doing a snapshot, and adding the snapshot to a new - * qgroup, the numbers are guaranteed to be incorrect. - */ - if (srcid) - need_rescan = true; } - for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { + for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; @@ -3302,7 +3527,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, /* Manually tweaking numbers certainly needs a rescan */ need_rescan = true; } - for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { + for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; @@ -3360,7 +3585,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, { struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; - u64 ref_root = root->root_key.objectid; + u64 ref_root = btrfs_root_id(root); int ret = 0; LIST_HEAD(qgroup_list); @@ -3595,7 +3820,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - int err = -ENOMEM; int ret = 0; bool stopped = false; bool did_leaf_rescans = false; @@ -3604,8 +3828,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) return; path = btrfs_alloc_path(); - if (!path) + if (!path) { + ret = -ENOMEM; goto out; + } /* * Rescan should only search for commit root, and any later difference * should be recorded by qgroup @@ -3613,18 +3839,17 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path->search_commit_root = 1; path->skip_locking = 1; - err = 0; - while (!err && !(stopped = rescan_should_stop(fs_info))) { + while (!ret && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); break; } - err = qgroup_rescan_leaf(trans, path); + ret = qgroup_rescan_leaf(trans, path); did_leaf_rescans = true; - if (err > 0) + if (ret > 0) btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); @@ -3634,10 +3859,10 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (err > 0 && + if (ret > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0 || stopped) { + } else if (ret < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3652,11 +3877,11 @@ out: if (did_leaf_rescans) { trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", - err); + ret); } } else { trans = NULL; @@ -3667,11 +3892,11 @@ out: fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", - err); + int ret2 = update_qgroup_status_item(trans); + + if (ret2 < 0) { + ret = ret2; + btrfs_err(fs_info, "fail to update qgroup status: %d", ret); } } fs_info->qgroup_rescan_running = false; @@ -3688,11 +3913,11 @@ out: btrfs_info(fs_info, "qgroup scan paused"); } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { btrfs_info(fs_info, "qgroup scan cancelled"); - } else if (err >= 0) { + } else if (ret >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", - err > 0 ? " (inconsistency flag cleared)" : ""); + ret > 0 ? " (inconsistency flag cleared)" : ""); } else { - btrfs_err(fs_info, "qgroup scan failed with %d", err); + btrfs_err(fs_info, "qgroup scan failed with %d", ret); } } @@ -3715,14 +3940,14 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); - ret = -EINVAL; + ret = -ENOTCONN; } if (ret) @@ -3733,14 +3958,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - btrfs_warn(fs_info, - "qgroup rescan is already in progress"); ret = -EINPROGRESS; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); - ret = -EINVAL; + ret = -ENOTCONN; } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; @@ -3789,7 +4012,6 @@ int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) { int ret = 0; - struct btrfs_trans_handle *trans; ret = qgroup_rescan_init(fs_info, 0, 1); if (ret) @@ -3806,16 +4028,10 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) * going to clear all tracking information for a clean start. */ - trans = btrfs_attach_transaction_barrier(fs_info->fs_root); - if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { + ret = btrfs_commit_current_transaction(fs_info->fs_root); + if (ret) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return PTR_ERR(trans); - } else if (trans != ERR_PTR(-ENOENT)) { - ret = btrfs_commit_transaction(trans); - if (ret) { - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return ret; - } + return ret; } qgroup_rescan_zero_tracking(fs_info); @@ -3951,7 +4167,6 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, */ static int try_flush_qgroup(struct btrfs_root *root) { - struct btrfs_trans_handle *trans; int ret; /* Can't hold an open transaction or we run the risk of deadlocking. */ @@ -3972,17 +4187,18 @@ static int try_flush_qgroup(struct btrfs_root *root) ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - if (ret == -ENOENT) - ret = 0; - goto out; - } + /* + * After waiting for ordered extents run delayed iputs in order to free + * space from unlinked files before committing the current transaction, + * as ordered extents may have been holding the last reference of an + * inode and they add a delayed iput when they complete. + */ + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); @@ -4001,7 +4217,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid) || len == 0) + !is_fstree(btrfs_root_id(root)) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -4117,7 +4333,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, + btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed, BTRFS_QGROUP_RSV_DATA); if (freed_ret) *freed_ret = freed; @@ -4136,10 +4352,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - extent_changeset_init(&changeset); return clear_record_extent_bits(&inode->io_tree, start, start + len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ @@ -4158,7 +4373,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, + btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); if (released) *released = changeset.bytes_changed; @@ -4253,7 +4468,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid) || num_bytes == 0) + !is_fstree(btrfs_root_id(root)) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -4298,13 +4513,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid)) + !is_fstree(btrfs_root_id(root))) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ - btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -4314,7 +4529,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid)) + !is_fstree(btrfs_root_id(root))) return; /* @@ -4325,8 +4540,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); - btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, - num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -4374,13 +4588,15 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid)) + !is_fstree(btrfs_root_id(root))) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); - qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); + if (!sb_rdonly(fs_info->sb)) + add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* @@ -4407,7 +4623,7 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) btrfs_ino(inode), unode->val, unode->aux); } btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, + btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } @@ -4466,8 +4682,7 @@ out: * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -4593,7 +4808,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (!is_fstree(root->root_key.objectid) || !root->reloc_root) + if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0; spin_lock(&blocks->lock); @@ -4664,30 +4879,17 @@ out: void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; - struct btrfs_qgroup_extent_record *next; - struct rb_root *root; + unsigned long index; - root = &trans->delayed_refs.dirty_extent_root; - rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) { ulist_free(entry->old_roots); kfree(entry); } - *root = RB_ROOT; -} - -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) -{ - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) - return; - - if (!is_fstree(root)) - return; - - btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); + xa_destroy(&trans->delayed_refs.dirty_extents); } int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, - struct btrfs_squota_delta *delta) + const struct btrfs_squota_delta *delta) { int ret; struct btrfs_qgroup *qgroup; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index be18c862e64e..e233cc79af18 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,12 +6,22 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include <linux/types.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/kobject.h> -#include "ulist.h" -#include "delayed-ref.h" -#include "misc.h" +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; +struct extent_changeset; +struct btrfs_delayed_extent_op; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_ioctl_quota_ctl_args; +struct btrfs_trans_handle; +struct btrfs_delayed_ref_root; +struct btrfs_inode; /* * Btrfs qgroup overview @@ -111,13 +121,18 @@ #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) +#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) + /* * Record a dirty extent, and info qgroup to update quota on it - * TODO: Use kmem cache to alloc it. */ struct btrfs_qgroup_extent_record { - struct rb_node node; - u64 bytenr; + /* + * The bytenr of the extent is given by its index in the dirty_extents + * xarray of struct btrfs_delayed_ref_root left shifted by + * fs_info->sectorsize_bits. + */ + u64 num_bytes; /* @@ -269,6 +284,14 @@ struct btrfs_qgroup { struct kobject kobj; }; +/* Glue structure to represent the relations between qgroups. */ +struct btrfs_qgroup_list { + struct list_head next_group; + struct list_head next_member; + struct btrfs_qgroup *group; + struct btrfs_qgroup *member; +}; + struct btrfs_squota_delta { /* The fstree root this delta counts against. */ u64 root; @@ -302,9 +325,9 @@ enum btrfs_qgroup_mode { BTRFS_QGROUP_MODE_SIMPLE }; -enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info); -bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info); -bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info); +enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info); int btrfs_quota_enable(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_quota_ctl_args *quota_ctl_args); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); @@ -312,23 +335,26 @@ int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, + struct btrfs_qgroup_list *prealloc); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); +int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); + struct btrfs_qgroup_extent_record *record, + u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord); + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, @@ -341,6 +367,9 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots); int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u64 objectid, u64 inode_rootid, struct btrfs_qgroup_inherit *inherit); @@ -349,7 +378,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, enum btrfs_qgroup_rsv_type type); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, +int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif @@ -410,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -419,9 +447,8 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); -bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); +bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, - struct btrfs_squota_delta *delta); + const struct btrfs_squota_delta *delta); #endif diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9589362acfbf..1834011ccc49 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -11,9 +11,58 @@ #include "disk-io.h" #include "raid-stripe-tree.h" #include "volumes.h" -#include "misc.h" #include "print-tree.h" +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *oldkey, + u64 newlen, u64 frontpad) +{ + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent, *newitem; + struct extent_buffer *leaf; + int slot; + size_t item_size; + struct btrfs_key newkey = { + .objectid = oldkey->objectid + frontpad, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = newlen, + }; + int ret; + + ASSERT(newlen > 0); + ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); + + leaf = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(leaf, slot); + + newitem = kzalloc(item_size, GFP_NOFS); + if (!newitem) + return -ENOMEM; + + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; + btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys); + } + + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: + kfree(newitem); + return ret; +} + int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -27,9 +76,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le int slot; int ret; - if (!stripe_root) + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root) return 0; + if (!btrfs_is_testing(fs_info)) { + struct btrfs_chunk_map *map; + bool use_rst; + + map = btrfs_find_chunk_map(fs_info, start, length); + if (!map) + return -EINVAL; + use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); + btrfs_free_chunk_map(map); + if (!use_rst) + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -37,23 +99,55 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; + key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; + + if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; - } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; + ret = 0; + + /* + * The stripe extent starts before the range we want to delete, + * but the range spans more than one stripe extent: + * + * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to get the previous item, truncate its + * length and then restart the search. + */ + if (found_start > start) { + if (slot == 0) { + ret = btrfs_previous_item(stripe_root, path, start, + BTRFS_RAID_STRIPE_KEY); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } + } else { + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + ASSERT(found_start <= start); + } + + if (key.type != BTRFS_RAID_STRIPE_KEY) + break; /* That stripe ends before we start, we're done. */ if (found_end <= start) @@ -62,11 +156,107 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); - ASSERT(found_start >= start && found_end <= end); + /* + * The stripe extent starts before the range we want to delete + * and ends after the range we want to delete, i.e. we're + * punching a hole in the stripe extent: + * + * |--- RAID Stripe Extent ---| + * | keep |--- drop ---| keep | + * + * This means we need to a) truncate the existing item and b) + * create a second item for the remaining range. + */ + if (found_start < start && found_end > end) { + size_t item_size; + u64 diff_start = start - found_start; + u64 diff_end = found_end - end; + struct btrfs_stripe_extent *extent; + struct btrfs_key newkey = { + .objectid = end, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = diff_end, + }; + + /* The "right" item. */ + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + break; + + item_size = btrfs_item_size(leaf, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + phys += diff_start + length; + btrfs_set_raid_stride_physical(leaf, stride, phys); + } + + /* The "left" item. */ + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + break; + } + + /* + * The stripe extent starts before the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_start < start) { + u64 diff_start = start - found_start; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + + start += (key.offset - diff_start); + length -= (key.offset - diff_start); + if (length == 0) + break; + + btrfs_release_path(path); + continue; + } + + /* + * The stripe extent ends after the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- drop ---|--- keep ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_end > end) { + u64 diff_end = found_end - end; + + btrfs_partially_delete_raid_extent(trans, path, &key, + key.offset - length, + length); + ASSERT(key.offset - diff_end == length); + break; + } + + /* Finally we can delete the whole item, no more special cases. */ ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; + start += key.offset; + length -= key.offset; + if (length == 0) + break; + btrfs_release_path(path); } @@ -74,14 +264,43 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le return ret; } -static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, - struct btrfs_io_context *bioc) +static int update_raid_extent_item(struct btrfs_trans_handle *trans, + struct btrfs_key *key, + struct btrfs_stripe_extent *stripe_extent, + const size_t item_size) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path, + 0, 1); + if (ret) + return (ret == 1 ? ret : -EINVAL); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), + item_size); + btrfs_free_path(path); + + return ret; +} + +EXPORT_FOR_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; struct btrfs_root *stripe_root = fs_info->stripe_root; const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); - u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type); struct btrfs_stripe_extent *stripe_extent; const size_t item_size = struct_size(stripe_extent, strides, num_stripes); int ret; @@ -95,16 +314,11 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size, num_stripes); - btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding); for (int i = 0; i < num_stripes; i++) { u64 devid = bioc->stripes[i].dev->devid; u64 physical = bioc->stripes[i].physical; - u64 length = bioc->stripes[i].length; struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; - if (length == 0) - length = bioc->size; - btrfs_set_stack_raid_stride_devid(raid_stride, devid); btrfs_set_stack_raid_stride_physical(raid_stride, physical); } @@ -115,6 +329,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); + if (ret == -EEXIST) + ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, + item_size); if (ret) btrfs_abort_transaction(trans, ret); @@ -160,7 +377,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf; const u64 end = logical + *length; int num_stripes; - u8 encoding; u64 offset; u64 found_logical; u64 found_length; @@ -176,7 +392,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, if (!path) return -ENOMEM; - if (stripe->is_scrub) { + if (stripe->rst_search_commit_root) { path->skip_locking = 1; path->search_commit_root = 1; } @@ -199,7 +415,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, found_end = found_logical + found_length; if (found_logical > end) { - ret = -ENOENT; + ret = -ENODATA; goto out; } @@ -223,16 +439,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); - encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent); - - if (encoding != btrfs_bg_flags_to_raid_index(map_type)) { - ret = -EUCLEAN; - btrfs_handle_fs_error(fs_info, ret, - "on-disk stripe encoding %d doesn't match RAID index %d", - encoding, - btrfs_bg_flags_to_raid_index(map_type)); - goto out; - } for (int i = 0; i < num_stripes; i++) { struct btrfs_raid_stride *stride = &stripe_extent->strides[i]; @@ -255,14 +461,12 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, } /* If we're here, we haven't found the requested devid in the stripe. */ - ret = -ENOENT; + ret = -ENODATA; out: if (ret > 0) - ret = -ENOENT; - if (ret && ret != -EIO && !stripe->is_scrub) { - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - btrfs_print_tree(leaf, 1); - btrfs_err(fs_info, + ret = -ENODATA; + if (ret && ret != -EIO && !stripe->rst_search_commit_root) { + btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index cdb58b38fcb5..541836421778 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -6,6 +6,10 @@ #ifndef BTRFS_RAID_STRIPE_TREE_H #define BTRFS_RAID_STRIPE_TREE_H +#include <linux/types.h> +#include <uapi/linux/btrfs_tree.h> +#include "fs.h" + #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ BTRFS_BLOCK_GROUP_RAID0 | \ @@ -13,6 +17,7 @@ struct btrfs_io_context; struct btrfs_io_stripe; +struct btrfs_fs_info; struct btrfs_ordered_extent; struct btrfs_trans_handle; @@ -23,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc); +#endif + static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { @@ -43,8 +53,7 @@ static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, static inline int btrfs_num_raid_stripes(u32 item_size) { - return (item_size - offsetof(struct btrfs_stripe_extent, strides)) / - sizeof(struct btrfs_raid_stride); + return item_size / sizeof(struct btrfs_raid_stride); } #endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 792c8e17c31d..cdd373c27784 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -14,7 +14,6 @@ #include <linux/raid/xor.h> #include <linux/mm.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -41,6 +40,85 @@ #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc) +{ + if (unlikely(!bioc)) { + btrfs_crit(fs_info, "bioc=NULL"); + return; + } + btrfs_crit(fs_info, +"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u", + bioc->logical, bioc->full_stripe_logical, bioc->size, + bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes, + bioc->replace_stripe_src, bioc->num_stripes); + for (int i = 0; i < bioc->num_stripes; i++) { + btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu", + i, bioc->stripes[i].dev->devid, + bioc->stripes[i].physical); + } +} + +static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, + const struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + dump_bioc(fs_info, rbio->bioc); + btrfs_crit(fs_info, +"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx", + rbio->flags, rbio->nr_sectors, rbio->nr_data, + rbio->real_stripes, rbio->stripe_nsectors, + rbio->scrubp, rbio->dbitmap); +} + +#define ASSERT_RBIO(expr, rbio) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "logical=%llu", (logical)); \ + } \ + ASSERT((expr)); \ +}) + /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; @@ -332,12 +410,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { - bio_list_merge(&dest->bio_list, &victim->bio_list); + bio_list_merge_init(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; /* Also inherit the bitmaps from @victim. */ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); - bio_list_init(&victim->bio_list); } /* @@ -594,8 +671,8 @@ static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr) { - ASSERT(stripe_nr < rbio->real_stripes); - ASSERT(sector_nr < rbio->stripe_nsectors); + ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); return stripe_nr * rbio->stripe_nsectors + sector_nr; } @@ -875,8 +952,10 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; int index; - ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); - ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, + rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, + rbio, sector_nr); index = stripe_nr * rbio->stripe_nsectors + sector_nr; ASSERT(index >= 0 && index < rbio->nr_sectors); @@ -918,6 +997,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8). + */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -955,6 +1041,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); return rbio; } @@ -964,7 +1051,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; - ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0); + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false); if (ret < 0) return ret; /* Mapping all sectors */ @@ -979,7 +1066,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, - rbio->stripe_pages + data_pages, 0); + rbio->stripe_pages + data_pages, false); if (ret < 0) return ret; @@ -1051,8 +1138,10 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, * thus it can be larger than rbio->real_stripe. * So here we check against bioc->num_stripes, not rbio->real_stripes. */ - ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); - ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes, + rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, + rbio, sector_nr); ASSERT(sector->page); stripe = &rbio->bioc->stripes[stripe_nr]; @@ -1181,6 +1270,25 @@ static inline void bio_list_put(struct bio_list *bio_list) bio_put(bio); } +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. + */ + ASSERT_RBIO(rbio->real_stripes >= 2, rbio); + ASSERT_RBIO(rbio->nr_data > 0, rbio); + + /* + * This is another check to make sure nr data stripes is smaller + * than total stripes. + */ + ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); +} + /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1212,6 +1320,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { @@ -1530,7 +1639,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0); + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false); if (ret < 0) return ret; @@ -1614,9 +1723,10 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) const u32 sectorsize = fs_info->sectorsize; u64 cur_logical; - ASSERT(orig_logical >= full_stripe_start && - orig_logical + orig_len <= full_stripe_start + - rbio->nr_data * BTRFS_STRIPE_LEN); + ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN, + rbio, orig_logical); bio_list_add(&rbio->bio_list, orig_bio); rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; @@ -2362,7 +2472,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, break; } } - ASSERT(i < rbio->real_stripes); + ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i); bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); return rbio; @@ -2473,6 +2583,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } if (has_qstripe) { + assert_rbio(rbio); /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); @@ -2527,7 +2638,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) * Replace is running and our parity stripe needs to be duplicated to * the target device. Check we have a valid source stripe number. */ - ASSERT(rbio->bioc->replace_stripe_src >= 0); + ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 470213688872..0d7b4c2fb6ae 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,9 +7,18 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/bio.h> +#include <linux/refcount.h> #include <linux/workqueue.h> #include "volumes.h" +struct page; +struct sector_ptr; +struct btrfs_fs_info; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c2b66d155ef..1c2d7cb1fe6f 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -6,6 +6,12 @@ #ifndef BTRFS_RCU_STRING_H #define BTRFS_RCU_STRING_H +#include <linux/types.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/rcupdate.h> +#include <linux/printk.h> + struct rcu_string { struct rcu_head rcu; char str[]; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 8c4fc98ca9ce..2928abf7eb82 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; - int type, ret; + int type; + int ret = 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, key->objectid, key->offset); break; case BTRFS_EXTENT_OWNER_REF_KEY: - WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) { + btrfs_err(fs_info, + "found extent owner ref without simple quotas enabled"); + ret = -EINVAL; + } break; default: btrfs_err(fs_info, "invalid key type in iref"); @@ -673,7 +678,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, int ret = 0; bool metadata; u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; + u64 num_bytes = generic_ref->num_bytes; u64 parent = generic_ref->parent; u64 ref_root = 0; u64 owner = 0; @@ -684,11 +689,11 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.ref_root; + ref_root = generic_ref->ref_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.ref_root; - owner = generic_ref->data_ref.ino; + ref_root = generic_ref->ref_root; + owner = generic_ref->data_ref.objectid; offset = generic_ref->data_ref.offset; } metadata = owner < BTRFS_FIRST_FREE_OBJECTID; @@ -852,6 +857,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + rb_erase(&ref->node, &be->refs); kfree(ref); kfree(ra); goto out_unlock; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 855de37719b5..3511e1a5c96b 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -6,7 +6,16 @@ #ifndef BTRFS_REF_VERIFY_H #define BTRFS_REF_VERIFY_H +#include <linux/types.h> +#include <linux/rbtree_types.h> + +struct btrfs_fs_info; +struct btrfs_ref; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + +#include <linux/spinlock.h> + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ae90894dc7dc..f0824c948cb7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -66,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; @@ -83,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret) goto out; - page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(mapping)); - if (!page) { + folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + btrfs_alloc_write_mask(mapping)); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; @@ -115,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, offset_in_page(file_offset), data_start, - datal); + memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, + datal); } else { - ret = btrfs_decompress(comp_type, data_start, page, - offset_in_page(file_offset), + ret = btrfs_decompress(comp_type, data_start, folio, + offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; - flush_dcache_page(page); + flush_dcache_folio(folio); } /* @@ -139,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) - memzero_page(page, datal, block_size - datal); + folio_zero_range(folio, datal, block_size - datal); - btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); + btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); + btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: - if (page) { - unlock_page(page); - put_page(page); + if (!IS_ERR(folio)) { + folio_unlock(folio); + folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, @@ -174,7 +175,7 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); @@ -337,7 +338,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, int no_time_update) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = NULL; struct extent_buffer *leaf; struct btrfs_trans_handle *trans; @@ -616,35 +617,6 @@ out: return ret; } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); -} - -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - u64 range1_end = loff1 + len - 1; - u64 range2_end = loff2 + len - 1; - - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - swap(range1_end, range2_end); - } else if (inode1 == inode2 && loff2 < loff1) { - swap(loff1, loff2); - swap(range1_end, range2_end); - } - - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); - - btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); - btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); -} - static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) { if (inode1 < inode2) @@ -662,17 +634,21 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { + const u64 end = dst_loff + len - 1; + struct extent_state *cached_state = NULL; struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; - const u64 bs = fs_info->sb->s_blocksize; + const u64 bs = fs_info->sectorsize; int ret; /* - * Lock destination range to serialize with concurrent readahead() and - * source range to serialize with relocation. + * Lock destination range to serialize with concurrent readahead(), and + * we are safe from concurrency with relocation of source extents + * because we have already locked the inode's i_mmap_lock in exclusive + * mode. */ - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -690,7 +666,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (root_dst->send_in_progress) { btrfs_warn_rl(root_dst->fs_info, "cannot deduplicate to root %llu while send operations are using it (%d in progress)", - root_dst->root_key.objectid, + btrfs_root_id(root_dst), root_dst->send_in_progress); spin_unlock(&root_dst->root_item_lock); return -EAGAIN; @@ -724,13 +700,15 @@ out: static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 off, u64 olen, u64 destoff) { + struct extent_state *cached_state = NULL; struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; int wb_ret; u64 len = olen; - u64 bs = fs_info->sb->s_blocksize; + u64 bs = fs_info->sectorsize; + u64 end; /* * VFS's generic_remap_file_range_prep() protects us from cloning the @@ -756,26 +734,29 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * we found the previous extent covering eof and before we * attempted to increment its reference count). */ - ret = btrfs_wait_ordered_range(inode, wb_start, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start, destoff - wb_start); if (ret) return ret; } /* - * Lock destination range to serialize with concurrent readahead() and - * source range to serialize with relocation. + * Lock destination range to serialize with concurrent readahead(), and + * we are safe from concurrency with relocation of source extents + * because we have already locked the inode's i_mmap_lock in exclusive + * mode. */ - btrfs_double_extent_lock(src, off, inode, destoff, len); + end = destoff + len - 1; + lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - btrfs_double_extent_unlock(src, off, inode, destoff, len); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination * range, so wait for writeback to complete before truncating pages * from the page cache. This is a rare case. */ - wb_ret = btrfs_wait_ordered_range(inode, destoff, len); + wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); ret = ret ? ret : wb_ret; /* * Truncate page cache pages so that future reads will see the cloned @@ -796,7 +777,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; u64 wb_len; int ret; @@ -855,11 +836,11 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h index ecb309b4dad0..1e291f7d85c4 100644 --- a/fs/btrfs/reflink.h +++ b/fs/btrfs/reflink.h @@ -3,7 +3,9 @@ #ifndef BTRFS_REFLINK_H #define BTRFS_REFLINK_H -#include <linux/fs.h> +#include <linux/types.h> + +struct file; loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index abe594f77f99..af0969b70b53 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -36,6 +36,7 @@ #include "relocation.h" #include "super.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" /* * Relocation overview @@ -231,70 +232,6 @@ static struct btrfs_backref_node *walk_down_backref( return NULL; } -static void update_backref_node(struct btrfs_backref_cache *cache, - struct btrfs_backref_node *node, u64 bytenr) -{ - struct rb_node *rb_node; - rb_erase(&node->rb_node, &cache->rb_root); - node->bytenr = bytenr; - rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node); - if (rb_node) - btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST); -} - -/* - * update backref cache after a transaction commit - */ -static int update_backref_cache(struct btrfs_trans_handle *trans, - struct btrfs_backref_cache *cache) -{ - struct btrfs_backref_node *node; - int level = 0; - - if (cache->last_trans == 0) { - cache->last_trans = trans->transid; - return 0; - } - - if (cache->last_trans == trans->transid) - return 0; - - /* - * detached nodes are used to avoid unnecessary backref - * lookup. transaction commit changes the extent tree. - * so the detached nodes are no longer useful. - */ - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct btrfs_backref_node, list); - btrfs_backref_cleanup_node(cache, node); - } - - while (!list_empty(&cache->changed)) { - node = list_entry(cache->changed.next, - struct btrfs_backref_node, list); - list_del_init(&node->list); - BUG_ON(node->pending); - update_backref_node(cache, node, node->new_bytenr); - } - - /* - * some nodes can be left in the pending list if there were - * errors during processing the pending nodes. - */ - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - list_for_each_entry(node, &cache->pending[level], list) { - BUG_ON(!node->pending); - if (node->bytenr == node->new_bytenr) - continue; - update_backref_node(cache, node, node->new_bytenr); - } - } - - cache->last_trans = 0; - return 1; -} - static bool reloc_root_is_dead(const struct btrfs_root *root) { /* @@ -405,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, if (cur == node) ret = true; - /* The node is the lowest node */ - if (cur->lowest) { - list_del_init(&cur->lower); - cur->lowest = 0; - } - /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; @@ -436,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { - list_add(&cur->list, &cache->detached); cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); @@ -473,34 +403,31 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( struct btrfs_backref_node *node = NULL; struct btrfs_backref_edge *edge; int ret; - int err = 0; iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } node = btrfs_backref_alloc_node(cache, bytenr, level); if (!node) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - node->lowest = 1; cur = node; /* Breadth-first search to build backref cache */ do { ret = btrfs_backref_add_tree_node(trans, cache, path, iter, node_key, cur); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } + edge = list_first_entry_or_null(&cache->pending_edge, struct btrfs_backref_edge, list[UPPER]); /* @@ -515,19 +442,18 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( /* Finish the upper linkage of newly added edges/nodes */ ret = btrfs_backref_finish_upper_links(cache, node); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (handle_useless_nodes(rc, node)) node = NULL; out: - btrfs_backref_iter_free(iter); + btrfs_free_path(iter->path); + kfree(iter); btrfs_free_path(path); - if (err) { + if (ret) { btrfs_backref_error_cleanup(cache, node); - return ERR_PTR(err); + return ERR_PTR(ret); } ASSERT(!node || !node->detached); ASSERT(list_empty(&cache->useless_node) && @@ -536,95 +462,6 @@ out: } /* - * helper to add backref node for the newly created snapshot. - * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - const struct btrfs_root *src, - struct btrfs_root *dest) -{ - struct btrfs_root *reloc_root = src->reloc_root; - struct btrfs_backref_cache *cache = &rc->backref_cache; - struct btrfs_backref_node *node = NULL; - struct btrfs_backref_node *new_node; - struct btrfs_backref_edge *edge; - struct btrfs_backref_edge *new_edge; - struct rb_node *rb_node; - - if (cache->last_trans > 0) - update_backref_cache(trans, cache); - - rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); - if (node->detached) - node = NULL; - else - BUG_ON(node->new_bytenr != reloc_root->node->start); - } - - if (!node) { - rb_node = rb_simple_search(&cache->rb_root, - reloc_root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, - rb_node); - BUG_ON(node->detached); - } - } - - if (!node) - return 0; - - new_node = btrfs_backref_alloc_node(cache, dest->node->start, - node->level); - if (!new_node) - return -ENOMEM; - - new_node->lowest = node->lowest; - new_node->checked = 1; - new_node->root = btrfs_grab_root(dest); - ASSERT(new_node->root); - - if (!node->lowest) { - list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = btrfs_backref_alloc_edge(cache); - if (!new_edge) - goto fail; - - btrfs_backref_link_edge(new_edge, edge->node[LOWER], - new_node, LINK_UPPER); - } - } else { - list_add_tail(&new_node->lower, &cache->leaves); - } - - rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); - if (rb_node) - btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); - - if (!new_node->lowest) { - list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { - list_add_tail(&new_edge->list[LOWER], - &new_edge->node[LOWER]->upper); - } - } - return 0; -fail: - while (!list_empty(&new_node->lower)) { - new_edge = list_entry(new_node->lower.next, - struct btrfs_backref_edge, list[UPPER]); - list_del(&new_edge->list[UPPER]); - btrfs_backref_free_edge(cache, new_edge); - } - btrfs_backref_free_node(cache, new_node); - return -ENOMEM; -} - -/* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) @@ -753,7 +590,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = objectid; - if (root->root_key.objectid == objectid) { + if (btrfs_root_id(root) == objectid) { u64 commit_root_gen; /* called by btrfs_init_reloc_root */ @@ -797,7 +634,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); - if (root->root_key.objectid == objectid) { + if (btrfs_root_id(root) == objectid) { btrfs_set_root_refs(root_item, 0); memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); @@ -820,7 +657,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, goto abort; } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); - reloc_root->last_trans = trans->transid; + btrfs_set_root_last_trans(reloc_root, trans->transid); return reloc_root; fail: kfree(root_item); @@ -867,7 +704,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, */ if (root->reloc_root) { reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; + btrfs_set_root_last_trans(reloc_root, trans->transid); return 0; } @@ -875,8 +712,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, * We are merging reloc roots, we do not need new reloc trees. Also * reloc trees never need their own reloc tree. */ - if (!rc->create_reloc_tree || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) return 0; if (!trans->reloc_reserved) { @@ -884,7 +720,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, trans->block_rsv = rc->block_rsv; clear_rsv = 1; } - reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + reloc_root = create_reloc_root(trans, root, btrfs_root_id(root)); if (clear_rsv) trans->block_rsv = rsv; if (IS_ERR(reloc_root)) @@ -926,7 +762,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, btrfs_grab_root(reloc_root); /* root->reloc_root will stay until current relocation finished */ - if (fs_info->reloc_ctl->merge_reloc_tree && + if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); /* @@ -951,60 +787,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } /* - * helper to find first cached inode with inode number >= objectid - * in a subvolume - */ -static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) -{ - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - return inode; - } - - objectid = btrfs_ino(entry) + 1; - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); - return NULL; -} - -/* * get new location of data */ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, @@ -1020,7 +802,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, if (!path) return -ENOMEM; - bytenr -= BTRFS_I(reloc_inode)->index_cnt; + bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) @@ -1064,7 +846,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; u64 parent; u64 bytenr; u64 new_bytenr = 0; @@ -1074,13 +856,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, u32 i; int ret = 0; int first = 1; - int dirty = 0; if (rc->stage != UPDATE_DATA_PTRS) return 0; /* reloc trees always use full backref */ - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) parent = leaf->start; else parent = 0; @@ -1109,15 +890,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, * if we are modifying block in fs tree, wait for read_folio * to complete and drop the extent cache */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { if (first) { - inode = find_next_inode(root, key.objectid); + inode = btrfs_find_first_inode(root, key.objectid); first = 0; - } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(BTRFS_I(inode)); - inode = find_next_inode(root, key.objectid); + } else if (inode && btrfs_ino(inode) < key.objectid) { + btrfs_add_delayed_iput(inode); + inode = btrfs_find_first_inode(root, key.objectid); } - if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + if (inode && btrfs_ino(inode) == key.objectid) { struct extent_state *cached_state = NULL; end = key.offset + @@ -1126,16 +907,20 @@ int replace_file_extents(struct btrfs_trans_handle *trans, fs_info->sectorsize)); WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - &cached_state); - if (!ret) + /* Take mmap lock to serialize with reflinks. */ + if (!down_read_trylock(&inode->i_mmap_lock)) continue; + ret = try_lock_extent(&inode->io_tree, key.offset, + end, &cached_state); + if (!ret) { + up_read(&inode->i_mmap_lock); + continue; + } - btrfs_drop_extent_map_range(BTRFS_I(inode), - key.offset, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, &cached_state); + btrfs_drop_extent_map_range(inode, key.offset, end, true); + unlock_extent(&inode->io_tree, key.offset, end, + &cached_state); + up_read(&inode->i_mmap_lock); } } @@ -1150,35 +935,38 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - num_bytes, parent, root->root_key.objectid); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset, - root->root_key.objectid, false); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = num_bytes; + ref.parent = parent; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_header_owner(leaf); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, parent, root->root_key.objectid); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset, - root->root_key.objectid, false); + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = parent; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_header_owner(leaf); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } } - if (dirty) - btrfs_mark_buffer_dirty(trans, leaf); if (inode) - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); return ret; } @@ -1224,8 +1012,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, int ret; int slot; - ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1358,7 +1146,7 @@ again: * The real subtree rescan is delayed until we have new * CoW on the subtree root node before transaction commit. */ - ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + ret = btrfs_qgroup_add_swapped_blocks(dest, rc->block_group, parent, slot, path->nodes[level], path->slots[level], last_snapshot); @@ -1369,28 +1157,32 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(trans, path->nodes[level]); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, - blocksize, path->nodes[level]->start, - src->root_key.objectid); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, - 0, true); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = old_bytenr; + ref.num_bytes = blocksize; + ref.parent = path->nodes[level]->start; + ref.owning_root = btrfs_root_id(src); + ref.ref_root = btrfs_root_id(src); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - blocksize, 0, dest->root_key.objectid); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, - true); + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = blocksize; + ref.parent = 0; + ref.owning_root = btrfs_root_id(dest); + ref.ref_root = btrfs_root_id(dest); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1398,10 +1190,13 @@ again: } /* We don't know the real owning_root, use 0. */ - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, - blocksize, path->nodes[level]->start, 0); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, - 0, true); + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = blocksize; + ref.parent = path->nodes[level]->start; + ref.owning_root = 0; + ref.ref_root = btrfs_root_id(src); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1409,10 +1204,13 @@ again: } /* We don't know the real owning_root, use 0. */ - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, - blocksize, 0, 0); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, - 0, true); + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = old_bytenr; + ref.num_bytes = blocksize; + ref.parent = 0; + ref.owning_root = 0; + ref.ref_root = btrfs_root_id(dest); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1520,7 +1318,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, const struct btrfs_key *max_key) { struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; u64 objectid; u64 start, end; u64 ino; @@ -1530,23 +1328,24 @@ static int invalidate_extent_cache(struct btrfs_root *root, struct extent_state *cached_state = NULL; cond_resched(); - iput(inode); + if (inode) + iput(&inode->vfs_inode); if (objectid > max_key->objectid) break; - inode = find_next_inode(root, objectid); + inode = btrfs_find_first_inode(root, objectid); if (!inode) break; - ino = btrfs_ino(BTRFS_I(inode)); + ino = btrfs_ino(inode); if (ino > max_key->objectid) { - iput(inode); + iput(&inode->vfs_inode); break; } objectid = ino + 1; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->vfs_inode.i_mode)) continue; if (unlikely(min_key->objectid == ino)) { @@ -1579,9 +1378,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_drop_extent_map_range(inode, start, end, true); + unlock_extent(&inode->io_tree, start, end, &cached_state); } return 0; } @@ -1616,7 +1415,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans, int ret; /* @root must be a subvolume tree root with a valid reloc tree */ - ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); ASSERT(reloc_root); reloc_root_item = &reloc_root->root_item; @@ -1645,7 +1444,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, reloc_dirty_list) { - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; @@ -1774,7 +1573,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * btrfs_update_reloc_root() and update our root item * appropriately. */ - reloc_root->last_trans = trans->transid; + btrfs_set_root_last_trans(reloc_root, trans->transid); trans->block_rsv = rc->block_rsv; replaced = 0; @@ -1920,13 +1719,13 @@ again: if (root->reloc_root) { btrfs_err(fs_info, "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", - root->root_key.objectid, - root->reloc_root->root_key.objectid, + btrfs_root_id(root), + btrfs_root_id(root->reloc_root), root->reloc_root->root_key.type, root->reloc_root->root_key.offset, btrfs_root_generation( &root->reloc_root->root_item), - reloc_root->root_key.objectid, + btrfs_root_id(reloc_root), reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( @@ -1934,8 +1733,8 @@ again: } else { btrfs_err(fs_info, "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", - root->root_key.objectid, - reloc_root->root_key.objectid, + btrfs_root_id(root), + btrfs_root_id(reloc_root), reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( @@ -2117,7 +1916,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root; int ret; - if (reloc_root->last_trans == trans->transid) + if (btrfs_get_root_last_trans(reloc_root) == trans->transid) return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); @@ -2159,100 +1958,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, int index = 0; int ret; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; + next = walk_up_backref(node, edges, &index); + root = next->root; - /* - * If there is no root, then our references for this block are - * incomplete, as we should be able to walk all the way up to a - * block that is owned by a root. - * - * This path is only for SHAREABLE roots, so if we come upon a - * non-SHAREABLE root then we have backrefs that resolve - * improperly. - * - * Both of these cases indicate file system corruption, or a bug - * in the backref walking code. - */ - if (!root) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu doesn't have a backref path ending in a root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu has multiple refs with one ending in a non-shareable root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = record_reloc_root_in_trans(trans, root); - if (ret) - return ERR_PTR(ret); - break; - } + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a block + * that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve improperly. + * + * Both of these cases indicate file system corruption, or a bug in the + * backref walking code. + */ + if (unlikely(!root)) { + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } - ret = btrfs_record_root_in_trans(trans, root); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); - root = root->reloc_root; - - /* - * We could have raced with another thread which failed, so - * root->reloc_root may not be set, return ENOENT in this case. - */ - if (!root) - return ERR_PTR(-ENOENT); + goto found; + } - if (next->new_bytenr != root->node->start) { - /* - * We just created the reloc root, so we shouldn't have - * ->new_bytenr set and this shouldn't be in the changed - * list. If it is then we have multiple roots pointing - * at the same bytenr which indicates corruption, or - * we've made a mistake in the backref walking code. - */ - ASSERT(next->new_bytenr == 0); - ASSERT(list_empty(&next->list)); - if (next->new_bytenr || !list_empty(&next->list)) { - btrfs_err(trans->fs_info, - "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", - node->bytenr, next->bytenr); - return ERR_PTR(-EUCLEAN); - } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + root = root->reloc_root; - next->new_bytenr = root->node->start; - btrfs_put_root(next->root); - next->root = btrfs_grab_root(root); - ASSERT(next->root); - list_add_tail(&next->list, - &rc->backref_cache.changed); - mark_block_processed(rc, next); - break; - } + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); - WARN_ON(1); - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - if (!root) { + if (next->new_bytenr) { /* - * This can happen if there's fs corruption or if there's a bug - * in the backref lookup code. + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set yet. If it is then we have multiple roots + * pointing at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. */ - ASSERT(0); - return ERR_PTR(-ENOENT); + ASSERT(next->new_bytenr == 0); + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); } + next->new_bytenr = root->node->start; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); + mark_block_processed(rc, next); +found: next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2299,7 +2070,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return root; - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) fs_root = root; if (next != node) @@ -2315,9 +2086,8 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) return fs_root; } -static noinline_for_stack -u64 calcu_metadata_size(struct reloc_control *rc, - struct btrfs_backref_node *node, int reserve) +static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, + struct btrfs_backref_node *node) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_backref_node *next = node; @@ -2326,12 +2096,12 @@ u64 calcu_metadata_size(struct reloc_control *rc, u64 num_bytes = 0; int index = 0; - BUG_ON(reserve && node->processed); + BUG_ON(node->processed); while (next) { cond_resched(); while (1) { - if (next->processed && (reserve || next != node)) + if (next->processed) break; num_bytes += fs_info->nodesize; @@ -2349,17 +2119,11 @@ u64 calcu_metadata_size(struct reloc_control *rc, return num_bytes; } -static int reserve_metadata_space(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_backref_node *node) +static int refill_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, u64 num_bytes) { - struct btrfs_root *root = rc->extent_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u64 num_bytes; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - u64 tmp; - - num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2372,7 +2136,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) tmp <<= 1; /* @@ -2390,6 +2155,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, return 0; } +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + u64 num_bytes; + + num_bytes = calcu_metadata_size(rc, node) * 2; + return refill_metadata_space(trans, rc, num_bytes); +} + /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. @@ -2422,8 +2197,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { - struct btrfs_ref ref = { 0 }; - cond_resched(); upper = edge->node[UPPER]; @@ -2511,19 +2284,23 @@ static int do_relocation(struct btrfs_trans_handle *trans, */ ASSERT(node->eb == eb); } else { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = node->eb->start, + .num_bytes = blocksize, + .parent = upper->eb->start, + .owning_root = btrfs_header_owner(upper->eb), + .ref_root = btrfs_header_owner(upper->eb), + }; + btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); btrfs_set_node_ptr_generation(upper->eb, slot, trans->transid); btrfs_mark_buffer_dirty(trans, upper->eb); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - node->eb->start, blocksize, - upper->eb->start, - btrfs_header_owner(upper->eb)); btrfs_init_tree_ref(&ref, node->level, - btrfs_header_owner(upper->eb), - root->root_key.objectid, false); + btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); if (!ret) ret = btrfs_drop_subtree(trans, root, eb, @@ -2542,7 +2319,7 @@ next: if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); - list_move_tail(&node->list, &rc->backref_cache.changed); + list_del_init(&node->list); node->pending = 0; } @@ -2705,8 +2482,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, /* * This block was the root block of a root, and this is * the first time we're processing the block and thus it - * should not have had the ->new_bytenr modified and - * should have not been included on the changed list. + * should not have had the ->new_bytenr modified. * * However in the case of corruption we could have * multiple refs pointing to the same block improperly, @@ -2716,8 +2492,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); - ASSERT(list_empty(&node->list)); - if (node->new_bytenr || !list_empty(&node->list)) { + if (node->new_bytenr) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2740,17 +2515,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_put_root(node->root); node->root = btrfs_grab_root(root); ASSERT(node->root); - list_add_tail(&node->list, &rc->backref_cache.changed); } else { - path->lowest_level = node->level; - if (root == root->fs_info->chunk_root) - btrfs_reserve_chunk_metadata(trans, false); - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(path); - if (root == root->fs_info->chunk_root) - btrfs_trans_release_chunk_metadata(trans); - if (ret > 0) - ret = 0; + btrfs_err(root->fs_info, + "bytenr %llu resolved to a non-shareable root", + node->bytenr); + ret = -EUCLEAN; + goto out; } if (!ret) update_processed_blocks(rc, node); @@ -2758,11 +2528,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) + if (ret || node->level == 0) btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } +static int relocate_cowonly_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct tree_block *block, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + u64 num_bytes; + int nr_levels; + int ret; + + root = btrfs_get_fs_root(fs_info, block->owner, true); + if (IS_ERR(root)) + return PTR_ERR(root); + + nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; + + num_bytes = fs_info->nodesize * nr_levels; + ret = refill_metadata_space(trans, rc, num_bytes); + if (ret) { + btrfs_put_root(root); + return ret; + } + path->lowest_level = block->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); + path->lowest_level = 0; + btrfs_release_path(path); + + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); + if (ret > 0) + ret = 0; + btrfs_put_root(root); + + return ret; +} + /* * relocate a list of blocks */ @@ -2775,12 +2584,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct tree_block *block; struct tree_block *next; - int ret; - int err = 0; + int ret = 0; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out_free_blocks; } @@ -2795,46 +2603,58 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Get first keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) { - err = get_tree_block_key(fs_info, block); - if (err) + ret = get_tree_block_key(fs_info, block); + if (ret) goto out_free_path; } } /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + /* + * For COWonly blocks, or the data reloc tree, we only need to + * COW down to the block, there's no need to generate a backref + * tree. + */ + if (block->owner && + (!is_fstree(block->owner) || + block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { + ret = relocate_cowonly_block(trans, rc, block, path); + if (ret) + break; + continue; + } + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { - err = PTR_ERR(node); + ret = PTR_ERR(node); goto out; } ret = relocate_tree_block(trans, rc, node, &block->key, path); - if (ret < 0) { - err = ret; + if (ret < 0) break; - } } out: - err = finish_pending_nodes(trans, rc, path, err); + ret = finish_pending_nodes(trans, rc, path, ret); out_free_path: btrfs_free_path(path); out_free_blocks: free_block_list(blocks); - return err; + return ret; } -static noinline_for_stack int prealloc_file_extent_cluster( - struct btrfs_inode *inode, - const struct file_extent_cluster *cluster) +static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) { + const struct file_extent_cluster *cluster = &rc->cluster; + struct btrfs_inode *inode = BTRFS_I(rc->data_inode); u64 alloc_hint = 0; u64 start; u64 end; - u64 offset = inode->index_cnt; + u64 offset = inode->reloc_block_group_start; u64 num_bytes; int nr; int ret = 0; @@ -2849,7 +2669,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * btrfs_do_readpage() call of previously relocated file cluster. * * If the current cluster starts in the above range, btrfs_do_readpage() - * will skip the read, and relocate_one_page() will later writeback + * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). @@ -2858,7 +2678,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; - struct page *page; + struct folio *folio; ASSERT(sectorsize < PAGE_SIZE); ASSERT(IS_ALIGNED(i_size, sectorsize)); @@ -2889,16 +2709,16 @@ static noinline_for_stack int prealloc_file_extent_cluster( clear_extent_bits(&inode->io_tree, i_size, round_up(i_size, PAGE_SIZE) - 1, EXTENT_UPTODATE); - page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we * will re-read the whole page anyway. */ - if (page) { - btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size, + if (!IS_ERR(folio)) { + btrfs_subpage_clear_uptodate(fs_info, folio, i_size, round_up(i_size, PAGE_SIZE) - i_size); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } @@ -2936,11 +2756,14 @@ static noinline_for_stack int prealloc_file_extent_cluster( return ret; } -static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, - u64 start, u64 end, u64 block_start) +static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) { + struct btrfs_inode *inode = BTRFS_I(rc->data_inode); struct extent_map *em; struct extent_state *cached_state = NULL; + u64 offset = inode->reloc_block_group_start; + u64 start = rc->cluster.start - offset; + u64 end = rc->cluster.end - offset; int ret = 0; em = alloc_extent_map(); @@ -2949,13 +2772,14 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->start = start; em->len = end + 1 - start; - em->block_len = em->len; - em->block_start = block_start; + em->disk_bytenr = rc->cluster.start; + em->disk_num_bytes = em->len; + em->ram_bytes = em->len; em->flags |= EXTENT_FLAG_PINNED; - lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); - ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + lock_extent(&inode->io_tree, start, end, &cached_state); + ret = btrfs_replace_extent_map_range(inode, em, false); + unlock_extent(&inode->io_tree, start, end, &cached_state); free_extent_map(em); return ret; @@ -2983,68 +2807,91 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, return cluster->boundary[cluster_nr + 1] - 1; } -static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, - const struct file_extent_cluster *cluster, - int *cluster_nr, unsigned long page_index) +static int relocate_one_folio(struct reloc_control *rc, + struct file_ra_state *ra, + int *cluster_nr, unsigned long index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 offset = BTRFS_I(inode)->index_cnt; + const struct file_extent_cluster *cluster = &rc->cluster; + struct inode *inode = rc->data_inode; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + u64 offset = BTRFS_I(inode)->reloc_block_group_start; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - struct page *page; - u64 page_start; - u64 page_end; + struct folio *folio; + u64 folio_start; + u64 folio_end; u64 cur; int ret; + const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); - ASSERT(page_index <= last_index); - page = find_lock_page(inode->i_mapping, page_index); - if (!page) { - page_cache_sync_readahead(inode->i_mapping, ra, NULL, - page_index, last_index + 1 - page_index); - page = find_or_create_page(inode->i_mapping, page_index, mask); - if (!page) - return -ENOMEM; + ASSERT(index <= last_index); +again: + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) { + + /* + * On relocation we're doing readahead on the relocation inode, + * but if the filesystem is backed by a RAID stripe tree we can + * get ENOENT (e.g. due to preallocated extents not being + * mapped in the RST) from the lookup. + * + * But readahead doesn't handle the error and submits invalid + * reads to the device, causing a assertion failures. + */ + if (!use_rst) + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + index, last_index + 1 - index); + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) + return PTR_ERR(folio); } - if (PageReadahead(page)) + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, - page_folio(page), page_index, - last_index + 1 - page_index); + folio, last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { ret = -EIO; - goto release_page; + goto release_folio; + } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; } } /* - * We could have lost page private when we dropped the lock to read the - * page above, make sure we set_page_extent_mapped here so we have any + * We could have lost folio private when we dropped the lock to read the + * folio above, make sure we set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) - goto release_page; + goto release_folio; - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + folio_start = folio_pos(folio); + folio_end = folio_start + PAGE_SIZE - 1; /* * Start from the cluster, as for subpage case, the cluster can start - * inside the page. + * inside the folio. */ - cur = max(page_start, cluster->boundary[*cluster_nr] - offset); - while (cur <= page_end) { + cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= folio_end) { struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; - u64 clamped_start = max(page_start, extent_start); - u64 clamped_end = min(page_end, extent_end); + u64 clamped_start = max(folio_start, extent_start); + u64 clamped_end = min(folio_end, extent_end); u32 clamped_len = clamped_end + 1 - clamped_start; /* Reserve metadata for this range */ @@ -3052,7 +2899,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, clamped_len, false); if (ret) - goto release_page; + goto release_folio; /* Mark the range delalloc and dirty for later writeback */ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, @@ -3068,20 +2915,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); - goto release_page; + goto release_folio; } - btrfs_folio_set_dirty(fs_info, page_folio(page), - clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); /* - * Set the boundary if it's inside the page. + * Set the boundary if it's inside the folio. * Data relocation requires the destination extents to have the * same size as the source. * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ - if (in_range(cluster->boundary[*cluster_nr] - offset, - page_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + @@ -3104,8 +2949,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, break; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); @@ -3113,16 +2958,17 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, ret = -ECANCELED; return ret; -release_page: - unlock_page(page); - put_page(page); +release_folio: + folio_unlock(folio); + folio_put(folio); return ret; } -static int relocate_file_extent_cluster(struct inode *inode, - const struct file_extent_cluster *cluster) +static int relocate_file_extent_cluster(struct reloc_control *rc) { - u64 offset = BTRFS_I(inode)->index_cnt; + struct inode *inode = rc->data_inode; + const struct file_extent_cluster *cluster = &rc->cluster; + u64 offset = BTRFS_I(inode)->reloc_block_group_start; unsigned long index; unsigned long last_index; struct file_ra_state *ra; @@ -3136,21 +2982,20 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); + ret = prealloc_file_extent_cluster(rc); if (ret) goto out; file_ra_state_init(ra, inode->i_mapping); - ret = setup_relocation_extent_mapping(inode, cluster->start - offset, - cluster->end - offset, cluster->start); + ret = setup_relocation_extent_mapping(rc); if (ret) goto out; last_index = (cluster->end - offset) >> PAGE_SHIFT; for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) - ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); + ret = relocate_one_folio(rc, ra, &cluster_nr, index); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3158,15 +3003,16 @@ out: return ret; } -static noinline_for_stack int relocate_data_extent(struct inode *inode, - const struct btrfs_key *extent_key, - struct file_extent_cluster *cluster) +static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, + const struct btrfs_key *extent_key) { + struct inode *inode = rc->data_inode; + struct file_extent_cluster *cluster = &rc->cluster; int ret; struct btrfs_root *root = BTRFS_I(inode)->root; if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { - ret = relocate_file_extent_cluster(inode, cluster); + ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; @@ -3192,7 +3038,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode, * the cluster we need to relocate. */ root->relocation_src_root = cluster->owning_root; - ret = relocate_file_extent_cluster(inode, cluster); + ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; @@ -3211,7 +3057,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode, cluster->nr++; if (cluster->nr >= MAX_EXTENTS) { - ret = relocate_file_extent_cluster(inode, cluster); + ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; @@ -3405,7 +3251,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, if (inode) goto truncate; - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget(ino, root); if (IS_ERR(inode)) return -ENOENT; @@ -3714,11 +3560,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) break; } restart: - if (update_backref_cache(trans, &rc->backref_cache)) { - btrfs_end_transaction(trans); - trans = NULL; - continue; - } + if (rc->backref_cache.last_trans != trans->transid) + btrfs_backref_release_cache(&rc->backref_cache); + rc->backref_cache.last_trans = trans->transid; ret = find_next_extent(rc, path, &key); if (ret < 0) @@ -3780,8 +3624,7 @@ restart: if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = true; - ret = relocate_data_extent(rc->data_inode, - &key, &rc->cluster); + ret = relocate_data_extent(rc, &key); if (ret < 0) { err = ret; break; @@ -3810,8 +3653,7 @@ restart: } if (!err) { - ret = relocate_file_extent_cluster(rc->data_inode, - &rc->cluster); + ret = relocate_file_extent_cluster(rc); if (ret < 0) err = ret; } @@ -3880,7 +3722,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3927,7 +3768,7 @@ static noinline_for_stack struct inode *create_reloc_inode( struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; - int err = 0; + int ret = 0; root = btrfs_grab_root(fs_info->data_reloc_root); trans = btrfs_start_transaction(root, 6); @@ -3936,31 +3777,31 @@ static noinline_for_stack struct inode *create_reloc_inode( return ERR_CAST(trans); } - err = btrfs_get_free_objectid(root, &objectid); - if (err) + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) goto out; - err = __insert_orphan_inode(trans, root, objectid); - if (err) + ret = __insert_orphan_inode(trans, root, objectid); + if (ret) goto out; - inode = btrfs_iget(fs_info->sb, objectid, root); + inode = btrfs_iget(objectid, root); if (IS_ERR(inode)) { delete_orphan_inode(trans, root, objectid); - err = PTR_ERR(inode); + ret = PTR_ERR(inode); inode = NULL; goto out; } - BTRFS_I(inode)->index_cnt = group->start; + BTRFS_I(inode)->reloc_block_group_start = group->start; - err = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (err) { + if (ret) { iput(inode); - inode = ERR_PTR(err); + inode = ERR_PTR(ret); } return inode; } @@ -4038,15 +3879,13 @@ static void free_reloc_control(struct reloc_control *rc) /* * Print the block group being relocated */ -static void describe_relocation(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group) +static void describe_relocation(struct btrfs_block_group *block_group) { char buf[128] = {'\0'}; btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); - btrfs_info(fs_info, - "relocating block group %llu flags %s", + btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", block_group->start, buf); } @@ -4154,13 +3993,11 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - describe_relocation(fs_info, rc->block_group); + describe_relocation(rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); - btrfs_wait_ordered_roots(fs_info, U64_MAX, - rc->block_group->start, - rc->block_group->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group); ret = btrfs_zone_finish(rc->block_group); WARN_ON(ret && ret != -EAGAIN); @@ -4185,7 +4022,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) * out of the loop if we hit an error. */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - ret = btrfs_wait_ordered_range(rc->data_inode, 0, + ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, (u64)-1); if (ret) err = ret; @@ -4257,8 +4094,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) struct extent_buffer *leaf; struct reloc_control *rc = NULL; struct btrfs_trans_handle *trans; - int ret; - int err = 0; + int ret2; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -4272,15 +4109,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) while (1) { ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (ret > 0) { if (path->slots[0] == 0) break; path->slots[0]--; } + ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); @@ -4291,7 +4127,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(reloc_root)) { - err = PTR_ERR(reloc_root); + ret = PTR_ERR(reloc_root); goto out; } @@ -4303,15 +4139,12 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) reloc_root->root_key.offset, false); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); - if (ret != -ENOENT) { - err = ret; + if (ret != -ENOENT) goto out; - } ret = mark_garbage_root(reloc_root); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } + ret = 0; } else { btrfs_put_root(fs_root); } @@ -4329,15 +4162,13 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) rc = alloc_reloc_control(fs_info); if (!rc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } ret = reloc_chunk_start(fs_info); - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end; - } rc->extent_root = btrfs_extent_root(fs_info, 0); @@ -4345,7 +4176,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_unset; } @@ -4365,15 +4196,15 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); + ret = PTR_ERR(fs_root); list_add_tail(&reloc_root->root_list, &reloc_roots); btrfs_end_transaction(trans); goto out_unset; } - err = __add_reloc_root(reloc_root); - ASSERT(err != -EEXIST); - if (err) { + ret = __add_reloc_root(reloc_root); + ASSERT(ret != -EEXIST); + if (ret) { list_add_tail(&reloc_root->root_list, &reloc_roots); btrfs_put_root(fs_root); btrfs_end_transaction(trans); @@ -4383,8 +4214,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_root); } - err = btrfs_commit_transaction(trans); - if (err) + ret = btrfs_commit_transaction(trans); + if (ret) goto out_unset; merge_reloc_roots(rc); @@ -4393,14 +4224,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_clean; } - err = btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); out_clean: - ret = clean_dirty_subvols(rc); - if (ret < 0 && !err) - err = ret; + ret2 = clean_dirty_subvols(rc); + if (ret2 < 0 && !ret) + ret = ret2; out_unset: unset_reloc_control(rc); out_end: @@ -4411,14 +4242,14 @@ out: btrfs_free_path(path); - if (err == 0) { + if (ret == 0) { /* cleanup orphan inode in data relocation tree */ fs_root = btrfs_grab_root(fs_info->data_reloc_root); ASSERT(fs_root); - err = btrfs_orphan_cleanup(fs_root); + ret = btrfs_orphan_cleanup(fs_root); btrfs_put_root(fs_root); } - return err; + return ret; } /* @@ -4429,18 +4260,20 @@ out: */ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 disk_bytenr = ordered->file_offset + inode->index_cnt; + u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); LIST_HEAD(list); int ret; ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, disk_bytenr + ordered->num_bytes - 1, - &list, 0, false); - if (ret) + &list, false); + if (ret < 0) { + btrfs_mark_ordered_extent_error(ordered); return ret; + } while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = @@ -4490,13 +4323,22 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, btrfs_root_last_snapshot(&root->root_item)) first_cow = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && - rc->create_reloc_tree) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) { WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. + */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + } btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); @@ -4584,8 +4426,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, } new_root = pending->snap; - reloc_root = create_reloc_root(trans, root->reloc_root, - new_root->root_key.objectid); + reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root)); if (IS_ERR(reloc_root)) return PTR_ERR(reloc_root); @@ -4597,10 +4438,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); - - if (rc->create_reloc_tree) - ret = clone_backref_node(trans, rc, root, reloc_root); - return ret; + return 0; } /* diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5fb60f2deb53..788c86d8633a 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -3,6 +3,15 @@ #ifndef BTRFS_RELOCATION_H #define BTRFS_RELOCATION_H +#include <linux/types.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ordered_extent; +struct btrfs_pending_snapshot; + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 603ad1459368..e22e6b06927a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -10,7 +10,6 @@ #include "messages.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "qgroup.h" #include "space-info.h" #include "accessors.h" @@ -82,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, if (ret > 0) goto out; } else { - BUG_ON(ret == 0); /* Logical error */ + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of the valid range. + */ + if (ret == 0) { + ret = -EUCLEAN; + goto out; + } if (path->slots[0] == 0) goto out; path->slots[0]--; @@ -142,8 +148,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret > 0) { btrfs_crit(fs_info, "unable to find root key (%llu %u %llu) in tree %llu", - key->objectid, key->type, key->offset, - root->root_key.objectid); + key->objectid, key->type, key->offset, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; @@ -192,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -323,8 +327,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; - - BUG_ON(ret != 0); + if (ret != 0) { + /* The root must exist but we did not find it by the key. */ + ret = -EUCLEAN; + goto out; + } ret = btrfs_del_item(trans, root, path); out: @@ -439,7 +446,6 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); @@ -539,13 +545,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } return ret; } - -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - u64 qgroup_to_release; - - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); - btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); -} diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 8b2c3859e464..8f5739e732b9 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,13 +3,21 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +#include <linux/types.h> + struct fscrypt_str; +struct extent_buffer; +struct btrfs_key; +struct btrfs_root; +struct btrfs_root_item; +struct btrfs_path; +struct btrfs_fs_info; +struct btrfs_block_rsv; +struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const struct fscrypt_str *name); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0123d2728923..531312efee8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -261,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0); + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false); if (ret < 0) goto error; @@ -838,7 +838,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); bbio = NULL; @@ -857,7 +857,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); } @@ -1012,6 +1012,7 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) struct btrfs_fs_info *fs_info = sctx->fs_info; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); + unsigned long repaired; int mirror; int i; @@ -1078,16 +1079,15 @@ out: * Submit the repaired sectors. For zoned case, we cannot do repair * in-place, but queue the bg to be relocated. */ - if (btrfs_is_zoned(fs_info)) { - if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap, + stripe->nr_sectors); + if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { + if (btrfs_is_zoned(fs_info)) { btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); - } else if (!sctx->readonly) { - unsigned long repaired; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - wait_scrub_stripe_io(stripe); + } else { + scrub_write_sectors(sctx, stripe, repaired, false); + wait_scrub_stripe_io(stripe); + } } scrub_stripe_report_errors(sctx, stripe); @@ -1390,8 +1390,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + btrfs_release_path(path); + return -EUCLEAN; + } - ASSERT(ret > 0); /* * Here we intentionally pass 0 as @min_objectid, as there could be * an extent item starting before @search_start. @@ -1534,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); @@ -1641,14 +1652,19 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } -static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) +static u32 stripe_length(const struct scrub_stripe *stripe) +{ + ASSERT(stripe->bg); + + return min(BTRFS_STRIPE_LEN, + stripe->bg->start + stripe->bg->length - stripe->logical); +} + +static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1670,7 +1686,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); bbio = NULL; } @@ -1681,20 +1697,34 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; - - io_stripe.is_scrub = true; + io_stripe.rst_search_commit_root = true; + stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; + /* + * For RST cases, we need to manually split the bbio to + * follow the RST boundary. + */ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &stripe_len, &bioc, &io_stripe, - &mirror); + &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); - if (err) { - btrfs_bio_end_io(bbio, - errno_to_blk_status(err)); - return; + if (err < 0) { + if (err != -ENODATA) { + /* + * Earlier btrfs_get_raid_extent_offset() + * returned -ENODATA, which means there's + * no entry for the corresponding range + * in the stripe tree. But if it's in + * the extent tree, then it's a preallocated + * extent and not an error. + */ + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + } + continue; } + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); @@ -1703,7 +1733,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1718,9 +1748,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1728,7 +1756,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { - scrub_submit_extent_sector_read(sctx, stripe); + scrub_submit_extent_sector_read(stripe); return; } @@ -1761,7 +1789,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, mirror = calc_next_mirror(mirror, num_copies); } - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } static bool stripe_has_metadata_error(struct scrub_stripe *stripe) @@ -1860,6 +1888,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) stripe = &sctx->stripes[i]; wait_scrub_stripe_io(stripe); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = stripe->physical + stripe_length(stripe); + spin_unlock(&sctx->stat_lock); scrub_reset_stripe(stripe); } out: @@ -1936,7 +1967,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent/csum paths, + * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. */ @@ -2085,7 +2116,6 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2093,7 +2123,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; u64 cur_logical = logical_start; - int ret; + int ret = 0; /* The range must be inside the bg */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); @@ -2128,7 +2158,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ + spin_lock(&sctx->stat_lock); sctx->stat.last_physical = physical + logical_length; + spin_unlock(&sctx->stat_lock); ret = 0; break; } @@ -2202,7 +2234,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + ret = scrub_simple_mirror(sctx, bg, cur_logical, BTRFS_STRIPE_LEN, device, cur_physical, mirror_num); if (ret) @@ -2236,7 +2268,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); @@ -2287,7 +2318,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; @@ -2325,6 +2356,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, stripe_logical += chunk_logical; ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, map, stripe_logical); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, + physical_end); + spin_unlock(&sctx->stat_lock); if (ret) goto out; goto next; @@ -2338,7 +2373,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. */ - ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; @@ -2346,14 +2381,8 @@ next: logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); - if (stop_loop) - sctx->stat.last_physical = - map->stripes[stripe_index].physical + dev_stripe_len; - else - sctx->stat.last_physical = physical; + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); - if (stop_loop) - break; } out: ret2 = flush_scrub_stripes(sctx); @@ -2430,19 +2459,15 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root, struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_trans_handle *trans; if (!btrfs_is_zoned(fs_info)) return 0; btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); - btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(root); } static noinline_for_stack @@ -2673,8 +2698,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ if (sctx->is_dev_replace) { btrfs_wait_nocow_writers(cache); - btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, - cache->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); } scrub_pause_off(fs_info); @@ -2805,7 +2829,17 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); + if (ret == -ENOENT) + break; + + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.super_errors++; + spin_unlock(&sctx->stat_lock); + continue; + } + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->commit_total_bytes) break; diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index 7639103ebf9d..f0df597b75c7 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -3,6 +3,12 @@ #ifndef BTRFS_SCRUB_H #define BTRFS_SCRUB_H +#include <linux/types.h> + +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_scrub_progress; + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e48a063ef085..f437138fefbc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,7 +25,6 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" -#include "xattr.h" #include "print-tree.h" #include "accessors.h" #include "dir-item.h" @@ -63,7 +62,7 @@ struct fs_path { /* * Average path length does not exceed 200 bytes, we'll have * better packing in the slab and higher chance to satisfy - * a allocation later during send. + * an allocation later during send. */ char pad[256]; }; @@ -347,8 +346,10 @@ struct name_cache_entry { u64 parent_gen; int ret; int need_later_update; + /* Name length without NUL terminator. */ int name_len; - char name[]; + /* Not NUL terminated. */ + char name[] __counted_by(name_len) __nonstring; }; /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ @@ -393,9 +394,8 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, btrfs_err(sctx->send_root->fs_info, "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", result_string, what, sctx->cmp_key->objectid, - sctx->send_root->root_key.objectid, - (sctx->parent_root ? - sctx->parent_root->root_key.objectid : 0)); + btrfs_root_id(sctx->send_root), + (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0)); } __maybe_unused @@ -777,7 +777,12 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) if (WARN_ON(!sctx->send_buf)) return -EINVAL; - BUG_ON(sctx->send_size); + if (unlikely(sctx->send_size != 0)) { + btrfs_err(sctx->send_root->fs_info, + "send: command header buffer not empty cmd %d offset %llu", + cmd, sctx->send_off); + return -EINVAL; + } sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; @@ -975,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) return ret; } -typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, - struct fs_path *p, - void *ctx); +typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_inode_ref or @@ -1002,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, u32 name_len; char *start; int ret = 0; - int num = 0; - int index; u64 dir; unsigned long name_off; unsigned long elem_size; @@ -1038,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iref = (struct btrfs_inode_ref *)(ptr + cur); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); - index = btrfs_inode_ref_index(eb, iref); dir = found_key->offset; } else { extref = (struct btrfs_inode_extref *)(ptr + cur); name_len = btrfs_inode_extref_name_len(eb, extref); name_off = (unsigned long)&extref->name; - index = btrfs_inode_extref_index(eb, extref); dir = btrfs_inode_extref_parent(eb, extref); } @@ -1070,7 +1069,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, ret = PTR_ERR(start); goto out; } - BUG_ON(start < p->buf); + if (unlikely(start < p->buf)) { + btrfs_err(root->fs_info, + "send: path ref buffer underflow for key (%llu %u %llu)", + found_key->objectid, + found_key->type, + found_key->offset); + ret = -EINVAL; + goto out; + } } p->start = start; } else { @@ -1081,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } cur += elem_size + name_len; - ret = iterate(num, dir, index, p, ctx); + ret = iterate(dir, p, ctx); if (ret) goto out; - num++; } out: @@ -1125,7 +1131,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater - * then the page size, attempt to increase the buffer. Typically xattr + * than the page size, attempt to increase the buffer. Typically xattr * values are small. */ buf_len = PATH_MAX; @@ -1214,8 +1220,7 @@ out: return ret; } -static int __copy_first_ref(int num, u64 dir, int index, - struct fs_path *p, void *ctx) +static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) { int ret; struct fs_path *pt = ctx; @@ -1304,9 +1309,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) u64 root = (u64)(uintptr_t)key; const struct clone_root *cr = elt; - if (root < cr->root->root_key.objectid) + if (root < btrfs_root_id(cr->root)) return -1; - if (root > cr->root->root_key.objectid) + if (root > btrfs_root_id(cr->root)) return 1; return 0; } @@ -1316,9 +1321,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) const struct clone_root *cr1 = e1; const struct clone_root *cr2 = e2; - if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) + if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root)) return -1; - if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) + if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root)) return 1; return 0; } @@ -1406,7 +1411,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + if (sctx->backref_cache.size == 0) return false; /* @@ -1504,7 +1509,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + if (sctx->backref_cache.size == 1) sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } @@ -1766,7 +1771,7 @@ static int read_symlink(struct btrfs_root *root, */ btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", - ino, root->root_key.objectid); + ino, btrfs_root_id(root)); ret = -EIO; goto out; } @@ -2377,7 +2382,7 @@ out_cache: /* * Store the result of the lookup in the name cache. */ - nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL); + nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); if (!nce) { ret = -ENOMEM; goto out; @@ -2389,7 +2394,7 @@ out_cache: nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); nce->ret = ret; - strcpy(nce->name, dest->start); + memcpy(nce->name, dest->start, nce->name_len); if (ino < sctx->send_progress) nce->need_later_update = 0; @@ -2520,7 +2525,7 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; } - key.objectid = send_root->root_key.objectid; + key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; @@ -2536,7 +2541,7 @@ static int send_subvol_begin(struct send_ctx *sctx) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || - key.objectid != send_root->root_key.objectid) { + key.objectid != btrfs_root_id(send_root)) { ret = -ENOENT; goto out; } @@ -2809,8 +2814,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) static int trim_dir_utimes_cache(struct send_ctx *sctx) { - while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > - SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) { struct btrfs_lru_cache_entry *lru; int ret; @@ -3756,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key di_key; @@ -3785,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); if (!di) { ret = 0; @@ -4182,7 +4185,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * This should never happen as the root dir always has the same ref * which is always '..' */ - BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) { + btrfs_err(fs_info, + "send: unexpected inode %llu in process_recorded_refs()", + sctx->cur_ino); + ret = -EINVAL; + goto out; + } valid_path = fs_path_alloc(); if (!valid_path) { @@ -4690,8 +4699,7 @@ out: return ret; } -static int record_new_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4720,8 +4728,7 @@ out: return ret; } -static int record_deleted_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -5172,11 +5179,10 @@ out: static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct inode *inode; struct fs_path *p; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root); + inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5257,10 +5263,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct page *page; + struct folio *folio; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); + struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; ret = put_data_header(sctx, len); @@ -5273,44 +5280,50 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(sctx->cur_inode->i_mapping, index); - if (!page) { - page_cache_sync_readahead(sctx->cur_inode->i_mapping, +again: + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(mapping, &sctx->ra, NULL, index, last_index + 1 - index); - page = find_or_create_page(sctx->cur_inode->i_mapping, - index, GFP_KERNEL); - if (!page) { - ret = -ENOMEM; + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; } } - if (PageReadahead(page)) - page_cache_async_readahead(sctx->cur_inode->i_mapping, - &sctx->ra, NULL, page_folio(page), - index, last_index + 1 - index); + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio)) + page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, + last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", - page_offset(page), sctx->cur_ino, - sctx->send_root->root_key.objectid); - put_page(page); + folio_pos(folio), sctx->cur_ino, + btrfs_root_id(sctx->send_root)); + folio_put(folio); ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } - memcpy_from_page(sctx->send_buf + sctx->send_size, page, - pg_offset, cur_len); - unlock_page(page); - put_page(page); + memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, + pg_offset, cur_len); + folio_unlock(folio); + folio_put(folio); index++; pg_offset = 0; len -= cur_len; @@ -5371,7 +5384,7 @@ static int send_clone(struct send_ctx *sctx, btrfs_debug(sctx->send_root->fs_info, "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, clone_root->root->root_key.objectid, + offset, len, btrfs_root_id(clone_root->root), clone_root->ino, clone_root->offset); p = fs_path_alloc(); @@ -5533,7 +5546,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5600,7 +5613,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, u32 crc; int ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5659,10 +5672,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + - (data_offset >> PAGE_SHIFT)); + (data_offset >> PAGE_SHIFT), + NULL); if (ret) goto out; @@ -5729,7 +5743,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, if (sctx->cur_inode == NULL) { struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); + sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(sctx->cur_inode)) { int err = PTR_ERR(sctx->cur_inode); @@ -6140,26 +6154,73 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; u64 offset = key->offset; u64 end; - u64 bs = sctx->send_root->fs_info->sb->s_blocksize; + u64 bs = sctx->send_root->fs_info->sectorsize; + struct btrfs_file_extent_item *ei; + u64 disk_byte; + u64 data_offset; + u64 num_bytes; + struct btrfs_inode_info info = { 0 }; end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); if (offset >= end) return 0; - if (clone_root && IS_ALIGNED(end, bs)) { - struct btrfs_file_extent_item *ei; - u64 disk_byte; - u64 data_offset; + num_bytes = end - offset; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); - data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, path, clone_root, disk_byte, - data_offset, offset, end - offset); - } else { - ret = send_extent_data(sctx, path, offset, end - offset); + if (!clone_root) + goto write_data; + + if (IS_ALIGNED(end, bs)) + goto clone_data; + + /* + * If the extent end is not aligned, we can clone if the extent ends at + * the i_size of the inode and the clone range ends at the i_size of the + * source inode, otherwise the clone operation fails with -EINVAL. + */ + if (end != sctx->cur_inode_size) + goto write_data; + + ret = get_inode_info(clone_root->root, clone_root->ino, &info); + if (ret < 0) + return ret; + + if (clone_root->offset + num_bytes == info.size) { + /* + * The final size of our file matches the end offset, but it may + * be that its current size is larger, so we have to truncate it + * to any value between the start offset of the range and the + * final i_size, otherwise the clone operation is invalid + * because it's unaligned and it ends before the current EOF. + * We do this truncate to the final i_size when we finish + * processing the inode, but it's too late by then. And here we + * truncate to the start offset of the range because it's always + * sector size aligned while if it were the final i_size it + * would result in dirtying part of a page, filling part of a + * page with zeroes and then having the clone operation at the + * receiver trigger IO and wait for it due to the dirty page. + */ + if (sctx->parent_root != NULL) { + ret = send_truncate(sctx, sctx->cur_ino, + sctx->cur_inode_gen, offset); + if (ret < 0) + return ret; + } + goto clone_data; } + +write_data: + ret = send_extent_data(sctx, path, offset, num_bytes); + sctx->cur_inode_next_write_offset = end; + return ret; + +clone_data: + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset, + num_bytes); sctx->cur_inode_next_write_offset = end; return ret; } @@ -6458,21 +6519,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) return 0; - if (sctx->cur_inode_last_extent == (u64)-1) { - ret = get_last_extent(sctx, key->offset - 1); - if (ret) - return ret; - } - - if (path->slots[0] == 0 && - sctx->cur_inode_last_extent < key->offset) { - /* - * We might have skipped entire leafs that contained only - * file extent items for our current inode. These leafs have - * a generation number smaller (older) than the one in the - * current leaf and the leaf our last extent came from, and - * are located between these 2 leafs. - */ + /* + * Get last extent's end offset (exclusive) if we haven't determined it + * yet (we're processing the first file extent item that is new), or if + * we're at the first slot of a leaf and the last extent's end is less + * than the current extent's offset, because we might have skipped + * entire leaves that contained only file extent items for our current + * inode. These leaves have a generation number smaller (older) than the + * one in the current leaf and the leaf our last extent came from, and + * are located between these 2 leaves. + */ + if ((sctx->cur_inode_last_extent == (u64)-1) || + (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) { ret = get_last_extent(sctx, key->offset - 1); if (ret) return ret; @@ -7128,13 +7186,11 @@ static int changed_extent(struct send_ctx *sctx, static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; - if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) sctx->cur_inode_needs_verity = true; } - return ret; + return 0; } static int dir_changed(struct send_ctx *sctx, u64 dir) @@ -7203,7 +7259,7 @@ static int changed_cb(struct btrfs_path *left_path, enum btrfs_compare_tree_result result, struct send_ctx *sctx) { - int ret = 0; + int ret; /* * We can not hold the commit root semaphore here. This is because in @@ -7263,7 +7319,6 @@ static int changed_cb(struct btrfs_path *left_path, return 0; } result = BTRFS_COMPARE_TREE_CHANGED; - ret = 0; } sctx->left_path = left_path; @@ -7323,7 +7378,7 @@ static int search_key_again(const struct send_ctx *sctx, "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", key->objectid, key->type, key->offset, (root == sctx->parent_root ? "parent" : "send"), - root->root_key.objectid, path->lowest_level, + btrfs_root_id(root), path->lowest_level, path->slots[path->lowest_level]); return -EUCLEAN; } @@ -7429,8 +7484,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_done = 0; lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + ASSERT(*level != 0); - BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); @@ -7984,34 +8039,18 @@ out: */ static int ensure_commit_roots_uptodate(struct send_ctx *sctx) { - int i; - struct btrfs_trans_handle *trans = NULL; - -again: - if (sctx->parent_root && - sctx->parent_root->node != sctx->parent_root->commit_root) - goto commit_trans; - - for (i = 0; i < sctx->clone_roots_cnt; i++) - if (sctx->clone_roots[i].root->node != - sctx->clone_roots[i].root->commit_root) - goto commit_trans; - - if (trans) - return btrfs_end_transaction(trans); + struct btrfs_root *root = sctx->parent_root; - return 0; + if (root && root->node != root->commit_root) + return btrfs_commit_current_transaction(root); -commit_trans: - /* Use any root, all fs roots will get their commit roots updated. */ - if (!trans) { - trans = btrfs_join_transaction(sctx->send_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - goto again; + for (int i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; + if (root->node != root->commit_root) + return btrfs_commit_current_transaction(root); } - return btrfs_commit_transaction(trans); + return 0; } /* @@ -8032,7 +8071,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; - btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); } for (i = 0; i < sctx->clone_roots_cnt; i++) { @@ -8040,7 +8079,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; - btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); } return 0; @@ -8057,7 +8096,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) if (root->send_in_progress < 0) btrfs_err(root->fs_info, "send_in_progress unbalanced %d root %llu", - root->send_in_progress, root->root_key.objectid); + root->send_in_progress, btrfs_root_id(root)); spin_unlock(&root->root_item_lock); } @@ -8065,13 +8104,13 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) { btrfs_warn_rl(root->fs_info, "cannot use root %llu for send while deduplications on it are in progress (%d in progress)", - root->root_key.objectid, root->dedupe_in_progress); + btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = BTRFS_I(inode)->root; + struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -8091,7 +8130,20 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); - if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started. + */ + if (btrfs_root_dead(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + /* Userspace tools do the checks and warn the user if it's not RO. */ + if (!btrfs_root_readonly(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + if (send_root->dedupe_in_progress) { dedupe_in_progress_warn(send_root); spin_unlock(&send_root->root_item_lock); return -EAGAIN; @@ -8100,15 +8152,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) spin_unlock(&send_root->root_item_lock); /* - * Userspace tools do the checks and warn the user if it's - * not RO. - */ - if (!btrfs_root_readonly(send_root)) { - ret = -EPERM; - goto out; - } - - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside * access_ok. Also set an upper limit for allocation size so this can't @@ -8173,15 +8216,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } sctx->send_root = send_root; - /* - * Unlikely but possible, if the subvolume is marked for deletion but - * is slow to remove the directory entry, send can still be started - */ - if (btrfs_root_dead(sctx->send_root)) { - ret = -EPERM; - goto out; - } - sctx->clone_roots_cnt = arg->clone_sources_count; if (sctx->proto >= 2) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4f5509cb1803..9309886c5ea1 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -8,10 +8,15 @@ #define BTRFS_SEND_H #include <linux/types.h> +#include <linux/sizes.h> +#include <linux/align.h> + +struct btrfs_inode; +struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL #define BTRFS_SEND_STREAM_VERSION 3 #else #define BTRFS_SEND_STREAM_VERSION 2 @@ -25,9 +30,6 @@ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K #define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) -struct inode; -struct btrfs_ioctl_send_args; - enum btrfs_tlv_type { BTRFS_TLV_U8, BTRFS_TLV_U16, @@ -180,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3b54eb583474..a341d087567a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include "linux/spinlock.h" +#include <linux/minmax.h> #include "misc.h" #include "ctree.h" #include "space-info.h" @@ -9,10 +11,10 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -126,6 +128,14 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RESET_ZONES + * This state works only for the zoned mode. On the zoned mode, we cannot + * reuse once allocated then freed region until we reset the zone, due to + * the sequential write zone requirement. The RESET_ZONES state resets the + * zones of an unused block group and let us reuse the space. The reusing + * is faster than removing the block group and allocating another block + * group on the zones. + * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when @@ -162,7 +172,7 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -191,6 +201,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) + /* * Calculate chunk size depending on volume type (regular or zoned). */ @@ -233,6 +245,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; + space_info->fs_info = info; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); @@ -312,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; - found->bytes_zone_unusable += block_group->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable); if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -341,11 +354,32 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, return NULL; } +static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *data_sinfo; + u64 data_chunk_size; + + /* + * Calculate the data_chunk_size, space_info->chunk_size is the + * "optimal" chunk size based on the fs size. However when we actually + * allocate the chunk we will strip this down further, making it no + * more than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size) + * as it is. + */ + data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + if (btrfs_is_zoned(fs_info)) + return data_sinfo->chunk_size; + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + return min_t(u64, data_chunk_size, SZ_1G); +} + static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *data_sinfo; u64 profile; u64 avail; u64 data_chunk_size; @@ -369,16 +403,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, if (avail == 0) return 0; - /* - * Calculate the data_chunk_size, space_info->chunk_size is the - * "optimal" chunk size based on the fs size. However when we actually - * allocate the chunk we will strip this down further, making it no more - * than 10% of the disk or 1G, whichever is smaller. - */ - data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - data_chunk_size = min(data_sinfo->chunk_size, - mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); - data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + data_chunk_size = calc_effective_data_chunk_size(fs_info); /* * Since data allocations immediately use block groups as part of the @@ -406,11 +431,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size alingned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that. + */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -462,9 +498,7 @@ again: if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; @@ -515,8 +549,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -556,8 +590,7 @@ again: spin_lock(&cache->lock); avail = cache->length - cache->used - cache->pinned - - cache->reserved - cache->delalloc_bytes - - cache->bytes_super - cache->zone_unusable; + cache->reserved - cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, @@ -588,8 +621,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, return nr; } -#define EXTENT_SIZE_PER_ITEM SZ_256K - /* * shrink metadata reservation for delalloc */ @@ -689,7 +720,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, items, NULL); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -808,14 +839,10 @@ static void flush_space(struct btrfs_fs_info *fs_info, * because that does not wait for a transaction to fully commit * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). */ - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - if (ret == -ENOENT) - ret = 0; - break; - } - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(root); + break; + case RESET_ZONES: + ret = btrfs_reset_unused_block_groups(space_info, num_bytes); break; default: ret = -ENOSPC; @@ -827,9 +854,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -854,7 +880,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -1070,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; + enum btrfs_flush_state final_state; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + if (btrfs_is_zoned(fs_info)) + final_state = RESET_ZONES; + else + final_state = COMMIT_TRANS; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1125,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; - if (flush_state > COMMIT_TRANS) { + if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1139,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } } spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } while (flush_state <= final_state); } /* @@ -1263,13 +1294,17 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS * This is where we reclaim all of the pinned space generated by running the * iputs * + * RESET_ZONES + * This state works only for the zoned mode. We scan the unused block group + * list and reset the zones and reuse the block group. + * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, @@ -1279,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RESET_ZONES, ALLOC_CHUNK_FORCE, }; @@ -1370,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, + RESET_ZONES, ALLOC_CHUNK, }; @@ -1383,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, + RESET_ZONES, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, @@ -1472,8 +1510,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void wait_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { @@ -1531,7 +1568,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: - wait_reserve_ticket(fs_info, space_info, ticket); + wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, @@ -1675,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1688,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } } @@ -1869,3 +1904,230 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } + +static u64 calc_pct_ratio(u64 x, u64 y) +{ + int err; + + if (!y) + return 0; +again: + err = check_mul_overflow(100, x, &x); + if (err) + goto lose_precision; + return div64_u64(x, y); +lose_precision: + x >>= 10; + y >>= 10; + if (!y) + y = 1; + goto again; +} + +/* + * A reasonable buffer for unallocated space is 10 data block_groups. + * If we claw this back repeatedly, we can still achieve efficient + * utilization when near full, and not do too much reclaim while + * always maintaining a solid buffer for workloads that quickly + * allocate and pressure the unallocated space. + */ +static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) +{ + u64 chunk_sz = calc_effective_data_chunk_size(fs_info); + + return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz; +} + +/* + * The fundamental goal of automatic reclaim is to protect the filesystem's + * unallocated space and thus minimize the probability of the filesystem going + * read only when a metadata allocation failure causes a transaction abort. + * + * However, relocations happen into the space_info's unused space, therefore + * automatic reclaim must also back off as that space runs low. There is no + * value in doing trivial "relocations" of re-writing the same block group + * into a fresh one. + * + * Furthermore, we want to avoid doing too much reclaim even if there are good + * candidates. This is because the allocator is pretty good at filling up the + * holes with writes. So we want to do just enough reclaim to try and stay + * safe from running out of unallocated space but not be wasteful about it. + * + * Therefore, the dynamic reclaim threshold is calculated as follows: + * - calculate a target unallocated amount of 5 block group sized chunks + * - ratchet up the intensity of reclaim depending on how far we are from + * that target by using a formula of unalloc / target to set the threshold. + * + * Typically with 10 block groups as the target, the discrete values this comes + * out to are 0, 10, 20, ... , 80, 90, and 99. + */ +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 target = calc_unalloc_target(fs_info); + u64 alloc = space_info->total_bytes; + u64 used = btrfs_space_info_used(space_info, false); + u64 unused = alloc - used; + u64 want = target > unalloc ? target - unalloc : 0; + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + /* If we have no unused space, don't bother, it won't work anyway. */ + if (unused < data_chunk_size) + return 0; + + /* Cast to int is OK because want <= target. */ + return calc_pct_ratio(want, target); +} + +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + lockdep_assert_held(&space_info->lock); + + if (READ_ONCE(space_info->dynamic_reclaim)) + return calc_dynamic_reclaim_threshold(space_info); + return READ_ONCE(space_info->bg_reclaim_threshold); +} + +/* + * Under "urgent" reclaim, we will reclaim even fresh block groups that have + * recently seen successful allocations, as we are desperate to reclaim + * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. + */ +static bool is_reclaim_urgent(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + return unalloc < data_chunk_size; +} + +static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) +{ + struct btrfs_block_group *bg; + int thresh_pct; + bool try_again = true; + bool urgent; + + spin_lock(&space_info->lock); + urgent = is_reclaim_urgent(space_info); + thresh_pct = btrfs_calc_reclaim_threshold(space_info); + spin_unlock(&space_info->lock); + + down_read(&space_info->groups_sem); +again: + list_for_each_entry(bg, &space_info->block_groups[raid], list) { + u64 thresh; + bool reclaim = false; + + btrfs_get_block_group(bg); + spin_lock(&bg->lock); + thresh = mult_perc(bg->length, thresh_pct); + if (bg->used < thresh && bg->reclaim_mark) { + try_again = false; + reclaim = true; + } + bg->reclaim_mark++; + spin_unlock(&bg->lock); + if (reclaim) + btrfs_mark_bg_to_reclaim(bg); + btrfs_put_block_group(bg); + } + + /* + * In situations where we are very motivated to reclaim (low unalloc) + * use two passes to make the reclaim mark check best effort. + * + * If we have any staler groups, we don't touch the fresher ones, but if we + * really need a block group, do take a fresh one. + */ + if (try_again && urgent) { + try_again = false; + goto again; + } + + up_read(&space_info->groups_sem); +} + +void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) +{ + u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info); + + lockdep_assert_held(&space_info->lock); + space_info->reclaimable_bytes += bytes; + + if (space_info->reclaimable_bytes >= chunk_sz) + btrfs_set_periodic_reclaim_ready(space_info, true); +} + +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready) +{ + lockdep_assert_held(&space_info->lock); + if (!READ_ONCE(space_info->periodic_reclaim)) + return; + if (ready != space_info->periodic_reclaim_ready) { + space_info->periodic_reclaim_ready = ready; + if (!ready) + space_info->reclaimable_bytes = 0; + } +} + +bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) +{ + bool ret; + + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return false; + if (!READ_ONCE(space_info->periodic_reclaim)) + return false; + + spin_lock(&space_info->lock); + ret = space_info->periodic_reclaim_ready; + btrfs_set_periodic_reclaim_ready(space_info, false); + spin_unlock(&space_info->lock); + + return ret; +} + +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) +{ + int raid; + struct btrfs_space_info *space_info; + + list_for_each_entry(space_info, &fs_info->space_info, list) { + if (!btrfs_should_periodic_reclaim(space_info)) + continue; + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(space_info, raid); + } +} + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + lockdep_assert_held(&space_info->lock); + + /* Prioritize the global reservation to receive the freed space. */ + if (global_rsv->space_info != space_info) + goto grant; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + +grant: + /* Add to any tickets we may have. */ + if (len) + btrfs_try_granting_tickets(fs_info, space_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 92c595fed1b0..a96efdb5e681 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -4,8 +4,17 @@ #define BTRFS_SPACE_INFO_H #include <trace/events/btrfs.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/kobject.h> +#include <linux/lockdep.h> +#include <linux/wait.h> +#include <linux/rwsem.h> #include "volumes.h" +struct btrfs_fs_info; +struct btrfs_block_group; + /* * Different levels for to flush space when doing space reservations. * @@ -70,6 +79,10 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_EMERGENCY, }; +/* + * Please be aware that the order of enum values will be the order of the reclaim + * process in btrfs_async_reclaim_metadata_space(). + */ enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, @@ -82,9 +95,11 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 9, RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, + RESET_ZONES = 12, }; struct btrfs_space_info { + struct btrfs_fs_info *fs_info; spinlock_t lock; u64 total_bytes; /* total bytes in the space, @@ -156,6 +171,47 @@ struct btrfs_space_info { struct kobject kobj; struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; + + /* + * Monotonically increasing counter of block group reclaim attempts + * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_count + */ + u64 reclaim_count; + + /* + * Monotonically increasing counter of reclaimed bytes + * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_bytes + */ + u64 reclaim_bytes; + + /* + * Monotonically increasing counter of reclaim errors + * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_errors + */ + u64 reclaim_errors; + + /* + * If true, use the dynamic relocation threshold, instead of the + * fixed bg_reclaim_threshold. + */ + bool dynamic_reclaim; + + /* + * Periodically check all block groups against the reclaim + * threshold in the cleaner thread. + */ + bool periodic_reclaim; + + /* + * Periodic reclaim should be a no-op if a space_info hasn't + * freed any space since the last time we tried. + */ + bool periodic_reclaim_ready; + + /* + * Net bytes freed or allocated since the last reclaim pass. + */ + s64 reclaimable_bytes; }; struct reserve_ticket { @@ -166,7 +222,7 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); @@ -178,10 +234,10 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) */ #define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ -btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ - struct btrfs_space_info *sinfo, \ +btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + struct btrfs_fs_info *fs_info = sinfo->fs_info; \ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ @@ -198,6 +254,7 @@ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); +DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, @@ -206,7 +263,7 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -219,17 +276,16 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( - struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes) { spin_lock(&space_info->lock); - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); + btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, @@ -238,4 +294,11 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); +bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 0e49dab8dad2..722acf768396 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,6 +64,7 @@ * This means a slightly higher tree locking latency. */ +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) @@ -74,7 +75,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space * mapping. And if page->mapping->host is data inode, it's subpage. * As we have ruled our sectorsize >= PAGE_SIZE case already. */ - if (!mapping || !mapping->host || is_data_inode(mapping->host)) + if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) return true; /* @@ -85,34 +86,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space return true; return false; } - -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) -{ - unsigned int cur = 0; - unsigned int nr_bits; - - ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); - - nr_bits = PAGE_SIZE / sectorsize; - subpage_info->bitmap_nr_bits = nr_bits; - - subpage_info->uptodate_offset = cur; - cur += nr_bits; - - subpage_info->dirty_offset = cur; - cur += nr_bits; - - subpage_info->writeback_offset = cur; - cur += nr_bits; - - subpage_info->ordered_offset = cur; - cur += nr_bits; - - subpage_info->checked_offset = cur; - cur += nr_bits; - - subpage_info->total_nr_bits = cur; -} +#endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) @@ -160,18 +134,16 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, ASSERT(fs_info->sectorsize < PAGE_SIZE); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); - atomic_set(&ret->writers, 0); - } + else + atomic_set(&ret->nr_locked, 0); return ret; } @@ -237,40 +209,15 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, start + len <= folio_pos(folio) + PAGE_SIZE); } -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int nbits = len >> fs_info->sectorsize_bits; - - btrfs_subpage_assert(fs_info, folio, start, len); - - atomic_add(nbits, &subpage->readers); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int nbits = len >> fs_info->sectorsize_bits; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, folio, start, len); - is_data = is_data_inode(folio->mapping->host); - ASSERT(atomic_read(&subpage->readers) >= nbits); - last = atomic_sub_and_test(nbits, &subpage->readers); - - /* - * For data we need to unlock the page if the last read has finished. - * - * And please don't replace @last with atomic_sub_and_test() call - * inside if () condition. - * As we want the atomic_sub_and_test() to be always executed. - */ - if (is_data && last) - folio_unlock(folio); -} +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ +({ \ + unsigned int __start_bit; \ + \ + btrfs_subpage_assert(fs_info, folio, start, len); \ + __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit; \ +}) static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { @@ -290,100 +237,130 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int nbits = (len >> fs_info->sectorsize_bits); - int ret; - - btrfs_subpage_assert(fs_info, folio, start, len); - - ASSERT(atomic_read(&subpage->readers) == 0); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); -} - -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; + unsigned int cleared = 0; + int bit = start_bit; + bool last; btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->nr_locked) == 0) { + spin_unlock_irqrestore(&subpage->lock, flags); return true; + } - ASSERT(atomic_read(&subpage->writers) >= nbits); - return atomic_sub_and_test(nbits, &subpage->writers); + for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { + clear_bit(bit, subpage->bitmaps); + cleared++; + } + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); + spin_unlock_irqrestore(&subpage->lock, flags); + return last; } /* - * Lock a folio for delalloc page writeback. + * Handle different locked folios: * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. + * - Non-subpage folio + * Just unlock it. * - * Even with 0 returned, the page still need extra check to make sure - * it's really the correct page, as the caller is using - * filemap_get_folios_contig(), which can race with page invalidating. + * - folio locked but without any subpage locked + * This happens either before writepage_delalloc() or the delalloc range is + * already handled by previous folio. + * We can simple unlock it. + * + * - folio locked with subpage range locked. + * We go through the locked sectors inside the range and clear their locked + * bitmap, reduce the writer lock number, and unlock the page if that's + * the last locked range. */ -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { + struct btrfs_subpage *subpage = folio_get_private(folio); + + ASSERT(folio_test_locked(folio)); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { - folio_lock(folio); - return 0; + folio_unlock(folio); + return; } - folio_lock(folio); - if (!folio_test_private(folio) || !folio_get_private(folio)) { + + /* + * For subpage case, there are two types of locked page. With or + * without locked number. + * + * Since we own the page lock, no one else could touch subpage::locked + * and we are safe to do several atomic operations without spinlock. + */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); - return -EAGAIN; + return; } + btrfs_subpage_clamp_range(folio, &start, &len); - btrfs_subpage_start_writer(fs_info, folio, start, len); - return 0; + if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) + folio_unlock(folio); } -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) { - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + unsigned long flags; + bool last = false; + int cleared = 0; + int bit; + + if (!btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } - btrfs_subpage_clamp_range(folio, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); -} + return; + } -#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ -({ \ - unsigned int start_bit; \ - \ - btrfs_subpage_assert(fs_info, folio, start, len); \ - start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - start_bit += fs_info->subpage_info->name##_offset; \ - start_bit; \ -}) + spin_lock_irqsave(&subpage->lock, flags); + for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + cleared++; + } + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); + spin_unlock_irqrestore(&subpage->lock, flags); + if (last) + folio_unlock(folio); +} #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) #define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -658,109 +635,146 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +{ \ + const int sectors_per_page = fs_info->sectors_per_page; \ + \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ +} + +#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ +{ \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned long bitmap; \ + \ + GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + btrfs_warn(fs_info, \ + "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + start, len, folio_pos(folio), \ + fs_info->sectors_per_page, &bitmap); \ +} + /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. */ -void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio) +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_subpage *subpage; + unsigned int start_bit; + unsigned int nbits; + unsigned long flags; if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - ASSERT(!folio_test_dirty(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) { + ASSERT(!folio_test_dirty(folio)); return; + } - ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); + start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); + nbits = len >> fs_info->sectorsize_bits; + subpage = folio_get_private(folio); + ASSERT(subpage); + spin_lock_irqsave(&subpage->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + spin_unlock_irqrestore(&subpage->lock, flags); } /* - * Handle different locked pages with different page sizes: - * - * - Page locked by plain lock_page() - * It should not have any subpage::writers count. - * Can be unlocked by unlock_page(). - * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages(). - * Rarer cases include the @locked_page from extent_write_locked_range(). + * This is for folio already locked by plain lock_page()/folio_lock(), which + * doesn't have any subpage awareness. * - * - Page locked by lock_delalloc_pages() - * There is only one caller, all pages except @locked_page for - * extent_write_locked_range(). - * In this case, we have to call subpage helper to handle the case. + * This populates the involved subpage ranges so that subpage helpers can + * properly unlock them. */ -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; + unsigned long flags; + unsigned int start_bit; + unsigned int nbits; + int ret; ASSERT(folio_test_locked(folio)); - /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) return; - } - ASSERT(folio_test_private(folio) && folio_get_private(folio)); subpage = folio_get_private(folio); - - /* - * For subpage case, there are two types of locked page. With or - * without writers number. - * - * Since we own the page lock, no one else could touch subpage::writers - * and we are safe to do several atomic operations without spinlock. - */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page() */ - folio_unlock(folio); - return; + start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); + nbits = len >> fs_info->sectorsize_bits; + spin_lock_irqsave(&subpage->lock, flags); + /* Target range should not yet be locked. */ + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); } - - /* Have writers, use proper subpage helper to end it */ - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + bitmap_set(subpage->bitmaps, start_bit, nbits); + ret = atomic_add_return(nbits, &subpage->nr_locked); + ASSERT(ret <= fs_info->sectors_per_page); + spin_unlock_irqrestore(&subpage->lock, flags); } -#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ - bitmap_cut(dst, subpage->bitmaps, 0, \ - subpage_info->name##_offset, subpage_info->bitmap_nr_bits) - void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; + const u32 sectors_per_page = fs_info->sectors_per_page; unsigned long uptodate_bitmap; - unsigned long error_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; + unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_info); + ASSERT(sectors_per_page > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, -"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - subpage_info->bitmap_nr_bits, &uptodate_bitmap, - subpage_info->bitmap_nr_bits, &error_bitmap, - subpage_info->bitmap_nr_bits, &dirty_bitmap, - subpage_info->bitmap_nr_bits, &writeback_bitmap, - subpage_info->bitmap_nr_bits, &ordered_bitmap, - subpage_info->bitmap_nr_bits, &checked_bitmap); + sectors_per_page, &uptodate_bitmap, + sectors_per_page, &dirty_bitmap, + sectors_per_page, &locked_bitmap, + sectors_per_page, &writeback_bitmap, + sectors_per_page, &ordered_bitmap, + sectors_per_page, &checked_bitmap); +} + +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap) +{ + struct btrfs_subpage *subpage; + unsigned long flags; + + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + ASSERT(fs_info->sectors_per_page > 1); + subpage = folio_get_private(folio); + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 793c2b314a58..44fff1f4eac4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -4,6 +4,12 @@ #define BTRFS_SUBPAGE_H #include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/sizes.h> + +struct address_space; +struct folio; +struct btrfs_fs_info; /* * Extra info for subpapge bitmap. @@ -13,29 +19,23 @@ * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- dirty_offset /- ordered_offset + * /- uptodate /- dirty /- ordered * | | | * v v v * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| - * |<- bitmap_nr_bits ->| - * |<----------------- total_nr_bits ------------------>| + * |< sectors_per_page >| + * + * Unlike regular macro-like enums, here we do not go upper-case names, as + * these names will be utilized in various macros to define function names. */ -struct btrfs_subpage_info { - /* Number of bits for each bitmap */ - unsigned int bitmap_nr_bits; - - /* Total number of bits for the whole bitmap */ - unsigned int total_nr_bits; - - /* - * *_start indicates where the bitmap starts, the length is always - * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. - */ - unsigned int uptodate_offset; - unsigned int dirty_offset; - unsigned int writeback_offset; - unsigned int ordered_offset; - unsigned int checked_offset; +enum { + btrfs_bitmap_nr_uptodate = 0, + btrfs_bitmap_nr_dirty, + btrfs_bitmap_nr_writeback, + btrfs_bitmap_nr_ordered, + btrfs_bitmap_nr_checked, + btrfs_bitmap_nr_locked, + btrfs_bitmap_nr_max }; /* @@ -45,14 +45,6 @@ struct btrfs_subpage_info { struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - /* - * Both data and metadata needs to track how many readers are for the - * page. - * Data relies on @readers to unlock the page when last reader finished. - * While metadata doesn't need page unlock, it needs to prevent - * page::private get cleared before the last end_page_read(). - */ - atomic_t readers; union { /* * Structures only used by metadata @@ -62,8 +54,12 @@ struct btrfs_subpage { */ atomic_t eb_refs; - /* Structures only used by data */ - atomic_t writers; + /* + * Structures only used by data, + * + * How many sectors inside the page is locked. + */ + atomic_t nr_locked; }; unsigned long bitmaps[]; }; @@ -73,9 +69,16 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#else +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct address_space *mapping) +{ + return false; +} +#endif -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -88,20 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); - -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); - +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); /* * Template for subpage related operations. * @@ -142,12 +137,27 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); +/* + * Helper for error cleanup, where a folio will have its dirty flag cleared, + * with writeback started and finished. + */ +static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info, + struct folio *locked_folio, + u64 start, u32 len) +{ + btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); +} + bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio); -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 101f786963d4..dc4fee519ca6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -34,13 +34,12 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" +#include "direct-io.h" #include "props.h" #include "xattr.h" #include "bio.h" #include "export.h" #include "compression.h" -#include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" @@ -83,7 +82,7 @@ struct btrfs_fs_context { u32 commit_interval; u32 metadata_ratio; u32 thread_pool_size; - unsigned long mount_opt; + unsigned long long mount_opt; unsigned long compress_type:4; unsigned int compress_level; refcount_t refs; @@ -121,14 +120,12 @@ enum { Opt_thread_pool, Opt_treelog, Opt_user_subvol_rm_allowed, + Opt_norecovery, /* Rescue options */ Opt_rescue, Opt_usebackuproot, Opt_nologreplay, - Opt_ignorebadroots, - Opt_ignoredatacsums, - Opt_rescue_all, /* Debugging options */ Opt_enospc_debug, @@ -179,6 +176,8 @@ enum { Opt_rescue_nologreplay, Opt_rescue_ignorebadroots, Opt_rescue_ignoredatacsums, + Opt_rescue_ignoremetacsums, + Opt_rescue_ignoresuperflags, Opt_rescue_parameter_all, }; @@ -188,7 +187,11 @@ static const struct constant_table btrfs_parameter_rescue[] = { { "ignorebadroots", Opt_rescue_ignorebadroots }, { "ibadroots", Opt_rescue_ignorebadroots }, { "ignoredatacsums", Opt_rescue_ignoredatacsums }, + { "ignoremetacsums", Opt_rescue_ignoremetacsums}, + { "ignoresuperflags", Opt_rescue_ignoresuperflags}, { "idatacsums", Opt_rescue_ignoredatacsums }, + { "imetacsums", Opt_rescue_ignoremetacsums}, + { "isuperflags", Opt_rescue_ignoresuperflags}, { "all", Opt_rescue_parameter_all }, {} }; @@ -247,6 +250,8 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), + /* For compatibility only, alias for "rescue=nologreplay". */ + fsparam_flag("norecovery", Opt_norecovery), /* Debugging options. */ fsparam_flag_no("enospc_debug", Opt_enospc_debug), @@ -334,6 +339,15 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears + * "force-compress" without the need to pass + * "compress-force=[no|none]" before specifying "compress". + */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + if (opt == Opt_compress || opt == Opt_compress_force) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; @@ -440,6 +454,11 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; + case Opt_norecovery: + btrfs_info(NULL, +"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; case Opt_flushoncommit: if (result.negated) btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); @@ -567,8 +586,16 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_rescue_ignoredatacsums: btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); break; + case Opt_rescue_ignoremetacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); + break; + case Opt_rescue_ignoresuperflags: + btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); + break; case Opt_rescue_parameter_all: btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; @@ -623,8 +650,8 @@ static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); } -static bool check_ro_option(struct btrfs_fs_info *fs_info, - unsigned long mount_opt, unsigned long opt, +static bool check_ro_option(const struct btrfs_fs_info *fs_info, + unsigned long long mount_opt, unsigned long long opt, const char *opt_name) { if (mount_opt & opt) { @@ -635,7 +662,8 @@ static bool check_ro_option(struct btrfs_fs_info *fs_info, return false; } -bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, +bool btrfs_check_options(const struct btrfs_fs_info *info, + unsigned long long *mount_opt, unsigned long flags) { bool ret = true; @@ -643,7 +671,9 @@ bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, if (!(flags & SB_RDONLY) && (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || - check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))) + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREMETACSUMS, "ignoremetacsums") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNORESUPERFLAGS, "ignoresuperflags"))) ret = false; if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && @@ -662,8 +692,11 @@ bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, ret = false; if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { - if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_info(info, "disk space caching is enabled"); + btrfs_warn(info, +"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); + } if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) btrfs_info(info, "using free-space-tree"); } @@ -912,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data) + struct btrfs_fs_devices *fs_devices) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -929,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; err = super_setup_bdi(sb); if (err) { @@ -937,13 +969,13 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - err = open_ctree(sb, fs_devices, (char *)data); + err = open_ctree(sb, fs_devices); if (err) { - btrfs_err(fs_info, "open_ctree failed"); + btrfs_err(fs_info, "open_ctree failed: %d", err); return err; } - inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); + inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { err = PTR_ERR(inode); btrfs_handle_fs_error(fs_info, err, NULL); @@ -977,7 +1009,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1059,6 +1091,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) print_rescue_option(seq, "ignorebadroots", &printed); if (btrfs_test_opt(info, IGNOREDATACSUMS)) print_rescue_option(seq, "ignoredatacsums", &printed); + if (btrfs_test_opt(info, IGNOREMETACSUMS)) + print_rescue_option(seq, "ignoremetacsums", &printed); + if (btrfs_test_opt(info, IGNORESUPERFLAGS)) + print_rescue_option(seq, "ignoresuperflags", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) @@ -1099,10 +1135,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); - seq_printf(seq, ",subvolid=%llu", - BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = btrfs_get_subvol_name_from_objectid(info, - BTRFS_I(d_inode(dentry))->root->root_key.objectid); + btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { seq_puts(seq, ",subvol="); seq_escape(seq, subvol_name, " \t\n\\"); @@ -1154,7 +1189,7 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, struct super_block *s = root->d_sb; struct btrfs_fs_info *fs_info = btrfs_sb(s); struct inode *root_inode = d_inode(root); - u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root); ret = 0; if (!is_subvolume_inode(root_inode)) { @@ -1208,7 +1243,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, } static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, - unsigned long old_opts, int flags) + unsigned long long old_opts, int flags) { if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || @@ -1222,7 +1257,7 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, } static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, - unsigned long old_opts) + unsigned long long old_opts) { const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); @@ -1417,6 +1452,8 @@ static void btrfs_emit_options(struct btrfs_fs_info *info, btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time"); btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots"); btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums"); + btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums"); + btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags"); btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); @@ -1468,8 +1505,7 @@ static int btrfs_reconfigure(struct fs_context *fc) sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - if (!mount_reconfigure && - !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) return -EINVAL; ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); @@ -1767,7 +1803,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; - buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_bsize = fs_info->sectorsize; buf->f_namelen = BTRFS_NAME_LEN; /* We treat it as constant endianness (it doesn't matter _which_) @@ -1776,10 +1812,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= - BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32; - buf->f_fsid.val[1] ^= - BTRFS_I(d_inode(dentry))->root->root_key.objectid; + buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32; + buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root); return 0; } @@ -1851,18 +1885,21 @@ static int btrfs_get_tree_super(struct fs_context *fc) if (sb->s_root) { btrfs_close_devices(fs_devices); - if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) - ret = -EBUSY; + /* + * At this stage we may have RO flag mismatch between + * fc->sb_flags and sb->s_flags. Caller should detect such + * mismatch and reconfigure with sb->s_umount rwsem held if + * needed. + */ } else { snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; - ret = btrfs_fill_super(sb, fs_devices, NULL); - } - - if (ret) { - deactivate_locked_super(sb); - return ret; + ret = btrfs_fill_super(sb, fs_devices); + if (ret) { + deactivate_locked_super(sb); + return ret; + } } btrfs_clear_oneshot_options(fs_info); @@ -1943,59 +1980,23 @@ error: * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem * in fc->sb_flags. * - * This disambiguation has rather positive consequences. Mounting a subvolume - * ro will not also turn the superblock ro. Only the mount for the subvolume - * will become ro. - * - * So, if the superblock creation request comes from the new mount API the - * caller must have explicitly done: - * - * fsconfig(FSCONFIG_SET_FLAG, "ro") - * fsmount/mount_setattr(MOUNT_ATTR_RDONLY) - * - * IOW, at some point the caller must have explicitly turned the whole - * superblock ro and we shouldn't just undo it like we did for the old mount - * API. In any case, it lets us avoid the hack in the new mount API. - * - * Consequently, the remounting hack must only be used for requests originating - * from the old mount API and should be marked for full deprecation so it can be - * turned off in a couple of years. - * - * The new mount API has no reason to support this hack. + * But, currently the util-linux mount command already utilizes the new mount + * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's + * btrfs or not, setting the whole super block RO. To make per-subvolume mounting + * work with different options work we need to keep backward compatibility. */ -static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt) { - struct vfsmount *mnt; - int ret; - const bool ro2rw = !(fc->sb_flags & SB_RDONLY); - - /* - * We got an EBUSY because our SB_RDONLY flag didn't match the existing - * super block, so invert our setting here and retry the mount so we - * can get our vfsmount. - */ - if (ro2rw) - fc->sb_flags |= SB_RDONLY; - else - fc->sb_flags &= ~SB_RDONLY; - - mnt = fc_mount(fc); - if (IS_ERR(mnt)) - return mnt; + int ret = 0; - if (!fc->oldapi || !ro2rw) - return mnt; + if (fc->sb_flags & SB_RDONLY) + return ret; - /* We need to convert to rw, call reconfigure. */ - fc->sb_flags &= ~SB_RDONLY; down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_reconfigure(fc); + if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY)) + ret = btrfs_reconfigure(fc); up_write(&mnt->mnt_sb->s_umount); - if (ret) { - mntput(mnt); - return ERR_PTR(ret); - } - return mnt; + return ret; } static int btrfs_get_tree_subvol(struct fs_context *fc) @@ -2005,6 +2006,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) struct fs_context *dup_fc; struct dentry *dentry; struct vfsmount *mnt; + int ret = 0; /* * Setup a dummy root and fs_info for test/set super. This is because @@ -2047,11 +2049,16 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->security = NULL; mnt = fc_mount(dup_fc); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) - mnt = btrfs_reconfigure_for_mount(dup_fc); - put_fs_context(dup_fc); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + put_fs_context(dup_fc); return PTR_ERR(mnt); + } + ret = btrfs_reconfigure_for_mount(dup_fc, mnt); + put_fs_context(dup_fc); + if (ret) { + mntput(mnt); + return ret; + } /* * This free's ->subvol_name, because if it isn't set we have to @@ -2170,7 +2177,8 @@ static struct file_system_type btrfs_fs_type = { .init_fs_context = btrfs_init_fs_context, .parameters = btrfs_fs_parameters, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); @@ -2203,7 +2211,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); - vol->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol); + if (ret < 0) + goto out; switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -2233,7 +2243,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - ret = PTR_ERR(device); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + ret = 0; break; } ret = !(device->fs_devices->num_devices == @@ -2245,15 +2258,14 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; } +out: kfree(vol); return ret; } static int btrfs_freeze(struct super_block *sb) { - struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; set_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* @@ -2262,14 +2274,7 @@ static int btrfs_freeze(struct super_block *sb) * we want to avoid on a frozen filesystem), or do the commit * ourselves. */ - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - /* no transaction, don't bother */ - if (PTR_ERR(trans) == -ENOENT) - return 0; - return PTR_ERR(trans); - } - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(fs_info->tree_root); } static int check_dev_super(struct btrfs_device *dev) @@ -2373,6 +2378,27 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) return 0; } +static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_count(fs_info, nr); + + return nr; +} + +static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_free_extent_maps(fs_info, nr_to_scan); + + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2386,6 +2412,8 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, + .nr_cached_objects = btrfs_nr_cached_objects, + .free_cached_objects = btrfs_free_cached_objects, }; static const struct file_operations btrfs_ctl_fops = { @@ -2418,6 +2446,9 @@ static __cold void btrfs_interface_exit(void) static int __init btrfs_print_mod_info(void) { static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2438,7 +2469,17 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else pr_info("Btrfs loaded%s\n", options); +#endif + return 0; } @@ -2473,6 +2514,9 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_init_cachep, .exit_func = btrfs_destroy_cachep, }, { + .init_func = btrfs_init_dio, + .exit_func = btrfs_destroy_dio, + }, { .init_func = btrfs_transaction_init, .exit_func = btrfs_transaction_exit, }, { @@ -2493,6 +2537,11 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = extent_map_init, .exit_func = extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, @@ -2564,6 +2613,7 @@ static int __init init_btrfs_fs(void) late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) +MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); MODULE_SOFTDEP("pre: xxhash64"); diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index f18253ca280d..d80a86acfbbe 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,7 +3,15 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H -bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, +#include <linux/types.h> +#include <linux/fs.h> +#include "fs.h" + +struct super_block; +struct btrfs_fs_info; + +bool btrfs_check_options(const struct btrfs_fs_info *info, + unsigned long long *mount_opt, unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 84c05246ffd8..14f53f757555 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -295,7 +295,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. */ @@ -329,7 +329,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif @@ -385,6 +385,8 @@ static const char *rescue_opts[] = { "nologreplay", "ignorebadroots", "ignoredatacsums", + "ignoremetacsums", + "ignoresuperflags", "all", }; @@ -421,7 +423,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { - return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); + return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); @@ -894,6 +896,9 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +SPACE_INFO_ATTR(reclaim_count); +SPACE_INFO_ATTR(reclaim_bytes); +SPACE_INFO_ATTR(reclaim_errors); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); @@ -902,8 +907,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); + ssize_t ret; - return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); + spin_lock(&space_info->lock); + ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info)); + spin_unlock(&space_info->lock); + return ret; } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, @@ -914,6 +923,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, int thresh; int ret; + if (READ_ONCE(space_info->dynamic_reclaim)) + return -EINVAL; + ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; @@ -930,6 +942,72 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, btrfs_sinfo_bg_reclaim_threshold_show, btrfs_sinfo_bg_reclaim_threshold_store); +static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim)); +} + +static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int dynamic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &dynamic_reclaim); + if (ret) + return ret; + + if (dynamic_reclaim < 0) + return -EINVAL; + + WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, dynamic_reclaim, + btrfs_sinfo_dynamic_reclaim_show, + btrfs_sinfo_dynamic_reclaim_store); + +static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim)); +} + +static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int periodic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &periodic_reclaim); + if (ret) + return ret; + + if (periodic_reclaim < 0) + return -EINVAL; + + WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, periodic_reclaim, + btrfs_sinfo_periodic_reclaim_show, + btrfs_sinfo_periodic_reclaim_store); + /* * Allocation information about block group types. * @@ -947,8 +1025,13 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, dynamic_reclaim), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), + BTRFS_ATTR_PTR(space_info, reclaim_count), + BTRFS_ATTR_PTR(space_info, reclaim_bytes), + BTRFS_ATTR_PTR(space_info, reclaim_errors), + BTRFS_ATTR_PTR(space_info, periodic_reclaim), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif @@ -1035,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1045,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1097,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -1222,24 +1305,102 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", + "devid", +#endif +}; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. */ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ + return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +{ + char param[32]; + char __maybe_unused *value_str; + + if (!str || strlen(str) == 0) + return 0; + + strscpy(param, str); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Separate value from input in policy:value format. */ + value_str = strchr(param, ':'); + if (value_str) { + int ret; + + *value_str = 0; + value_str++; + if (!value_ret) + return -EINVAL; + ret = kstrtos64(value_str, 10, value_ret); + if (ret) + return -EINVAL; + if (*value_ret < 0) + return -ERANGE; + } +#endif + + return sysfs_match_string(btrfs_read_policy_name, param); +} + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ + s64 value; + + if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { + btrfs_err(NULL, "invalid read policy or value %s", read_policy); + return -EINVAL; + } + + return 0; +} +#endif static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (fs_devices->read_policy == i) - ret += sysfs_emit_at(buf, ret, "%s[%s]", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); + + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); @@ -1252,21 +1413,80 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; + s64 value = -1; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != fs_devices->read_policy) { - fs_devices->read_policy = i; - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); + index = btrfs_read_policy_to_enum(buf, &value); + if (index < 0) + return -EINVAL; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. */ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; } - return len; + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); } + + fs_devices->collect_fs_stats = true; + + return len; } - return -EINVAL; + if (index == BTRFS_READ_POLICY_DEVID) { + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid. */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device. */ + value = fs_devices->latest_dev->devid; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->read_devid)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + } + + return len; + } +#endif + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); + } + + return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); @@ -1306,6 +1526,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static ssize_t btrfs_offload_csum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + + switch (READ_ONCE(fs_devices->offload_csum_mode)) { + case BTRFS_OFFLOAD_CSUM_AUTO: + return sysfs_emit(buf, "auto\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_ON: + return sysfs_emit(buf, "1\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_OFF: + return sysfs_emit(buf, "0\n"); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static ssize_t btrfs_offload_csum_store(struct kobject *kobj, + struct kobj_attribute *a, const char *buf, + size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret == 0) + WRITE_ONCE(fs_devices->offload_csum_mode, + val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); + else if (ret == -EINVAL && sysfs_streq(buf, "auto")) + WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); + else + return -EINVAL; + + return len; +} +BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); +#endif + /* * Per-filesystem information and stats. * @@ -1325,6 +1586,9 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), +#ifdef CONFIG_BTRFS_EXPERIMENTAL + BTRFS_ATTR_PTR(, offload_csum), +#endif NULL, }; @@ -2294,7 +2558,7 @@ int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, struct kobject *qgroups_kobj = fs_info->qgroups_kobj; int ret; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return 0; if (qgroup->kobj.state_initialized) return 0; @@ -2315,7 +2579,7 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return; rbtree_postorder_for_each_entry_safe(qgroup, next, @@ -2336,7 +2600,7 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) struct btrfs_qgroup *next; int ret = 0; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return 0; ASSERT(fsid_kobj); @@ -2368,7 +2632,7 @@ out: void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return; if (qgroup->kobj.state_initialized) { diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 86c7eef12873..3fc5c6f90dc4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,8 +3,17 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H +#include <linux/types.h> +#include <linux/compiler_types.h> #include <linux/kobject.h> +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_block_group; +struct btrfs_space_info; +struct btrfs_qgroup; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, @@ -38,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 709c6cc9706a..5eff8d7d2360 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -29,6 +29,8 @@ const char *test_error[] = { [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", + [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", + [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction", }; static const struct super_operations btrfs_test_super_ops = { @@ -61,10 +63,7 @@ struct inode *btrfs_new_test_inode(void) return NULL; inode->i_mode = S_IFREG; - inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; - BTRFS_I(inode)->location.offset = 0; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_FIRST_FREE_OBJECTID); inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG); return inode; @@ -144,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + + /* CRC32C csum size. */ + fs_info->csum_size = 4; + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / + fs_info->csum_size; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -160,8 +164,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) if (!fs_info) return; - if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, - &fs_info->fs_state))) + if (WARN_ON(!btrfs_is_testing(fs_info))) return; test_mnt->mnt_sb->s_fs_info = NULL; @@ -250,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) kfree(cache); } +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) +{ + memset(trans, 0, sizeof(*trans)); + trans->fs_info = fs_info; + xa_init(&trans->delayed_refs.head_refs); + xa_init(&trans->delayed_refs.dirty_extents); + spin_lock_init(&trans->delayed_refs.lock); +} + void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -295,6 +307,12 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_free_space_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_delayed_refs(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index dc2f2ab15fa5..4307bdaa6749 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -6,6 +6,8 @@ #ifndef BTRFS_TESTS_H #define BTRFS_TESTS_H +#include <linux/types.h> + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); @@ -24,12 +26,15 @@ enum { TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, + TEST_ALLOC_IO_CONTEXT, + TEST_ALLOC_TRANSACTION, }; extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_transaction; int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); @@ -37,7 +42,9 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); @@ -47,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c new file mode 100644 index 000000000000..265370e79a54 --- /dev/null +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -0,0 +1,1016 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> +#include "btrfs-tests.h" +#include "../transaction.h" +#include "../delayed-ref.h" +#include "../extent-tree.h" + +#define FAKE_ROOT_OBJECTID 256 +#define FAKE_BYTENR 0 +#define FAKE_LEVEL 1 +#define FAKE_INO 256 +#define FAKE_FILE_OFFSET 0 +#define FAKE_PARENT SZ_1M + +struct ref_head_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + int total_ref_mod; + int must_insert; +}; + +struct ref_node_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + enum btrfs_delayed_ref_action action; + u8 type; + u64 parent; + u64 root; + u64 owner; + u64 offset; +}; + +static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type) +{ + if ((type == BTRFS_TREE_BLOCK_REF_KEY) || + (type == BTRFS_SHARED_BLOCK_REF_KEY)) + return BTRFS_REF_METADATA; + return BTRFS_REF_DATA; +} + +static void delete_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); +} + +static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *node) +{ + rb_erase_cached(&node->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&node->ref_node); + if (!list_empty(&node->add_list)) + list_del_init(&node->add_list); + btrfs_put_delayed_ref(node); +} + +static int validate_ref_head(struct btrfs_delayed_ref_head *head, + struct ref_head_check *check) +{ + if (head->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", head->bytenr, + check->bytenr); + return -EINVAL; + } + + if (head->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + head->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (head->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", head->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (head->total_ref_mod != check->total_ref_mod) { + test_err("invalid total_ref_mod have: %d want: %d", + head->total_ref_mod, check->total_ref_mod); + return -EINVAL; + } + + if (head->must_insert_reserved != check->must_insert) { + test_err("invalid must_insert have: %d want: %d", + head->must_insert_reserved, check->must_insert); + return -EINVAL; + } + + return 0; +} + +static int validate_ref_node(struct btrfs_delayed_ref_node *node, + struct ref_node_check *check) +{ + if (node->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", node->bytenr, + check->bytenr); + return -EINVAL; + } + + if (node->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + node->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (node->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", node->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (node->action != check->action) { + test_err("invalid action have: %d want: %d", node->action, + check->action); + return -EINVAL; + } + + if (node->parent != check->parent) { + test_err("invalid parent have: %llu want: %llu", node->parent, + check->parent); + return -EINVAL; + } + + if (node->ref_root != check->root) { + test_err("invalid root have: %llu want: %llu", node->ref_root, + check->root); + return -EINVAL; + } + + if (node->type != check->type) { + test_err("invalid type have: %d want: %d", node->type, + check->type); + return -EINVAL; + } + + if (btrfs_delayed_ref_owner(node) != check->owner) { + test_err("invalid owner have: %llu want: %llu", + btrfs_delayed_ref_owner(node), check->owner); + return -EINVAL; + } + + if (btrfs_delayed_ref_offset(node) != check->offset) { + test_err("invalid offset have: %llu want: %llu", + btrfs_delayed_ref_offset(node), check->offset); + return -EINVAL; + } + + return 0; +} + +static int simple_test(struct btrfs_trans_handle *trans, + struct ref_head_check *head_check, + struct ref_node_check *node_check) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = ref_type_from_disk_ref_type(node_check->type), + .action = node_check->action, + .parent = node_check->parent, + .ref_root = node_check->root, + .bytenr = node_check->bytenr, + .num_bytes = fs_info->nodesize, + }; + int ret; + + if (ref.type == BTRFS_REF_METADATA) + btrfs_init_tree_ref(&ref, node_check->owner, node_check->root, + false); + else + btrfs_init_data_ref(&ref, node_check->owner, node_check->offset, + node_check->root, true); + + if (ref.type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + return -EINVAL; + } + + ret = -EINVAL; + if (validate_ref_head(head, head_check)) + goto out; + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, node_check)) + goto out; + ret = 0; +out: + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * These are simple tests, make sure that our btrfs_ref's get turned into the + * appropriate btrfs_delayed_ref_node based on their settings and action. + */ +static int simple_tests(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .total_ref_mod = 1, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + .owner = FAKE_LEVEL, + .offset = 0, + }; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared data failed"); + return -EINVAL; + } + + head_check.ref_mod = -1; + head_check.total_ref_mod = -1; + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + node_check.parent = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared data failed"); + return -EINVAL; + } + + return 0; +} + +/* + * Merge tests, validate that we do delayed ref merging properly, the ref counts + * all end up properly, and delayed refs are deleted once they're no longer + * needed. + */ +static int merge_tests(struct btrfs_trans_handle *trans, + enum btrfs_ref_type type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = type, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 2, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + }; + int ret; + + /* + * First add a ref and then drop it, make sure we get a head ref with a + * 0 total ref mod and no nodes. + */ + if (type == BTRFS_REF_METADATA) { + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + } else { + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET, + FAKE_ROOT_OBJECTID, true); + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("single add and drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a ref, then add another ref, make sure we get a head ref with a + * 2 total ref mod and 1 node. + */ + ref.action = BTRFS_ADD_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double add failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add two drop refs, make sure they are merged properly. */ + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add multiple refs, then drop until we go negative again. */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Drop multiple refs, then add until we go positive again. */ + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop to positive failed"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a bunch of refs with different roots and parents, then drop them + * all, make sure everything is properly merged. + */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 0; + head_check.total_ref_mod = 0; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop multiple failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + ret = 0; +out: + if (!IS_ERR_OR_NULL(head)) + btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * Basic test to validate we always get the add operations first followed by any + * delete operations. + */ +static int select_delayed_refs_test(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_DROP_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .owner = FAKE_LEVEL, + .offset = 0, + }; + int ret; + + /* Add the drop first. */ + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + /* + * Now add the add, and make it a different root so it's logically later + * in the rb tree. + */ + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.root = FAKE_ROOT_OBJECTID + 1; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Now we're going to do the same thing, but we're going to have an add + * that gets deleted because of a merge, and make sure we still have + * another add in place. + */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 2; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID + 2; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + ret = 0; +out: + if (head) + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) +{ + struct btrfs_transaction *transaction; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + int ret; + + test_msg("running delayed refs tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + transaction = kmalloc(sizeof(*transaction), GFP_KERNEL); + if (!transaction) { + test_std_err(TEST_ALLOC_TRANSACTION); + ret = -ENOMEM; + goto out_free_fs_info; + } + btrfs_init_dummy_trans(&trans, fs_info); + btrfs_init_dummy_transaction(transaction, fs_info); + trans.transaction = transaction; + + ret = simple_tests(&trans); + if (!ret) { + test_msg("running delayed refs merg tests on metadata refs"); + ret = merge_tests(&trans, BTRFS_REF_METADATA); + } + + if (!ret) { + test_msg("running delayed refs merg tests on data refs"); + ret = merge_tests(&trans, BTRFS_REF_DATA); + } + + if (!ret) + ret = select_delayed_refs_test(&trans); + + kfree(transaction); +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 25b3349595e0..0a2dbfaaf49e 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -11,6 +11,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../disk-io.h" #include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) @@ -105,9 +106,11 @@ static void dump_extent_io_tree(const struct extent_io_tree *tree) } } -static int test_find_delalloc(u32 sectorsize) +static int test_find_delalloc(u32 sectorsize, u32 nodesize) { - struct inode *inode; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + struct inode *inode = NULL; struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; @@ -121,12 +124,27 @@ static int test_find_delalloc(u32 sectorsize) test_msg("running find delalloc tests"); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + inode = btrfs_new_test_inode(); if (!inode) { test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; + ret = -ENOMEM; + goto out; } tmp = &BTRFS_I(inode)->io_tree; + BTRFS_I(inode)->root = root; /* * Passing NULL as we don't have fs_info but tracepoints are not used @@ -162,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -193,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -227,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -248,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -289,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -316,6 +334,8 @@ out: process_page_range(inode, 0, total_dirty - 1, PROCESS_UNLOCK | PROCESS_RELEASE); iput(inode); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } @@ -794,7 +814,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) test_msg("running extent I/O tests"); - ret = test_find_delalloc(sectorsize); + ret = test_find_delalloc(sectorsize, nodesize); if (ret) goto out; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 253cce7ffecf..56e61ac1cc64 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -11,23 +11,27 @@ #include "../disk-io.h" #include "../block-group.h" -static void free_extent_map_tree(struct extent_map_tree *em_tree) +static int free_extent_map_tree(struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; struct rb_node *node; + int ret = 0; write_lock(&em_tree->lock); - while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { - node = rb_first_cached(&em_tree->map); + while (!RB_EMPTY_ROOT(&em_tree->root)) { + node = rb_first(&em_tree->root); em = rb_entry(node, struct extent_map, rb_node); - remove_extent_mapping(em_tree, em); + remove_extent_mapping(inode, em); #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { + ret = -EINVAL; test_err( -"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d", - em->start, em->len, em->block_start, - em->block_len, refcount_read(&em->refs)); +"em leak: em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu offset %llu) refs %d", + em->start, em->len, em->disk_bytenr, + em->disk_num_bytes, em->offset, + refcount_read(&em->refs)); refcount_set(&em->refs, 1); } @@ -35,6 +39,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) free_extent_map(em); } write_unlock(&em_tree->lock); + + return ret; } /* @@ -53,13 +59,14 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) * ->add_extent_mapping(0, 16K) * -> #handle -EEXIST */ -static int test_case_1(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 start = 0; u64 len = SZ_8K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -70,10 +77,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info, /* Add [0, 16K) */ em->start = 0; em->len = SZ_16K; - em->block_start = 0; - em->block_len = SZ_16K; + em->disk_bytenr = 0; + em->disk_num_bytes = SZ_16K; + em->ram_bytes = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); @@ -91,10 +99,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->start = SZ_16K; em->len = SZ_4K; - em->block_start = SZ_32K; /* avoid merging */ - em->block_len = SZ_4K; + em->disk_bytenr = SZ_32K; /* avoid merging */ + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_4K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); @@ -112,10 +121,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info, /* Add [0, 8K), should return [0, 16K) instead. */ em->start = start; em->len = len; - em->block_start = start; - em->block_len = len; + em->disk_bytenr = start; + em->disk_num_bytes = len; + em->ram_bytes = len; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); @@ -128,16 +138,18 @@ static int test_case_1(struct btrfs_fs_info *fs_info, goto out; } if (em->start != 0 || extent_map_end(em) != SZ_16K || - em->block_start != 0 || em->block_len != SZ_16K) { + em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) { test_err( -"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", +"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu", start, start + len, ret, em->start, em->len, - em->block_start, em->block_len); + em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -148,11 +160,12 @@ out: * Reading the inline ending up with EEXIST, ie. read an inline * extent and discard page cache and read it again. */ -static int test_case_2(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -163,10 +176,11 @@ static int test_case_2(struct btrfs_fs_info *fs_info, /* Add [0, 1K) */ em->start = 0; em->len = SZ_1K; - em->block_start = EXTENT_MAP_INLINE; - em->block_len = (u64)-1; + em->disk_bytenr = EXTENT_MAP_INLINE; + em->disk_num_bytes = 0; + em->ram_bytes = SZ_1K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); @@ -184,10 +198,11 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->start = SZ_4K; em->len = SZ_4K; - em->block_start = SZ_4K; - em->block_len = SZ_4K; + em->disk_bytenr = SZ_4K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_4K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -205,10 +220,11 @@ static int test_case_2(struct btrfs_fs_info *fs_info, /* Add [0, 1K) */ em->start = 0; em->len = SZ_1K; - em->block_start = EXTENT_MAP_INLINE; - em->block_len = (u64)-1; + em->disk_bytenr = EXTENT_MAP_INLINE; + em->disk_num_bytes = 0; + em->ram_bytes = SZ_1K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret) { test_err("case2 [0 1K]: ret %d", ret); @@ -220,26 +236,29 @@ static int test_case_2(struct btrfs_fs_info *fs_info, goto out; } if (em->start != 0 || extent_map_end(em) != SZ_1K || - em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) { + em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( -"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", - ret, em->start, em->len, em->block_start, - em->block_len); +"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", + ret, em->start, em->len, em->disk_bytenr); ret = -EINVAL; } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } static int __test_case_3(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, u64 start) + struct btrfs_inode *inode, u64 start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 len = SZ_4K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -250,10 +269,11 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, /* Add [4K, 8K) */ em->start = SZ_4K; em->len = SZ_4K; - em->block_start = SZ_4K; - em->block_len = SZ_4K; + em->disk_bytenr = SZ_4K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_4K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -271,10 +291,11 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, /* Add [0, 16K) */ em->start = 0; em->len = SZ_16K; - em->block_start = 0; - em->block_len = SZ_16K; + em->disk_bytenr = 0; + em->disk_num_bytes = SZ_16K; + em->ram_bytes = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); if (ret) { test_err("case3 [%llu %llu): ret %d", @@ -292,16 +313,18 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, * em->start. */ if (start < em->start || start + len > extent_map_end(em) || - em->start != em->block_start || em->len != em->block_len) { + em->start != extent_map_block_start(em)) { test_err( -"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)", +"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)", start, start + len, ret, em->start, em->len, - em->block_start, em->block_len); + em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -322,28 +345,29 @@ out: * -> add_extent_mapping() * -> add_extent_mapping() */ -static int test_case_3(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_3(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { int ret; - ret = __test_case_3(fs_info, em_tree, 0); + ret = __test_case_3(fs_info, inode, 0); if (ret) return ret; - ret = __test_case_3(fs_info, em_tree, SZ_8K); + ret = __test_case_3(fs_info, inode, SZ_8K); if (ret) return ret; - ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K)); + ret = __test_case_3(fs_info, inode, (12 * SZ_1K)); return ret; } static int __test_case_4(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, u64 start) + struct btrfs_inode *inode, u64 start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 len = SZ_4K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -354,10 +378,11 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, /* Add [0K, 8K) */ em->start = 0; em->len = SZ_8K; - em->block_start = 0; - em->block_len = SZ_8K; + em->disk_bytenr = 0; + em->disk_num_bytes = SZ_8K; + em->ram_bytes = SZ_8K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); @@ -375,10 +400,11 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, /* Add [8K, 32K) */ em->start = SZ_8K; em->len = 24 * SZ_1K; - em->block_start = SZ_16K; /* avoid merging */ - em->block_len = 24 * SZ_1K; + em->disk_bytenr = SZ_16K; /* avoid merging */ + em->disk_num_bytes = 24 * SZ_1K; + em->ram_bytes = 24 * SZ_1K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); @@ -395,10 +421,11 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, /* Add [0K, 32K) */ em->start = 0; em->len = SZ_32K; - em->block_start = 0; - em->block_len = SZ_32K; + em->disk_bytenr = 0; + em->disk_num_bytes = SZ_32K; + em->ram_bytes = SZ_32K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); if (ret) { test_err("case4 [%llu %llu): ret %d", @@ -413,14 +440,16 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, } if (start < em->start || start + len > extent_map_end(em)) { test_err( -"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)", - start, start + len, ret, em->start, em->len, em->block_start, - em->block_len); +"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)", + start, start + len, ret, em->start, em->len, + em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -450,23 +479,22 @@ out: * # handle -EEXIST when adding * # [0, 32K) */ -static int test_case_4(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_4(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { int ret; - ret = __test_case_4(fs_info, em_tree, 0); + ret = __test_case_4(fs_info, inode, 0); if (ret) return ret; - ret = __test_case_4(fs_info, em_tree, SZ_4K); + ret = __test_case_4(fs_info, inode, SZ_4K); return ret; } -static int add_compressed_extent(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +static int add_compressed_extent(struct btrfs_inode *inode, u64 start, u64 len, u64 block_start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -478,11 +506,12 @@ static int add_compressed_extent(struct btrfs_fs_info *fs_info, em->start = start; em->len = len; - em->block_start = block_start; - em->block_len = SZ_4K; + em->disk_bytenr = block_start; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = len; em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); free_extent_map(em); if (ret < 0) { @@ -534,7 +563,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index) struct rb_node *n; int i; - for (i = 0, n = rb_first_cached(&em_tree->map); + for (i = 0, n = rb_first(&em_tree->root); valid_ranges[index][i].len && n; i++, n = rb_next(n)) { struct extent_map *entry = rb_entry(n, struct extent_map, rb_node); @@ -588,53 +617,44 @@ static int validate_range(struct extent_map_tree *em_tree, int index) * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from * merging the em's. */ -static int test_case_5(struct btrfs_fs_info *fs_info) +static int test_case_5(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { - struct extent_map_tree *em_tree; - struct inode *inode; u64 start, end; int ret; + int ret2; test_msg("Running btrfs_drop_extent_map_range tests"); - inode = btrfs_new_test_inode(); - if (!inode) { - test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; - } - - em_tree = &BTRFS_I(inode)->extent_tree; - /* [0, 12k) */ - ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0); + ret = add_compressed_extent(inode, 0, SZ_4K * 3, 0); if (ret) { test_err("cannot add extent range [0, 12K)"); goto out; } /* [12k, 24k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + ret = add_compressed_extent(inode, SZ_4K * 3, SZ_4K * 3, SZ_4K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [24k, 36k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + ret = add_compressed_extent(inode, SZ_4K * 6, SZ_4K * 3, SZ_8K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [36k, 40k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + ret = add_compressed_extent(inode, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [40k, 64k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + ret = add_compressed_extent(inode, SZ_4K * 10, SZ_4K * 6, SZ_16K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; @@ -643,36 +663,39 @@ static int test_case_5(struct btrfs_fs_info *fs_info) /* Drop [8k, 12k) */ start = SZ_8K; end = (3 * SZ_4K) - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 0); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 0); if (ret) goto out; /* Drop [12k, 20k) */ start = SZ_4K * 3; end = SZ_16K + SZ_4K - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 1); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 1); if (ret) goto out; /* Drop [28k, 32k) */ start = SZ_32K - SZ_4K; end = SZ_32K - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 2); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 2); if (ret) goto out; /* Drop [32k, 64k) */ start = SZ_32K; end = SZ_64K - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 3); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 3); if (ret) goto out; out: - iput(inode); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -681,31 +704,35 @@ out: * for areas between two existing ems. Validate it doesn't do this when there * are two unmerged em's side by side. */ -static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em = NULL; int ret; + int ret2; - ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0); + ret = add_compressed_extent(inode, 0, SZ_4K, 0); if (ret) goto out; - ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0); + ret = add_compressed_extent(inode, SZ_4K, SZ_4K, 0); if (ret) goto out; em = alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); - return -ENOMEM; + ret = -ENOMEM; + goto out; } em->start = SZ_4K; em->len = SZ_4K; - em->block_start = SZ_16K; - em->block_len = SZ_16K; + em->disk_bytenr = SZ_16K; + em->disk_num_bytes = SZ_16K; + em->ram_bytes = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K); write_unlock(&em_tree->lock); if (ret != 0) { @@ -725,7 +752,10 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em ret = 0; out: free_extent_map(em); - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -734,38 +764,30 @@ out: * true would mess up the start/end calculations and subsequent splits would be * incorrect. */ -static int test_case_7(struct btrfs_fs_info *fs_info) +static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { - struct extent_map_tree *em_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; - struct inode *inode; int ret; + int ret2; test_msg("Running btrfs_drop_extent_cache with pinned"); - inode = btrfs_new_test_inode(); - if (!inode) { - test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; - } - - em_tree = &BTRFS_I(inode)->extent_tree; - em = alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); - ret = -ENOMEM; - goto out; + return -ENOMEM; } /* [0, 16K), pinned */ em->start = 0; em->len = SZ_16K; - em->block_start = 0; - em->block_len = SZ_4K; - em->flags |= EXTENT_FLAG_PINNED; + em->disk_bytenr = 0; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_16K; + em->flags |= (EXTENT_FLAG_PINNED | EXTENT_FLAG_COMPRESS_ZLIB); write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -783,10 +805,11 @@ static int test_case_7(struct btrfs_fs_info *fs_info) /* [32K, 48K), not pinned */ em->start = SZ_32K; em->len = SZ_16K; - em->block_start = SZ_32K; - em->block_len = SZ_16K; + em->disk_bytenr = SZ_32K; + em->disk_num_bytes = SZ_16K; + em->ram_bytes = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -798,7 +821,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info) * Drop [0, 36K) This should skip the [0, 4K) extent and then split the * [32K, 48K) extent. */ - btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + btrfs_drop_extent_map_range(inode, 0, (36 * SZ_1K) - 1, true); /* Make sure our extent maps look sane. */ ret = -EINVAL; @@ -847,6 +870,12 @@ static int test_case_7(struct btrfs_fs_info *fs_info) goto out; } + if (extent_map_block_start(em) != SZ_32K + SZ_4K) { + test_err("em->block_start is %llu, expected 36K", + extent_map_block_start(em)); + goto out; + } + free_extent_map(em); read_lock(&em_tree->lock); @@ -860,7 +889,110 @@ static int test_case_7(struct btrfs_fs_info *fs_info) ret = 0; out: free_extent_map(em); - iput(inode); + /* Unpin our extent to prevent warning when removing it below. */ + ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0); + if (ret == 0) + ret = ret2; + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + + return ret; +} + +/* + * Test a regression for compressed extent map adjustment when we attempt to + * add an extent map that is partially overlapped by another existing extent + * map. The resulting extent map offset was left unchanged despite having + * incremented its start offset. + */ +static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + int ret; + int ret2; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Compressed extent for the file range [120K, 128K). */ + em->start = SZ_1K * 120; + em->len = SZ_8K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_8K; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [120K, 128K)"); + goto out; + } + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* + * Compressed extent for the file range [108K, 144K), which overlaps + * with the [120K, 128K) we previously inserted. + */ + em->start = SZ_1K * 108; + em->len = SZ_1K * 36; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_1K * 36; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + + /* + * Try to add the extent map but with a search range of [140K, 144K), + * this should succeed and adjust the extent map to the range + * [128K, 144K), with a length of 16K and an offset of 20K. + * + * This simulates a scenario where in the subvolume tree of an inode we + * have a compressed file extent item for the range [108K, 144K) and we + * have an overlapping compressed extent map for the range [120K, 128K), + * which was created by an encoded write, but its ordered extent was not + * yet completed, so the subvolume tree doesn't have yet the file extent + * item for that range - we only have the extent map in the inode's + * extent map tree. + */ + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [108K, 144K)"); + goto out; + } + + if (em->start != SZ_128K) { + test_err("unexpected extent map start %llu (should be 128K)", em->start); + ret = -EINVAL; + goto out; + } + if (em->len != SZ_16K) { + test_err("unexpected extent map length %llu (should be 16K)", em->len); + ret = -EINVAL; + goto out; + } + if (em->offset != SZ_1K * 20) { + test_err("unexpected extent map offset %llu (should be 20K)", em->offset); + ret = -EINVAL; + goto out; + } +out: + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -954,7 +1086,8 @@ out_free: int btrfs_test_extent_map(void) { struct btrfs_fs_info *fs_info = NULL; - struct extent_map_tree *em_tree; + struct inode *inode; + struct btrfs_root *root = NULL; int ret = 0, i; struct rmap_test_vector rmap_tests[] = { { @@ -1003,33 +1136,45 @@ int btrfs_test_extent_map(void) return -ENOMEM; } - em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); - if (!em_tree) { + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); ret = -ENOMEM; goto out; } - extent_map_tree_init(em_tree); + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + root = NULL; + goto out; + } + + BTRFS_I(inode)->root = root; - ret = test_case_1(fs_info, em_tree); + ret = test_case_1(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_2(fs_info, em_tree); + ret = test_case_2(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_3(fs_info, em_tree); + ret = test_case_3(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_4(fs_info, em_tree); + ret = test_case_4(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_5(fs_info); + ret = test_case_5(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_6(fs_info, em_tree); + ret = test_case_6(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_7(fs_info); + ret = test_case_7(fs_info, BTRFS_I(inode)); + if (ret) + goto out; + ret = test_case_8(fs_info, BTRFS_I(inode)); if (ret) goto out; @@ -1041,7 +1186,8 @@ int btrfs_test_extent_map(void) } out: - kfree(em_tree); + iput(inode); + btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); return ret; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 9957de9f7806..3ea3bc2225fe 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -117,7 +117,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) /* Now for a regular extent */ insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, - disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); + disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; disk_bytenr += sectorsize; offset += sectorsize - 1; @@ -258,14 +258,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } free_extent_map(em); @@ -278,13 +278,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_INLINE) { - test_err("expected an inline, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_INLINE) { + test_err("expected an inline, got %llu", em->disk_bytenr); goto out; } @@ -316,13 +316,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 4) { @@ -339,13 +339,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize - 1) { @@ -358,22 +358,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -386,23 +385,22 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = em->block_start; + disk_bytenr = extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -418,13 +416,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -437,28 +435,28 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("wrong orig offset, want %llu, have %llu", - orig_start, em->orig_start); + if (em->start - em->offset != orig_start) { + test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu", + em->start, em->offset, orig_start); goto out; } disk_bytenr += (em->start - orig_start); - if (em->block_start != disk_bytenr) { + if (extent_map_block_start(em) != disk_bytenr) { test_err("wrong block start, want %llu, have %llu", - disk_bytenr, em->block_start); + disk_bytenr, extent_map_block_start(em)); goto out; } offset = em->start + em->len; free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -472,22 +470,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) prealloc_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -501,23 +498,22 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) prealloc_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = em->block_start; + disk_bytenr = extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_HOLE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_HOLE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -530,27 +526,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("unexpected orig offset, wanted %llu, have %llu", - orig_start, em->orig_start); + if (em->start - em->offset != orig_start) { + test_err("unexpected offset, wanted %llu, have %llu", + em->start - orig_start, em->offset); goto out; } - if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + if (extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + (em->start - em->orig_start), - em->block_start); + disk_bytenr + em->offset, extent_map_block_start(em)); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -564,28 +559,27 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) prealloc_only, em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("wrong orig offset, want %llu, have %llu", orig_start, - em->orig_start); + if (em->start - em->offset != orig_start) { + test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu", + em->start, em->offset, orig_start); goto out; } - if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + if (extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + (em->start - em->orig_start), - em->block_start); + disk_bytenr + em->offset, extent_map_block_start(em)); goto out; } offset = em->start + em->len; free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -599,9 +593,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) compressed_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", - em->start, em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { @@ -613,13 +606,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -633,9 +626,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) compressed_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", - em->start, em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { @@ -643,18 +635,18 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } - disk_bytenr = em->block_start; + disk_bytenr = extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -667,22 +659,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != disk_bytenr) { + if (extent_map_block_start(em) != disk_bytenr) { test_err("block start does not match, want %llu got %llu", - disk_bytenr, em->block_start); + disk_bytenr, extent_map_block_start(em)); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -696,9 +687,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) compressed_only, em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("wrong orig offset, want %llu, have %llu", - em->start, orig_start); + if (em->start - em->offset != orig_start) { + test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu", + em->start, em->offset, orig_start); goto out; } if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { @@ -710,13 +701,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -729,21 +720,20 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole extent, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole extent, got %llu", em->disk_bytenr); goto out; } /* @@ -762,21 +752,20 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) vacancy_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -789,9 +778,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong orig offset, want 0, have %llu", em->offset); goto out; } ret = 0; @@ -850,13 +838,13 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } if (em->start != 0 || em->len != sectorsize) { @@ -872,13 +860,13 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != sectorsize) { - test_err("expected a real extent, got %llu", em->block_start); + if (extent_map_block_start(em) != sectorsize) { + test_err("expected a real extent, got %llu", extent_map_block_start(em)); goto out; } if (em->start != sectorsize || em->len != sectorsize) { diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c new file mode 100644 index 000000000000..a7bc58a5c1e2 --- /dev/null +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -0,0 +1,1161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Western Digital Corporation or its affiliates. + */ + +#include <linux/sizes.h> +#include "../fs.h" +#include "../disk-io.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../raid-stripe-tree.h" +#include "btrfs-tests.h" + +#define RST_TEST_NUM_DEVICES (2) +#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) + +#define SZ_48K (SZ_32K + SZ_16K) + +typedef int (*test_func_t)(struct btrfs_trans_handle *trans); + +static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, + u64 devid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (dev->devid == devid) + return dev; + } + + return NULL; +} + +/* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + u64 hole_start = logical1 + SZ_256K; + u64 hole_len = SZ_2M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 + 256K and 2M in length. Extent + * 1 is truncated to 256k length, extent 2 is completely dropped and + * extent 3 is moved 256K to the right. + */ + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + /* Get the first extent and check its size. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_256K, len1); + ret = -EINVAL; + goto out; + } + + /* Get the second extent and check it's absent. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + /* Get the third extent and check its size. */ + logical3 += SZ_256K; + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3 + SZ_256K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M - SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M - SZ_256K, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 and 2M in length. Extents 1 + * and 2 are dropped and extent 3 is kept as is. + */ + ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1 + len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical1, len1); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical2, len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* Test punching a hole into a single RAID stripe-extent. */ +static int test_punch_hole(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 hole_start = logical1 + SZ_32K; + u64 hole_len = SZ_64K; + u64 logical2 = hole_start + hole_len; + u64 len = SZ_1M; + u64 len1 = SZ_32K; + u64 len2 = len - len1 - hole_len; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_1M) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_1M, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len1); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical2, + logical2 + len2); + goto out; + } + + if (io_stripe.physical != logical2) { + test_err("invalid physical address, expected %llu, got %llu", + logical2, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len2 != len - len1 - hole_len) { + test_err("invalid length, expected %llu, got %llu", + len - len1 - hole_len, len2); + ret = -EINVAL; + goto out; + } + + /* Check for the absence of the hole. */ + ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + hole_start, hole_start + SZ_64K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2, len2); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 1M RST write that spans two adjacent RST items on disk and then + * delete a portion starting in the first item and spanning into the second + * item. This is similar to test_front_delete(), but spanning multiple items. + */ +static int test_front_delete_prev_item(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 logical2 = SZ_2M; + u64 len = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + /* Insert RAID extent 1. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + /* Insert RAID extent 2, directly adjacent to it. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1 + SZ_512K, (u64)SZ_1M); + goto out; + } + + /* Verify item 1 is truncated to 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify item 2's start is moved by 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical2 + SZ_512K, logical2 + len); + goto out; + } + + if (io_stripe.physical != logical2 + SZ_512K) { + test_err("invalid physical address, expected %llu got %llu", + logical2 + SZ_512K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify there's a hole at [1M+512K, 2M+512K] . */ + len = SZ_1M; + ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID [%llu, %llu] succeeded, should fail", + logical1 + SZ_512K, logical1 + SZ_512K + len); + goto out; + } + + /* Clean up after us. */ + ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * delete the 1st 32K, making the new start address 1M+32K. + */ +static int test_front_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, SZ_16K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + SZ_16K); + goto out; + } + + len -= SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical + SZ_16K, logical + SZ_64K); + goto out; + } + + if (io_stripe.physical != logical + SZ_16K) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_16K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_48K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical, logical + SZ_16K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * truncate the stripe extent down to 32K. + */ +static int test_tail_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical + SZ_48K, logical + SZ_64K); + goto out; + } + + len = SZ_48K; + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu, got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_48K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + len = SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical + SZ_48K, logical + SZ_64K); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * overwrite the whole range giving it new physical address at an offset of 1G. + * The intent of this test is to exercise the 'update_raid_extent_item()' + * function called be btrfs_insert_one_raid_extent(). + */ +static int test_create_update_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = SZ_1G + logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("updating RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_1G) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_1G, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. + * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. + */ +static int test_simple_create_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + bioc->map_type = map_type; + bioc->size = SZ_64K; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static const test_func_t tests[] = { + test_simple_create_delete, + test_create_update_delete, + test_tail_delete, + test_front_delete, + test_front_delete_prev_item, + test_punch_hole, + test_punch_hole_3extents, + test_delete_two_extents, +}; + +static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + btrfs_set_super_incompat_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + fs_info->stripe_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_device *dev; + + dev = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + dev->devid = i; + } + + btrfs_init_dummy_trans(&trans, root->fs_info); + ret = test(&trans); + if (ret) + goto out; + +out: + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + + return ret; +} + +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) +{ + int ret = 0; + + test_msg("running raid-stripe-tree tests"); + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + ret = run_test(tests[i], sectorsize, nodesize); + if (ret) { + test_err("test-case %ps failed with %d\n", tests[i], ret); + goto out; + } + } + +out: + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bf8e64c766b6..aca83a98b75a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,12 +23,10 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" -#include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" @@ -143,10 +141,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.href_root.rb_root)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.dirty_extent_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); + WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -278,8 +274,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); @@ -352,9 +350,8 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); - cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; - cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; - atomic_set(&cur_trans->delayed_refs.num_entries, 0); + xa_init(&cur_trans->delayed_refs.head_refs); + xa_init(&cur_trans->delayed_refs.dirty_extents); /* * although the tree mod log is per file system and not per transaction, @@ -407,7 +404,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int ret = 0; if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - root->last_trans < trans->transid) || force) { + btrfs_get_root_last_trans(root) < trans->transid) || force) { WARN_ON(!force && root->commit_root != root->node); /* @@ -423,15 +420,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, smp_wmb(); spin_lock(&fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid && !force) { + if (btrfs_get_root_last_trans(root) == trans->transid && !force) { spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } radix_tree_tag_set(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); - root->last_trans = trans->transid; + btrfs_set_root_last_trans(root, trans->transid); /* this is pretty tricky. We don't want to * take the relocation lock in btrfs_record_root_in_trans @@ -474,7 +471,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, /* Make sure we don't try to update the root at commit time */ spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); } @@ -493,7 +490,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, * and barriers */ smp_rmb(); - if (root->last_trans == trans->transid && + if (btrfs_get_root_last_trans(root) == trans->transid && !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; @@ -552,7 +549,7 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) if (!fs_info->reloc_ctl || !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; @@ -747,14 +744,6 @@ again: h->reloc_reserved = reloc_reserved; } - /* - * Now that we have found a transaction to be a part of, convert the - * qgroup reservation from prealloc to pertrans. A different transaction - * can't race in and free our pertrans out from under us. - */ - if (qgroup_reserved) - btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); - got_it: if (!current->journal_info) current->journal_info = h; @@ -788,8 +777,15 @@ got_it: * not just freed. */ btrfs_end_transaction(h); - return ERR_PTR(ret); + goto reserve_fail; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); return h; @@ -801,8 +797,7 @@ alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, - delayed_refs_bytes); + btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); @@ -1055,7 +1050,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - int err = 0; + int ret = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -1094,13 +1089,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) - err = trans->aborted; + ret = trans->aborted; else - err = -EROFS; + ret = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); - return err; + return ret; } int btrfs_end_transaction(struct btrfs_trans_handle *trans) @@ -1121,8 +1116,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { - int err = 0; - int werr = 0; + int ret = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; @@ -1132,7 +1126,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, mark, &cached_state)) { bool wait_writeback = false; - err = convert_extent_bit(dirty_pages, start, end, + ret = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, &cached_state); /* @@ -1148,22 +1142,22 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, * We cleanup any entries left in the io tree when committing * the transaction (through extent_io_tree_release()). */ - if (err == -ENOMEM) { - err = 0; + if (ret == -ENOMEM) { + ret = 0; wait_writeback = true; } - if (!err) - err = filemap_fdatawrite_range(mapping, start, end); - if (err) - werr = err; - else if (wait_writeback) - werr = filemap_fdatawait_range(mapping, start, end); + if (!ret) + ret = filemap_fdatawrite_range(mapping, start, end); + if (!ret && wait_writeback) + ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); + if (ret) + break; cached_state = NULL; cond_resched(); start = end + 1; } - return werr; + return ret; } /* @@ -1175,12 +1169,11 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - int err = 0; - int werr = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + int ret = 0; while (find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { @@ -1192,22 +1185,20 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ - err = clear_extent_bit(dirty_pages, start, end, + ret = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, &cached_state); - if (err == -ENOMEM) - err = 0; - if (!err) - err = filemap_fdatawait_range(mapping, start, end); - if (err) - werr = err; + if (ret == -ENOMEM) + ret = 0; + if (!ret) + ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); + if (ret) + break; cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; - return werr; + return ret; } static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, @@ -1232,7 +1223,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) bool errors = false; int err; - ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); err = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY) && @@ -1495,8 +1486,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) ASSERT(atomic_read(&root->log_commit[1]) == 0); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); + btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); @@ -1521,7 +1513,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1586,8 +1577,8 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, goto out; /* Now qgroup are all updated, we can inherit it to new qgroups */ - ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, - parent->root_key.objectid, inherit); + ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid, + btrfs_root_id(parent), inherit); if (ret < 0) goto out; @@ -1644,7 +1635,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = pending->dir; + struct inode *parent_inode = &pending->dir->vfs_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1825,7 +1816,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert root back/forward references */ ret = btrfs_add_root_ref(trans, objectid, - parent_root->root_key.objectid, + btrfs_root_id(parent_root), btrfs_ino(BTRFS_I(parent_inode)), index, &fname.disk_name); if (ret) { @@ -1858,16 +1849,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = qgroup_account_snapshot(trans, root, parent_root, pending->inherit, objectid); else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) - ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid, - parent_root->root_key.objectid, pending->inherit); + ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid, + btrfs_root_id(parent_root), pending->inherit); if (ret < 0) goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, index); - /* We have check then name at the beginning, so it is impossible. */ - BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1959,19 +1948,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->uuid_tree_generation = root_item->generation; } -int btrfs_transaction_in_commit(struct btrfs_fs_info *info) -{ - struct btrfs_transaction *trans; - int ret = 0; - - spin_lock(&info->trans_lock); - trans = info->running_transaction; - if (trans) - ret = (trans->state >= TRANS_STATE_COMMIT_START); - spin_unlock(&info->trans_lock); - return ret; -} - int btrfs_transaction_blocked(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; @@ -2011,6 +1987,25 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) btrfs_put_transaction(cur_trans); } +/* + * If there is a running transaction commit it or if it's already committing, + * wait for its commit to complete. Does not start and commit a new transaction + * if there isn't any running. + */ +int btrfs_commit_current_transaction(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + int ret = PTR_ERR(trans); + + return (ret == -ENOENT) ? 0 : ret; + } + + return btrfs_commit_transaction(trans); +} + static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2056,7 +2051,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) spin_unlock(&fs_info->trans_lock); - btrfs_cleanup_one_transaction(trans->transaction, fs_info); + btrfs_cleanup_one_transaction(trans->transaction); spin_lock(&fs_info->trans_lock); if (cur_trans == fs_info->running_transaction) @@ -2132,7 +2127,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } /* @@ -2641,7 +2636,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); + btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root)); btrfs_kill_all_delayed_nodes(root); @@ -2686,9 +2681,7 @@ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, int __init btrfs_transaction_init(void) { - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY); if (!btrfs_trans_handle_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2bf8bbdfd0b3..9f7c777af635 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -6,13 +6,34 @@ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H +#include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/time64.h> +#include <linux/mutex.h> +#include <linux/wait.h> #include "btrfs_inode.h" #include "delayed-ref.h" -#include "ctree.h" +#include "extent-io-tree.h" +#include "block-rsv.h" +#include "messages.h" #include "misc.h" -/* Radix-tree tag for roots that are part of the trasaction. */ +struct dentry; +struct inode; +struct btrfs_pending_snapshot; +struct btrfs_fs_info; +struct btrfs_root_item; +struct btrfs_root; +struct btrfs_path; + +/* + * Signal that a direct IO write is in progress, to avoid deadlock for sync + * direct IO writes when fsync is called during the direct IO write path. + */ +#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) + +/* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { @@ -157,7 +178,7 @@ struct btrfs_trans_handle { struct btrfs_pending_snapshot { struct dentry *dentry; - struct inode *dir; + struct btrfs_inode *dir; struct btrfs_root *root; struct btrfs_root_item *root_item; struct btrfs_root *snap; @@ -206,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int error); +/* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +static inline bool btrfs_abort_should_print_stack(int error) +{ + switch (error) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} /* * Call btrfs_abort_transaction as early as possible when an error condition is @@ -214,12 +249,12 @@ bool __cold abort_should_print_stack(int error); */ #define btrfs_abort_transaction(trans, error) \ do { \ - bool first = false; \ + bool __first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(error), \ + __first = true; \ + if (WARN(btrfs_abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (error))) { \ @@ -231,7 +266,7 @@ do { \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (error), first); \ + __LINE__, (error), __first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); @@ -253,6 +288,7 @@ void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); +int btrfs_commit_current_transaction(struct btrfs_root *root); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); @@ -262,7 +298,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); -int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6eccf8496486..43979891f7c8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -21,7 +21,6 @@ #include "messages.h" #include "ctree.h" #include "tree-checker.h" -#include "disk-io.h" #include "compression.h" #include "volumes.h" #include "misc.h" @@ -30,7 +29,6 @@ #include "file-item.h" #include "inode-item.h" #include "dir-item.h" -#include "raid-stripe-tree.h" #include "extent-tree.h" /* @@ -67,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -94,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -154,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -339,6 +340,24 @@ static int check_extent_data_item(struct extent_buffer *leaf, } } + /* + * For non-compressed data extents, ram_bytes should match its + * disk_num_bytes. + * However we do not really utilize ram_bytes in this case, so this check + * is only optional for DEBUG builds for developers to catch the + * unexpected behaviors. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG) && + btrfs_file_extent_compression(leaf, fi) == BTRFS_COMPRESS_NONE && + btrfs_file_extent_disk_bytenr(leaf, fi)) { + if (WARN_ON(btrfs_file_extent_ram_bytes(leaf, fi) != + btrfs_file_extent_disk_num_bytes(leaf, fi))) + file_extent_err(leaf, slot, +"mismatch ram_bytes (%llu) and disk_num_bytes (%llu) for non-compressed extent", + btrfs_file_extent_ram_bytes(leaf, fi), + btrfs_file_extent_disk_num_bytes(leaf, fi)); + } + return 0; } @@ -550,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf, /* dir type check */ dir_type = btrfs_dir_ftype(leaf, di); - if (unlikely(dir_type >= BTRFS_FT_MAX)) { + if (unlikely(dir_type <= BTRFS_FT_UNKNOWN || + dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, - "invalid dir item type, have %u expect [0, %u)", + "invalid dir item type, have %u expect (0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; } @@ -615,7 +635,7 @@ static int check_dir_item(struct extent_buffer *leaf, */ if (key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_XATTR_ITEM_KEY) { - char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + char namebuf[MAX(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); @@ -649,6 +669,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -743,22 +764,19 @@ static int check_block_group_item(struct extent_buffer *leaf, return 0; } -__printf(4, 5) +__printf(5, 6) __cold -static void chunk_err(const struct extent_buffer *leaf, +static void chunk_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, const char *fmt, ...) { - const struct btrfs_fs_info *fs_info = leaf->fs_info; - bool is_sb; + bool is_sb = !leaf; struct va_format vaf; va_list args; int i; int slot = -1; - /* Only superblock eb is able to have such small offset */ - is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); - if (!is_sb) { /* * Get the slot number by iterating through all slots, this @@ -791,13 +809,17 @@ static void chunk_err(const struct extent_buffer *leaf, /* * The common chunk check which could also work on super block sys chunk array. * + * If @leaf is NULL, then @chunk must be an on-stack chunk item. + * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable) + * * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. */ -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize) { - struct btrfs_fs_info *fs_info = leaf->fs_info; u64 length; u64 chunk_end; u64 stripe_len; @@ -805,63 +827,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, u16 sub_stripes; u64 type; u64 features; + u32 chunk_sector_size; bool mixed = false; int raid_index; int nparity; int ncopies; - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); + if (leaf) { + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk); + } else { + length = btrfs_stack_chunk_length(chunk); + stripe_len = btrfs_stack_chunk_stripe_len(chunk); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + sub_stripes = btrfs_stack_chunk_sub_stripes(chunk); + type = btrfs_stack_chunk_type(chunk); + chunk_sector_size = btrfs_stack_chunk_sector_size(chunk); + } raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; if (unlikely(!num_stripes)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } if (unlikely(num_stripes < ncopies)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } if (unlikely(nparity && num_stripes == nparity)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes == nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!IS_ALIGNED(logical, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", - logical, fs_info->sectorsize); + logical, sectorsize); return -EUCLEAN; } - if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { - chunk_err(leaf, chunk, logical, + if (unlikely(chunk_sector_size != sectorsize)) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", - btrfs_chunk_sector_size(leaf, chunk), - fs_info->sectorsize); + chunk_sector_size, sectorsize); return -EUCLEAN; } - if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } if (unlikely(check_add_overflow(logical, length, &chunk_end))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical start and length, have logical start %llu length %llu", logical, length); return -EUCLEAN; } if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; @@ -875,30 +907,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * Thus it should be a good way to catch obvious bitflips. */ if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "chunk length too large: have %llu limit %llu", length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); + BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); return -EUCLEAN; } if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; @@ -907,7 +938,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); return -EUCLEAN; @@ -920,7 +951,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (!mixed) { if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } @@ -942,7 +973,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -962,14 +993,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), - BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } @@ -980,14 +1012,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, if (unlikely(btrfs_chunk_item_size(num_stripes) != btrfs_item_size(leaf, slot))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } out: - return btrfs_check_chunk_valid(leaf, chunk, key->offset); + return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset, + fs_info->sectorsize); } __printf(3, 4) @@ -1005,6 +1038,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1260,6 +1294,7 @@ static void extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1267,6 +1302,19 @@ static void extent_err(const struct extent_buffer *eb, int slot, va_end(args); } +static bool is_valid_dref_root(u64 rootid) +{ + /* + * The following tree root objectids are allowed to have a data backref: + * - subvolume trees + * - data reloc tree + * - tree root + * For v1 space cache + */ + return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || + rootid == BTRFS_ROOT_TREE_OBJECTID; +} + static int check_extent_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) @@ -1419,6 +1467,8 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u64 seq; + u64 dref_root; + u64 dref_objectid; u64 dref_offset; u64 inline_offset; u8 inline_type; @@ -1462,11 +1512,26 @@ static int check_extent_item(struct extent_buffer *leaf, */ case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); + dref_root = btrfs_extent_data_ref_root(leaf, dref); + dref_objectid = btrfs_extent_data_ref_objectid(leaf, dref); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); seq = hash_extent_data_ref( btrfs_extent_data_ref_root(leaf, dref), btrfs_extent_data_ref_objectid(leaf, dref), btrfs_extent_data_ref_offset(leaf, dref)); + if (unlikely(!is_valid_dref_root(dref_root))) { + extent_err(leaf, slot, + "invalid data ref root value %llu", + dref_root); + return -EUCLEAN; + } + if (unlikely(dref_objectid < BTRFS_FIRST_FREE_OBJECTID || + dref_objectid > BTRFS_LAST_FREE_OBJECTID)) { + extent_err(leaf, slot, + "invalid data ref objectid value %llu", + dref_objectid); + return -EUCLEAN; + } if (unlikely(!IS_ALIGNED(dref_offset, fs_info->sectorsize))) { extent_err(leaf, slot, @@ -1474,6 +1539,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1486,6 +1556,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1558,8 +1633,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1605,6 +1690,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return -EUCLEAN; } for (; ptr < end; ptr += sizeof(*dref)) { + u64 root; + u64 objectid; u64 offset; /* @@ -1612,13 +1699,33 @@ static int check_extent_data_ref(struct extent_buffer *leaf, * overflow from the leaf due to hash collisions. */ dref = (struct btrfs_extent_data_ref *)ptr; + root = btrfs_extent_data_ref_root(leaf, dref); + objectid = btrfs_extent_data_ref_objectid(leaf, dref); offset = btrfs_extent_data_ref_offset(leaf, dref); + if (unlikely(!is_valid_dref_root(root))) { + extent_err(leaf, slot, + "invalid extent data backref root value %llu", + root); + return -EUCLEAN; + } + if (unlikely(objectid < BTRFS_FIRST_FREE_OBJECTID || + objectid > BTRFS_LAST_FREE_OBJECTID)) { + extent_err(leaf, slot, + "invalid extent data backref objectid value %llu", + root); + return -EUCLEAN; + } if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } @@ -1678,9 +1785,6 @@ static int check_inode_ref(struct extent_buffer *leaf, static int check_raid_stripe_extent(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot) { - struct btrfs_stripe_extent *stripe_extent = - btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); - if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for raid stripe extent, have %llu expect aligned to %u", @@ -1694,22 +1798,72 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf, return -EUCLEAN; } - switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) { - case BTRFS_STRIPE_RAID0: - case BTRFS_STRIPE_RAID1: - case BTRFS_STRIPE_DUP: - case BTRFS_STRIPE_RAID10: - case BTRFS_STRIPE_RAID5: - case BTRFS_STRIPE_RAID6: - case BTRFS_STRIPE_RAID1C3: - case BTRFS_STRIPE_RAID1C4: - break; - default: - generic_err(leaf, slot, "invalid raid stripe encoding %u", - btrfs_stripe_extent_encoding(leaf, stripe_extent)); + return 0; +} + +static int check_dev_extent_item(const struct extent_buffer *leaf, + const struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) +{ + struct btrfs_dev_extent *de; + const u32 sectorsize = leaf->fs_info->sectorsize; + + de = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + /* Basic fixed member checks. */ + if (unlikely(btrfs_dev_extent_chunk_tree(leaf, de) != + BTRFS_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk tree id, has %llu expect %llu", + btrfs_dev_extent_chunk_tree(leaf, de), + BTRFS_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + if (unlikely(btrfs_dev_extent_chunk_objectid(leaf, de) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk objectid, has %llu expect %llu", + btrfs_dev_extent_chunk_objectid(leaf, de), + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + /* Alignment check. */ + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent key.offset, has %llu not aligned to %u", + key->offset, sectorsize); return -EUCLEAN; } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_chunk_offset(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent chunk offset, has %llu not aligned to %u", + btrfs_dev_extent_chunk_objectid(leaf, de), + sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_length(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent length, has %llu not aligned to %u", + btrfs_dev_extent_length(leaf, de), sectorsize); + return -EUCLEAN; + } + /* Overlap check with previous dev extent. */ + if (slot && prev_key->objectid == key->objectid && + prev_key->type == key->type) { + struct btrfs_dev_extent *prev_de; + u64 prev_len; + prev_de = btrfs_item_ptr(leaf, slot - 1, struct btrfs_dev_extent); + prev_len = btrfs_dev_extent_length(leaf, prev_de); + if (unlikely(prev_key->offset + prev_len > key->offset)) { + generic_err(leaf, slot, + "dev extent overlap, prev offset %llu len %llu current offset %llu", + prev_key->objectid, prev_len, key->offset); + return -EUCLEAN; + } + } return 0; } @@ -1749,6 +1903,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); break; + case BTRFS_DEV_EXTENT_KEY: + ret = check_dev_extent_item(leaf, key, slot, prev_key); + break; case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(leaf, key, slot); break; @@ -1793,6 +1950,11 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_LEVEL; } + if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an @@ -1854,6 +2016,7 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; + enum btrfs_tree_block_status ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1909,21 +2072,10 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } - /* - * We only want to do this if WRITTEN is set, otherwise the leaf - * may be in some intermediate state and won't appear valid. - */ - if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { - enum btrfs_tree_block_status ret; - - /* - * Check if the item size and content meet other - * criteria - */ - ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) - return ret; - } + /* Check if the item size and content meet other criteria. */ + ret = check_leaf_item(leaf, &key, slot, &prev_key); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return ret; prev_key.objectid = key.objectid; prev_key.type = key.type; @@ -1953,6 +2105,11 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) int level = btrfs_header_level(node); u64 bytenr; + if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(node, 0, "invalid flag for node, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", @@ -2017,7 +2174,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) * Skip dummy fs, as selftests don't create unique ebs for each dummy * root. */ - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + if (btrfs_is_testing(eb->fs_info)) return 0; /* * There are several call sites (backref walking, qgroup, and data @@ -2063,8 +2220,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; @@ -2072,16 +2229,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, int ret; found_level = btrfs_header_level(eb); - if (found_level != level) { + if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); + eb->start, check->level, found_level); return -EIO; } - if (!first_key) + if (!check->has_first_key) return 0; /* @@ -2106,15 +2263,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, + eb->start, check->transid, check->first_key.objectid, + check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 14b9fbe82da4..eb201f4ec3c7 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,10 +6,13 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H +#include <linux/types.h> #include <uapi/linux/btrfs_tree.h> struct extent_buffer; +struct btrfs_fs_info; struct btrfs_chunk; +struct btrfs_key; /* All the extra info needed to verify the parentness of a tree block. */ struct btrfs_tree_parent_check { @@ -51,6 +54,7 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, BTRFS_TREE_BLOCK_INVALID_ITEM, BTRFS_TREE_BLOCK_INVALID_OWNER, + BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, }; /* @@ -63,10 +67,12 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 331fc7429952..955d1677e865 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -13,13 +13,11 @@ #include "tree-log.h" #include "disk-io.h" #include "locking.h" -#include "print-tree.h" #include "backref.h" #include "compression.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" @@ -140,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ +static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +{ + unsigned int nofs_flag; + struct inode *inode; + + /* + * We're holding a transaction handle whether we are logging or + * replaying a log tree, so we must make sure NOFS semantics apply + * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL + * to allocate an inode, which can recurse back into the filesystem and + * attempt a transaction commit, resulting in a deadlock. + */ + nofs_flag = memalloc_nofs_save(); + inode = btrfs_iget(objectid, root); + memalloc_nofs_restore(nofs_flag); + + return inode; +} + /* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people @@ -393,7 +410,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * the leaf before writing into the log tree. See the comments at * copy_items() for more details. */ - ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); @@ -573,7 +590,6 @@ insert: } } no_copy: - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -602,7 +618,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, { struct inode *inode; - inode = btrfs_iget(root->fs_info->sb, objectid, root); + inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -750,7 +766,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; if (ins.objectid > 0) { - struct btrfs_ref ref = { 0 }; u64 csum_start; u64 csum_end; LIST_HEAD(ordered_sums); @@ -764,13 +779,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret < 0) { goto out; } else if (ret == 0) { - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, - ins.objectid, ins.offset, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - key->objectid, offset, 0, false); + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, key->objectid, offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; @@ -780,7 +797,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * allocation tree */ ret = btrfs_alloc_logged_file_extent(trans, - root->root_key.objectid, + btrfs_root_id(root), key->objectid, offset, &ins); if (ret) goto out; @@ -799,9 +816,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, - &ordered_sums, 0, false); - if (ret) + &ordered_sums, false); + if (ret < 0) goto out; + ret = 0; /* * Now delete all existing cums in the csum root that * cover our range. We do this because we can have an @@ -1355,7 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1625,7 +1643,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (ret) goto out; } - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; if (inode->i_nlink == 0) { if (S_ISDIR(inode->i_mode)) { @@ -1825,7 +1844,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2105,7 +2124,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct inode *inode = NULL; struct btrfs_key location; @@ -2820,6 +2839,52 @@ static void wait_for_writer(struct btrfs_root *root) finish_wait(&root->log_writer_wait, &wait); } +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->log_new_dentries = false; + ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; + ctx->logged_before = false; + ctx->inode = inode; + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; + ctx->scratch_eb = NULL; +} + +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = ctx->inode; + + if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return; + + /* + * Don't care about allocation failure. This is just for optimization, + * if we fail to allocate here, we will try again later if needed. + */ + ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); +} + +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + btrfs_assert_inode_locked(ctx->inode); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + + static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { @@ -3001,7 +3066,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret != -ENOSPC) btrfs_err(fs_info, "failed to update log for root %llu ret %d", - root->root_key.objectid, ret); + btrfs_root_id(root), ret); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); goto out; @@ -3522,7 +3587,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -3619,6 +3683,30 @@ out: return ret; } +static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) +{ + const int slot = path->slots[0]; + + if (ctx->scratch_eb) { + copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); + } else { + ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!ctx->scratch_eb) + return -ENOMEM; + } + + btrfs_release_path(path); + path->nodes[0] = ctx->scratch_eb; + path->slots[0] = slot; + /* + * Add extra ref to scratch eb so that it is not freed when callers + * release the path, so we can reuse it later if needed. + */ + atomic_inc(&ctx->scratch_eb->refs); + + return 0; +} + static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, @@ -3633,23 +3721,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, bool last_found = false; int batch_start = 0; int batch_size = 0; - int i; + int ret; /* * We need to clone the leaf, release the read lock on it, and use the * clone before modifying the log tree. See the comment at copy_items() * about why we need to do this. */ - src = btrfs_clone_extent_buffer(path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(path, ctx); + if (ret < 0) + return ret; - i = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = src; - path->slots[0] = i; + src = path->nodes[0]; - for (; i < nritems; i++) { + for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4167,8 +4252,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool inode_item_dropped) { struct btrfs_inode_item *inode_item; + struct btrfs_key key; int ret; + btrfs_get_inode_key(inode, &key); /* * If we are doing a fast fsync and the inode was logged before in the * current transaction, then we know the inode was previously logged and @@ -4180,7 +4267,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, * already exists can also result in unnecessarily splitting a leaf. */ if (!inode_item_dropped && inode->logged_trans == trans->transid) { - ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); ASSERT(ret <= 0); if (ret > 0) ret = -ENOENT; @@ -4194,7 +4281,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime * flags and set ->logged_trans to 0. */ - ret = btrfs_insert_empty_item(trans, log, path, &inode->location, + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*inode_item)); ASSERT(ret != -EEXIST); } @@ -4259,17 +4346,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct btrfs_path *src_path, int start_slot, int nr, int inode_only, - u64 logged_isize) + u64 logged_isize, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; struct extent_buffer *src; - int ret = 0; + int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; - int i; int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4302,14 +4388,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * while the other is holding the delayed node's mutex and wants to * write lock the same subvolume leaf for flushing delayed items. */ - src = btrfs_clone_extent_buffer(src_path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(src_path, ctx); + if (ret < 0) + return ret; - i = src_path->slots[0]; - btrfs_release_path(src_path); - src_path->nodes[0] = src; - src_path->slots[0] = i; + src = src_path->nodes[0]; ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4324,7 +4407,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = 0; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; struct btrfs_root *csum_root; struct btrfs_ordered_sum *sums; @@ -4399,9 +4482,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, disk_bytenr += extent_offset; ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0, false); - if (ret) + &ordered_sums, false); + if (ret < 0) goto out; + ret = 0; list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { if (!ret) @@ -4431,7 +4515,7 @@ add_to_batch: goto out; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; const int dst_slot = dst_path->slots[0] + dst_index; struct btrfs_key key; @@ -4480,7 +4564,6 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4511,16 +4594,17 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, { struct btrfs_ordered_extent *ordered; struct btrfs_root *csum_root; + u64 block_start; u64 csum_offset; u64 csum_len; - u64 mod_start = em->mod_start; - u64 mod_len = em->mod_len; + u64 mod_start = em->start; + u64 mod_len = em->len; LIST_HEAD(ordered_sums); int ret = 0; if (inode->flags & BTRFS_INODE_NODATASUM || (em->flags & EXTENT_FLAG_PREALLOC) || - em->block_start == EXTENT_MAP_HOLE) + em->disk_bytenr == EXTENT_MAP_HOLE) return 0; list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { @@ -4584,19 +4668,21 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, /* If we're compressed we have to save the entire range of csums. */ if (extent_map_is_compressed(em)) { csum_offset = 0; - csum_len = max(em->block_len, em->orig_block_len); + csum_len = em->disk_num_bytes; } else { csum_offset = mod_start - em->start; csum_len = mod_len; } /* block start is already adjusted for the file extent offset. */ - csum_root = btrfs_csum_root(trans->fs_info, em->block_start); - ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, - em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0, false); - if (ret) + block_start = extent_map_block_start(em); + csum_root = btrfs_csum_root(trans->fs_info, block_start); + ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset, + block_start + csum_offset + csum_len - 1, + &ordered_sums, false); + if (ret < 0) return ret; + ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, @@ -4623,7 +4709,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; enum btrfs_compression_type compress_type; - u64 extent_offset = em->start - em->orig_start; + u64 extent_offset = em->offset; + u64 block_start = extent_map_block_start(em); u64 block_len; int ret; @@ -4633,14 +4720,13 @@ static int log_one_extent(struct btrfs_trans_handle *trans, else btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); - block_len = max(em->block_len, em->orig_block_len); + block_len = em->disk_num_bytes; compress_type = extent_map_compression(em); if (compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); + btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - - extent_offset); + } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); } @@ -4687,7 +4773,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -4704,7 +4789,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, */ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_key key; @@ -4770,7 +4856,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) goto out; ins_nr = 0; @@ -4794,18 +4880,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (!dropped_extents) { - /* - * Avoid logging extent items logged in past fsync calls - * and leading to duplicate keys in the log tree. - */ + /* + * Avoid overlapping items in the log tree. The first time we + * get here, get rid of everything from a past fsync. After + * that, if the current extent starts before the end of the last + * extent we copied, truncate the last one. This can happen if + * an ordered extent completion modifies the subvolume tree + * while btrfs_next_leaf() has the tree unlocked. + */ + if (!dropped_extents || key.offset < truncate_offset) { ret = truncate_inode_items(trans, root->log_root, inode, - truncate_offset, + min(key.offset, truncate_offset), BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; } + truncate_offset = btrfs_file_extent_end(path); if (ins_nr == 0) start_slot = slot; ins_nr++; @@ -4820,7 +4911,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4883,7 +4974,7 @@ process: * private list. */ if (ret) { - clear_em_logging(tree, em); + clear_em_logging(inode, em); free_extent_map(em); continue; } @@ -4892,14 +4983,14 @@ process: ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); - clear_em_logging(tree, em); + clear_em_logging(inode, em); free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); if (!ret) - ret = btrfs_log_prealloc_extents(trans, inode, path); + ret = btrfs_log_prealloc_extents(trans, inode, path, ctx); if (ret) return ret; @@ -4980,7 +5071,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; int ret; @@ -5009,7 +5101,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5035,7 +5127,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; } @@ -5366,7 +5458,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = start_inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5427,7 +5518,7 @@ again: continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + di_inode = btrfs_iget_logging(di_key.objectid, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto out; @@ -5487,7 +5578,7 @@ again: btrfs_add_delayed_iput(curr_inode); curr_inode = NULL; - vfs_inode = btrfs_iget(fs_info->sb, ino, root); + vfs_inode = btrfs_iget_logging(ino, root); if (IS_ERR(vfs_inode)) { ret = PTR_ERR(vfs_inode); break; @@ -5582,7 +5673,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - inode = btrfs_iget(root->fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was deleted in * the current transaction then we either: @@ -5683,7 +5774,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; /* @@ -5714,7 +5804,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, list_del(&curr->list); kfree(curr); - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent @@ -5725,7 +5815,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (ret != -ENOENT) break; - inode = btrfs_iget(fs_info->sb, parent, root); + inode = btrfs_iget_logging(parent, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; @@ -5838,7 +5928,7 @@ again: if (ret < 0) { return ret; } else if (ret > 0 && - other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { + other_ino != btrfs_ino(ctx->inode)) { if (ins_nr > 0) { ins_nr++; } else { @@ -5847,7 +5937,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, - inode_only, logged_isize); + inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5866,7 +5956,7 @@ again: goto next_slot; ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5883,7 +5973,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 1; @@ -5898,7 +5988,7 @@ next_slot: if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, - logged_isize); + logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5923,7 +6013,7 @@ next_key: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret) return ret; } @@ -5934,7 +6024,7 @@ next_key: * lock the same leaf with btrfs_log_prealloc_extents() below. */ btrfs_release_path(path); - ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); } return ret; @@ -6110,7 +6200,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, const struct list_head *delayed_del_list, const struct btrfs_delayed_item *first, const struct btrfs_delayed_item **last_ret) @@ -6171,7 +6260,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, if (ret < 0) { return ret; } else if (ret == 0) { - ret = batch_delete_dir_index_items(trans, inode, path, ctx, + ret = batch_delete_dir_index_items(trans, inode, path, delayed_del_list, curr, &last); if (ret) @@ -6247,7 +6336,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { const bool orig_log_new_dentries = ctx->log_new_dentries; - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_item *item; int ret = 0; @@ -6273,7 +6361,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (key.type == BTRFS_ROOT_ITEM_KEY) continue; - di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + di_inode = btrfs_iget_logging(key.objectid, inode->root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); break; @@ -6526,7 +6614,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; xattrs_logged = true; @@ -6553,7 +6641,7 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; btrfs_release_path(path); @@ -6657,7 +6745,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -6722,8 +6809,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, - root); + dir_inode = btrfs_iget_logging(inode_key.objectid, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent @@ -6785,7 +6871,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); while (true) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; int slot; struct btrfs_key search_key; @@ -6800,7 +6885,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, search_key.objectid = found_key.offset; search_key.type = BTRFS_INODE_ITEM_KEY; search_key.offset = 0; - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -6989,6 +7074,15 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, } /* + * If we're logging an inode from a subvolume created in the current + * transaction we must force a commit since the root is not persisted. + */ + if (btrfs_root_generation(&root->root_item) == trans->transid) { + ret = BTRFS_LOG_FORCE_COMMIT; + goto end_no_trans; + } + + /* * Skip already logged inodes or inodes corresponding to tmpfiles * (since logging them is pointless, a link count of 0 means they * will never be accessible). @@ -7369,6 +7463,24 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, } /* + * Call this when creating a subvolume in a directory. + * Because we don't commit a transaction when creating a subvolume, we can't + * allow the directory pointing to the subvolume to be logged with an entry that + * points to an unpersisted root if we are still in the transaction used to + * create the subvolume, so make any attempt to log the directory to result in a + * full log sync. + * Also we don't need to worry with renames, since btrfs_rename() marks the log + * for full commit when renaming a subvolume. + */ +void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, + struct btrfs_inode *dir) +{ + mutex_lock(&dir->log_mutex); + dir->last_unlink_trans = trans->transid; + mutex_unlock(&dir->log_mutex); +} + +/* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. @@ -7500,8 +7612,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, goto out; } - btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + btrfs_init_log_ctx(&ctx, inode); ctx.logging_new_name = true; + btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7510,6 +7623,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a550a8a375cd..dc313e6bb2fa 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,10 +6,18 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include <linux/list.h> +#include <linux/fs.h> #include "messages.h" #include "ctree.h" #include "transaction.h" +struct inode; +struct dentry; +struct btrfs_ordered_extent; +struct btrfs_root; +struct btrfs_trans_handle; + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -29,44 +37,27 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - struct inode *inode; + struct btrfs_inode *inode; struct list_head list; /* Only used for fast fsyncs. */ struct list_head ordered_extents; struct list_head conflict_inodes; int num_conflict_inodes; bool logging_conflict_inodes; + /* + * Used for fsyncs that need to copy items from the subvolume tree to + * the log tree (full sync flag set or copy everything flag set) to + * avoid allocating a temporary extent buffer while holding a lock on + * an extent buffer of the subvolume tree and under the log transaction. + * Also helps to avoid allocating and freeing a temporary extent buffer + * in case we need to process multiple leaves from the subvolume tree. + */ + struct extent_buffer *scratch_eb; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, - struct inode *inode) -{ - ctx->log_ret = 0; - ctx->log_transid = 0; - ctx->log_new_dentries = false; - ctx->logging_new_name = false; - ctx->logging_new_delayed_dentries = false; - ctx->logged_before = false; - ctx->inode = inode; - INIT_LIST_HEAD(&ctx->list); - INIT_LIST_HEAD(&ctx->ordered_extents); - INIT_LIST_HEAD(&ctx->conflict_inodes); - ctx->num_conflict_inodes = 0; - ctx->logging_conflict_inodes = false; -} - -static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) -{ - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_extent *tmp; - - ASSERT(inode_is_locked(ctx->inode)); - - list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { - list_del_init(&ordered->log_list); - btrfs_put_ordered_extent(ordered); - } -} +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode); +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx); +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx); static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { @@ -103,6 +94,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, bool for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); +void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, + struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct dentry *old_dentry, struct btrfs_inode *old_dir, u64 old_dir_index, struct dentry *parent); diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3df6153d5d5a..1ac2678fc4ca 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -44,7 +44,7 @@ struct tree_mod_elem { /* * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } @@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,8 +187,8 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, } /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ -static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; @@ -199,7 +198,7 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, return true; } -static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, +static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { @@ -222,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; @@ -259,7 +258,7 @@ out_unlock: return ret; } -static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, +static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -279,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -367,9 +366,9 @@ free_tms: return ret; } -static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { int i, j; int ret; @@ -536,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, } int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) @@ -910,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, * is freed (its refcount is decremented). */ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq) { @@ -1005,7 +1003,7 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); check.level = level; - check.owner_root = root->root_key.objectid; + check.owner_root = btrfs_root_id(root); old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 94f10afeee97..1c12566040db 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -3,7 +3,13 @@ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -#include "ctree.h" +#include <linux/list.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_root; +struct btrfs_seq_list; /* Represents a tree mod log user. */ struct btrfs_seq_list { @@ -31,21 +37,20 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq); struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items); -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items); u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b4ac2b0cd235..fc59b57257d6 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -7,7 +7,6 @@ #include <linux/slab.h> #include "messages.h" #include "ulist.h" -#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 @@ -51,6 +50,7 @@ void ulist_init(struct ulist *ulist) INIT_LIST_HEAD(&ulist->nodes); ulist->root = RB_ROOT; ulist->nnodes = 0; + ulist->prealloc = NULL; } /* @@ -69,6 +69,8 @@ void ulist_release(struct ulist *ulist) list_for_each_entry_safe(node, next, &ulist->nodes, list) { kfree(node); } + kfree(ulist->prealloc); + ulist->prealloc = NULL; ulist->root = RB_ROOT; INIT_LIST_HEAD(&ulist->nodes); } @@ -106,6 +108,12 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } +void ulist_prealloc(struct ulist *ulist, gfp_t gfp_mask) +{ + if (!ulist->prealloc) + ulist->prealloc = kzalloc(sizeof(*ulist->prealloc), gfp_mask); +} + /* * Free dynamically allocated ulist. * @@ -207,9 +215,15 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, *old_aux = node->aux; return 0; } - node = kmalloc(sizeof(*node), gfp_mask); - if (!node) - return -ENOMEM; + + if (ulist->prealloc) { + node = ulist->prealloc; + ulist->prealloc = NULL; + } else { + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; + } node->val = val; node->aux = aux; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index b2cef187ea8e..c62a372f1462 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -7,6 +7,7 @@ #ifndef BTRFS_ULIST_H #define BTRFS_ULIST_H +#include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> @@ -40,12 +41,14 @@ struct ulist { struct list_head nodes; struct rb_root root; + struct ulist_node *prealloc; }; void ulist_init(struct ulist *ulist); void ulist_release(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); +void ulist_prealloc(struct ulist *ulist, gfp_t mask); void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 5be74f9e47eb..17b5e81123a1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -3,18 +3,19 @@ * Copyright (C) STRATO AG 2013. All rights reserved. */ +#include <linux/kthread.h> #include <linux/uuid.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" -static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) +static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key) { key->type = type; key->objectid = get_unaligned_le64(uuid); @@ -22,7 +23,7 @@ static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) } /* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ -static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, +static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, u8 type, u64 subid) { int ret; @@ -82,7 +83,7 @@ out: return ret; } -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid_cpu) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -114,7 +115,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); - if (ret >= 0) { + if (ret == 0) { /* Add an item for the type for the first time */ eb = path->nodes[0]; slot = path->slots[0]; @@ -139,14 +140,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; } -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -257,7 +256,7 @@ out: * < 0 if an error occurred */ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, - u8 *uuid, u8 type, u64 subvolid) + const u8 *uuid, u8 type, u64 subvolid) { int ret = 0; struct btrfs_root *subvol_root; @@ -391,3 +390,180 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + bool closing = false; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + btrfs_release_path(path); + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + btrfs_release_path(path); + if (trans) { + ret = btrfs_end_transaction(trans); + trans = NULL; + if (ret) + break; + } + + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else if (!closing) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index 5350c87fe2ca..c60ad20325cc 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -3,10 +3,17 @@ #ifndef BTRFS_UUID_TREE_H #define BTRFS_UUID_TREE_H -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); #endif diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 66e2270b0dae..e97ad824ae16 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -14,7 +14,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" -#include "disk-io.h" #include "locking.h" #include "fs.h" #include "accessors.h" @@ -285,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * page and ignore dest, but it must still be non-NULL to avoid the * counting-only behavior. * @len: length in bytes to read - * @dest_page: copy into this page instead of the dest buffer + * @dest_folio: copy into this folio instead of the dest buffer * * Helper function to read items from the btree. This returns the number of * bytes read or < 0 for errors. We can return short reads if the items don't @@ -295,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * Returns number of bytes read or a negative error code on failure. */ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, - char *dest, u64 len, struct page *dest_page) + char *dest, u64 len, struct folio *dest_folio) { struct btrfs_path *path; struct btrfs_root *root = inode->root; @@ -315,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (!path) return -ENOMEM; - if (dest_page) + if (dest_folio) path->reada = READA_FORWARD; key.objectid = btrfs_ino(inode); @@ -372,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, copy_offset = offset - key.offset; if (dest) { - if (dest_page) - kaddr = kmap_local_page(dest_page); + if (dest_folio) + kaddr = kmap_local_folio(dest_folio, 0); data = btrfs_item_ptr(leaf, path->slots[0], void); read_extent_buffer(leaf, kaddr + dest_offset, (unsigned long)data + copy_offset, copy_bytes); - if (dest_page) + if (dest_folio) kunmap_local(kaddr); } @@ -461,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; int ret; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); ret = btrfs_drop_verity_items(inode); @@ -586,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp) struct btrfs_trans_handle *trans; int ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; @@ -634,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc, int ret = 0; int rollback_ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (desc == NULL) goto rollback; @@ -763,7 +762,7 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - folio_address(folio), PAGE_SIZE, &folio->page); + folio_address(folio), PAGE_SIZE, folio); if (ret < 0) { folio_put(folio); return ERR_PTR(ret); diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h index 91c10f7d0a46..d696659e43e4 100644 --- a/fs/btrfs/verity.h +++ b/fs/btrfs/verity.h @@ -3,8 +3,13 @@ #ifndef BTRFS_VERITY_H #define BTRFS_VERITY_H +struct inode; +struct btrfs_inode; + #ifdef CONFIG_FS_VERITY +#include <linux/fsverity.h> + extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); @@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) #else +#include <linux/errno.h> + static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) { return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d67785be2c77..3f8afbd1ebb5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,11 +13,9 @@ #include <linux/list_sort.h> #include <linux/namei.h> #include "misc.h" -#include "ctree.h" -#include "extent_map.h" #include "disk-io.h" +#include "extent-tree.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" @@ -50,6 +48,7 @@ struct btrfs_io_geometry { u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; + bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -468,39 +467,44 @@ static noinline struct btrfs_fs_devices *find_fsid( static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct bdev_handle **bdev_handle, + int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; - *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev_handle)) { - ret = PTR_ERR(*bdev_handle); + if (IS_ERR(*bdev_file)) { + ret = PTR_ERR(*bdev_file); + btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", + device_path, flags, ret); goto error; } - bdev = (*bdev_handle)->bdev; + bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); - ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); - if (ret) { - bdev_release(*bdev_handle); - goto error; + if (holder) { + ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); + if (ret) { + fput(*bdev_file); + goto error; + } } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } return 0; error: - *bdev_handle = NULL; + *disk_super = NULL; + *bdev_file = NULL; return ret; } @@ -643,7 +647,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -654,7 +658,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) return ret; @@ -678,22 +682,32 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev_handle->bdev)) + if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev_handle->bdev)) + if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev_handle->bdev)) + if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (device->devt != device->bdev->bd_dev) { + btrfs_warn(NULL, + "device %s maj:min changed from %d:%d to %d:%d", + device->name->str, MAJOR(device->devt), + MINOR(device->devt), MAJOR(device->bdev->bd_dev), + MINOR(device->bdev->bd_dev)); + + device->devt = device->bdev->bd_dev; + } + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -706,12 +720,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return -EINVAL; } -u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) +const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) { bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); @@ -720,6 +734,118 @@ u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) } /* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/<fd>", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". + */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". + */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) { + ret = PTR_ERR(resolved_path); + goto out; + } + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + +static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + +/* * Add new device to list of registered devices * * Returns: @@ -769,8 +895,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; - pr_info("BTRFS: device %s using temp-fsid %pU\n", - path, fs_devices->fsid); + pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid); } mutex_lock(&fs_devices->device_list_mutex); @@ -799,8 +926,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, -"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", - path, fs_devices->fsid, current->comm, +"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid, current->comm, task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); @@ -826,16 +954,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (disk_super->label[0]) pr_info( - "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->label, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); else pr_info( - "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->fsid, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); - } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that @@ -1015,10 +1145,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev_handle) { - bdev_release(device->bdev_handle); + if (device->bdev_file) { + fput(device->bdev_file); device->bdev = NULL; - device->bdev_handle = NULL; + device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1063,7 +1193,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_release(device->bdev_handle); + fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1088,6 +1218,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; + device->bdev_file = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); btrfs_destroy_dev_zone_info(device); @@ -1172,29 +1303,53 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + s64 __maybe_unused value = 0; + int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int ret; + int ret2; - ret = btrfs_open_one_device(fs_devices, device, flags, holder); - if (ret == 0 && + ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret2 == 0 && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; - } else if (ret == -ENODATA) { + } else if (ret2 == -ENODATA) { fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); } + if (ret == 0 && ret2 != 0) + ret = ret2; } - if (fs_devices->open_devices == 0) + + if (fs_devices->open_devices == 0) { + if (ret) + return ret; return -EINVAL; + } fs_devices->opened = 1; fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + fs_devices->read_devid = latest_dev->devid; + fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), + &value); + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = true; + + if (value) { + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->rr_min_contig_read = value; + if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) + fs_devices->read_devid = value; + } +#else fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#endif return 0; } @@ -1268,7 +1423,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev return ERR_PTR(-EINVAL); /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); + page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); if (IS_ERR(page)) return ERR_CAST(page); @@ -1301,6 +1456,47 @@ int btrfs_forget_devices(dev_t devt) return ret; } +static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, + const char *path, dev_t devt, + bool mount_arg_dev) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Do not skip device registration for mounted devices with matching + * maj:min but different paths. Booting without initrd relies on + * /dev/root initially, later replaced with the actual root device. + * A successful scan ensures grub2-probe selects the correct device. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + mutex_lock(&fs_devices->device_list_mutex); + + if (!fs_devices->opened) { + mutex_unlock(&fs_devices->device_list_mutex); + continue; + } + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->bdev && (device->bdev->bd_dev == devt) && + strcmp(device->name->str, path) != 0) { + mutex_unlock(&fs_devices->device_list_mutex); + + /* Do not skip registration. */ + return false; + } + } + mutex_unlock(&fs_devices->device_list_mutex); + } + + if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && + !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) + return true; + + return false; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@ -1316,19 +1512,24 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct bdev_handle *bdev_handle; - u64 bytenr, bytenr_orig; + struct file *bdev_file; + char *canonical_path = NULL; + u64 bytenr; + dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); - /* - * we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. - * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - + if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1339,41 +1540,42 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); - bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); + /* + * We would like to check all the super blocks, but doing so would + * allow a mount to succeed after a mkfs from a different filesystem. + * Currently, recovery from a bad primary btrfs superblock is done + * using the userspace command 'btrfs check --super'. + */ + ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, - bytenr_orig); + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, + btrfs_sb_offset(0)); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } - if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && - !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) { - dev_t devt; + devt = file_bdev(bdev_file)->bd_dev; + if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { + pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + path, MAJOR(devt), MINOR(devt)); - ret = lookup_bdev(path, &devt); - if (ret) - btrfs_warn(NULL, "lookup bdev failed for path %s: %d", - path, ret); - else - btrfs_free_stale_devices(devt, NULL); + btrfs_free_stale_devices(devt, NULL); - pr_debug("BTRFS: skip registering single non-seed device %s\n", path); device = NULL; goto free_disk_super; } - device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? : path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1381,7 +1583,8 @@ free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - bdev_release(bdev_handle); + fput(bdev_file); + kfree(canonical_path); return device; } @@ -1403,7 +1606,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, if (in_range(physical_start, *start, len) || in_range(*start, physical_start, - physical_end - physical_start)) { + physical_end + 1 - physical_start)) { *start = physical_end + 1; return true; } @@ -1864,7 +2067,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -2032,11 +2234,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, copy_num, ret); } -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { int copy_num; + struct block_device *bdev = device->bdev; if (!bdev) return; @@ -2052,12 +2253,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + update_dev_time(device->name->str); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle) + struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2166,7 +2367,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev_handle) { + if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2182,20 +2383,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and bdev_release() will pull in the ->open_mutex on - * the block device and it's dependencies. Instead just flush the - * device and let the caller do the final bdev_release. + * write lock, and fput() on the block device will pull in the + * ->open_mutex on the block device and it's dependencies. Instead + * just flush the device and let the caller do the final bdev_release. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { - btrfs_scratch_superblocks(fs_info, device->bdev, - device->name->str); + btrfs_scratch_superblocks(fs_info, device); if (device->bdev) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } } - *bdev_handle = device->bdev_handle; + *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); @@ -2301,8 +2501,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, - tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -2332,7 +2531,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int ret; if (!path || !path[0]) @@ -2350,7 +2549,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2363,7 +2562,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return 0; } @@ -2563,11 +2762,9 @@ next_slot: device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ - if (device->fs_devices->seeding) { + if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(trans, leaf); - } path->slots[0]++; goto next_slot; @@ -2583,7 +2780,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2596,12 +2793,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); - if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { + if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } @@ -2613,11 +2810,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev_handle->bdev); + sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev_handle->bdev) { + if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2633,8 +2830,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2661,11 +2858,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - btrfs_clear_sb_rdonly(sb); - /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { @@ -2808,8 +3003,6 @@ error_sysfs: mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: - if (seeding_dev) - btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone: @@ -2817,7 +3010,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - bdev_release(bdev_handle); + fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -2864,8 +3057,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(trans, leaf); - out: btrfs_free_path(path); return ret; @@ -2935,16 +3126,19 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_handle_fs_error(fs_info, -ENOENT, - "Failed lookup while freeing chunk."); - ret = -ENOENT; + btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", + chunk_offset); + btrfs_abort_transaction(trans, -ENOENT); + ret = -EUCLEAN; goto out; } ret = btrfs_del_item(trans, root, path); - if (ret < 0) - btrfs_handle_fs_error(fs_info, ret, - "Failed to delete chunk item."); + if (ret < 0) { + btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); + btrfs_abort_transaction(trans, ret); + goto out; + } out: btrfs_free_path(path); return ret; @@ -3393,7 +3587,18 @@ again: mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * On the first search we would find chunk tree with + * offset -1, which is not possible. On subsequent + * loops this would find an existing item on an invalid + * offset (one less than the previous one, wrong + * alignment and size). + */ + ret = -EUCLEAN; + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto error; + } ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); @@ -3480,6 +3685,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3522,10 +3765,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -3624,7 +3864,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; - BUG_ON(!fs_info->balance_ctl); + ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL; @@ -4678,183 +4918,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_uuid_scan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = data; - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_key key; - struct btrfs_path *path = NULL; - int ret = 0; - struct extent_buffer *eb; - int slot; - struct btrfs_root_item root_item; - u32 item_size; - struct btrfs_trans_handle *trans = NULL; - bool closing = false; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - key.objectid = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - - while (1) { - if (btrfs_fs_closing(fs_info)) { - closing = true; - break; - } - ret = btrfs_search_forward(root, &key, path, - BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - break; - } - - if (key.type != BTRFS_ROOT_ITEM_KEY || - (key.objectid < BTRFS_FIRST_FREE_OBJECTID && - key.objectid != BTRFS_FS_TREE_OBJECTID) || - key.objectid > BTRFS_LAST_FREE_OBJECTID) - goto skip; - - eb = path->nodes[0]; - slot = path->slots[0]; - item_size = btrfs_item_size(eb, slot); - if (item_size < sizeof(root_item)) - goto skip; - - read_extent_buffer(eb, &root_item, - btrfs_item_ptr_offset(eb, slot), - (int)sizeof(root_item)); - if (btrfs_root_refs(&root_item) == 0) - goto skip; - - if (!btrfs_is_empty_uuid(root_item.uuid) || - !btrfs_is_empty_uuid(root_item.received_uuid)) { - if (trans) - goto update_tree; - - btrfs_release_path(path); - /* - * 1 - subvol uuid item - * 1 - received_subvol uuid item - */ - trans = btrfs_start_transaction(fs_info->uuid_root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - continue; - } else { - goto skip; - } -update_tree: - btrfs_release_path(path); - if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - - if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, - root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - -skip: - btrfs_release_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - trans = NULL; - if (ret) - break; - } - - if (key.offset < (u64)-1) { - key.offset++; - } else if (key.type < BTRFS_ROOT_ITEM_KEY) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - } else if (key.objectid < (u64)-1) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.objectid++; - } else { - break; - } - cond_resched(); - } - -out: - btrfs_free_path(path); - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else if (!closing) - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); - up(&fs_info->uuid_tree_rescan_sem); - return 0; -} - -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *uuid_root; - struct task_struct *task; - int ret; - - /* - * 1 - root node - * 1 - root item - */ - trans = btrfs_start_transaction(tree_root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - fs_info->uuid_root = uuid_root; - - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_scan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -5378,7 +5441,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, @@ -5464,33 +5527,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma btrfs_free_chunk_map(map); } +static int btrfs_chunk_map_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_chunk_map *new_map = + rb_entry(new, struct btrfs_chunk_map, rb_node); + const struct btrfs_chunk_map *exist_map = + rb_entry(exist, struct btrfs_chunk_map, rb_node); + + if (new_map->start == exist_map->start) + return 0; + if (new_map->start < exist_map->start) + return -1; + return 1; +} + EXPORT_FOR_TESTS int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { - struct rb_node **p; - struct rb_node *parent = NULL; - bool leftmost = true; + struct rb_node *exist; write_lock(&fs_info->mapping_tree_lock); - p = &fs_info->mapping_tree.rb_root.rb_node; - while (*p) { - struct btrfs_chunk_map *entry; - - parent = *p; - entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); - - if (map->start < entry->start) { - p = &(*p)->rb_left; - } else if (map->start > entry->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&fs_info->mapping_tree_lock); - return -EEXIST; - } + exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, + btrfs_chunk_map_cmp); + + if (exist) { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; } - rb_link_node(&map->rb_node, parent, p); - rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); chunk_map_device_set_bits(map, CHUNK_ALLOCATED); chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); @@ -5513,21 +5577,6 @@ struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) return map; } -struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp) -{ - const int size = btrfs_chunk_map_size(map->num_stripes); - struct btrfs_chunk_map *clone; - - clone = kmemdup(map, size, gfp); - if (!clone) - return NULL; - - refcount_set(&clone->refs, 1); - RB_CLEAR_NODE(&clone->rb_node); - - return clone; -} - static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) @@ -5538,8 +5587,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, u64 start = ctl->start; u64 type = ctl->type; int ret; - int i; - int j; map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); if (!map) @@ -5554,8 +5601,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, map->sub_stripes = ctl->sub_stripes; map->num_stripes = ctl->num_stripes; - for (i = 0; i < ctl->ndevs; ++i) { - for (j = 0; j < ctl->dev_stripes; ++j) { + for (int i = 0; i < ctl->ndevs; i++) { + for (int j = 0; j < ctl->dev_stripes; j++) { int s = i * ctl->dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + @@ -5867,11 +5914,31 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) write_unlock(&fs_info->mapping_tree_lock); } +static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); + + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + + /* + * There could be two corrupted data stripes, we need to loop retry in + * order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the stripe under + * reconstruction. + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return map->num_stripes; + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + return btrfs_raid_array[index].ncopies; +} + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct btrfs_chunk_map *map; - enum btrfs_raid_types index; - int ret = 1; + int ret; map = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(map)) @@ -5883,22 +5950,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - index = btrfs_bg_flags_to_raid_index(map->type); - - /* Non-RAID56, use their ncopies from btrfs_raid_array. */ - if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) - ret = btrfs_raid_array[index].ncopies; - else if (map->type & BTRFS_BLOCK_GROUP_RAID5) - ret = 2; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - /* - * There could be two corrupted data stripes, we need - * to loop retry in order to rebuild the correct data. - * - * Fail a stripe at a time on every retry except the - * stripe under reconstruction. - */ - ret = map->num_stripes; + ret = btrfs_chunk_map_num_copies(map); btrfs_free_chunk_map(map); return ret; } @@ -5922,28 +5974,81 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) { - struct btrfs_chunk_map *map; - int ret = 0; + for (int index = first; index < first + num_stripes; index++) { + const struct btrfs_device *device = map->stripes[index].dev; - if (!btrfs_fs_incompat(fs_info, RAID56)) - return 0; + if (device->devid == READ_ONCE(device->fs_devices->read_devid)) + return index; + } - map = btrfs_get_chunk_map(fs_info, logical, len); + /* If no read-preferred device is set use the first stripe. */ + return first; +} - if (!WARN_ON(IS_ERR(map))) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - btrfs_free_chunk_map(map); +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. + */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> + fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; } - return ret; + sort(stripes, num_stripes, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; } +#endif static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { + const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; @@ -5958,17 +6063,24 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - switch (fs_info->fs_devices->read_policy) { + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ - btrfs_warn_rl(fs_info, - "unknown read_policy type %u, reset to pid", - fs_info->fs_devices->read_policy); - fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", + policy); + WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && @@ -6000,9 +6112,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - u64 logical, - u16 total_stripes) +EXPORT_FOR_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -6198,20 +6310,19 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) return ret; } -static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_io_context *bioc, +static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_dev_replace *dev_replace, u64 logical, - int *num_stripes_ret, int *max_errors_ret) + struct btrfs_io_geometry *io_geom) { u64 srcdev_devid = dev_replace->srcdev->devid; /* * At this stage, num_stripes is still the real number of stripes, * excluding the duplicated stripes. */ - int num_stripes = *num_stripes_ret; + int num_stripes = io_geom->num_stripes; + int max_errors = io_geom->max_errors; int nr_extra_stripes = 0; - int max_errors = *max_errors_ret; int i; /* @@ -6252,7 +6363,7 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, * replace. * If we have 2 extra stripes, only choose the one with smaller physical. */ - if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { + if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; @@ -6270,8 +6381,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, } } - *num_stripes_ret = num_stripes + nr_extra_stripes; - *max_errors_ret = max_errors + nr_extra_stripes; + io_geom->num_stripes = num_stripes + nr_extra_stripes; + io_geom->max_errors = max_errors + nr_extra_stripes; bioc->replace_nr_stripes = nr_extra_stripes; } @@ -6328,8 +6439,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, { dst->dev = map->stripes[io_geom->stripe_index].dev; - if (io_geom->op == BTRFS_MAP_READ && - btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst) return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type, io_geom->stripe_index, dst); @@ -6344,7 +6454,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, const struct btrfs_io_stripe *smap, const struct btrfs_chunk_map *map, int num_alloc_stripes, - enum btrfs_map_op op, int mirror_num) + struct btrfs_io_geometry *io_geom) { if (!smap) return false; @@ -6352,10 +6462,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, if (num_alloc_stripes != 1) return false; - if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ) return false; - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1) return false; return true; @@ -6534,7 +6644,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_chunk_map *map; struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; - int i; int ret = 0; int num_copies; struct btrfs_io_context *bioc = NULL; @@ -6550,26 +6659,29 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.stripe_index = 0; io_geom.op = op; - num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (io_geom.mirror_num > num_copies) - return -EINVAL; - map = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(map)) return PTR_ERR(map); + num_copies = btrfs_chunk_map_num_copies(map); + if (io_geom.mirror_num > num_copies) + return -EINVAL; + map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); + io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); + + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); - down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6628,8 +6740,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, - io_geom.mirror_num)) { + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) { ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; @@ -6643,6 +6754,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, goto out; } bioc->map_type = map->type; + bioc->use_rst = io_geom.use_rst; /* * For RAID56 full map, we need to make sure the stripes[] follows the @@ -6680,7 +6792,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ - for (i = 0; i < io_geom.num_stripes; i++) { + for (int i = 0; i < io_geom.num_stripes; i++) { ret = set_io_stripe(fs_info, logical, length, &bioc->stripes[i], map, &io_geom); if (ret < 0) @@ -6700,8 +6812,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { - handle_ops_on_dev_replace(op, bioc, dev_replace, logical, - &io_geom.num_stripes, &io_geom.max_errors); + handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom); } *bioc_ret = bioc; @@ -6710,7 +6821,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); @@ -6984,16 +7095,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, warn_32bit_meta_chunk(fs_info, logical, length, type); #endif - /* - * Only need to verify chunk item if we're reading from sys chunk array, - * as chunk item in tree block is already verified by tree-checker. - */ - if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { - ret = btrfs_check_chunk_valid(leaf, chunk, logical); - if (ret) - return ret; - } - map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ @@ -7054,6 +7155,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } return ret; @@ -7099,8 +7201,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) @@ -7251,16 +7357,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; - struct btrfs_disk_key *disk_key; - struct btrfs_chunk *chunk; u8 *array_ptr; unsigned long sb_array_offset; int ret = 0; - u32 num_stripes; u32 array_size; - u32 len = 0; u32 cur_offset; - u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); @@ -7283,10 +7384,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) cur_offset = 0; while (cur_offset < array_size) { - disk_key = (struct btrfs_disk_key *)array_ptr; - len = sizeof(*disk_key); - if (cur_offset + len > array_size) - goto out_short_read; + struct btrfs_chunk *chunk; + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; + u32 len = sizeof(*disk_key); + + /* + * The sys_chunk_array has been already verified at super block + * read time. Only do ASSERT()s for basic checks. + */ + ASSERT(cur_offset + len <= array_size); btrfs_disk_key_to_cpu(&key, disk_key); @@ -7294,44 +7400,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) sb_array_offset += len; cur_offset += len; - if (key.type != BTRFS_CHUNK_ITEM_KEY) { - btrfs_err(fs_info, - "unexpected item type %u in sys_array at offset %u", - (u32)key.type, cur_offset); - ret = -EIO; - break; - } + ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY); chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be present, - * exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } + ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } + len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; + ASSERT(cur_offset + len <= array_size); ret = read_one_chunk(&key, sb, chunk); if (ret) @@ -7344,13 +7420,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; - -out_short_read: - btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", - len, cur_offset); - clear_extent_buffer_uptodate(sb); - free_extent_buffer_stale(sb); - return -EIO; } /* @@ -7550,8 +7619,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_device *device; int ret = 0; - fs_devices->fs_info = fs_info; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; @@ -7727,8 +7794,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53f87f398da7..120f65e21eeb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,15 +6,36 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H +#include <linux/blk_types.h> +#include <linux/sizes.h> +#include <linux/atomic.h> #include <linux/sort.h> -#include <linux/btrfs.h> -#include "async-thread.h" +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/log2.h> +#include <linux/kobject.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> #include "messages.h" -#include "tree-checker.h" #include "rcu-string.h" +struct block_device; +struct bdev_handle; +struct btrfs_fs_info; +struct btrfs_block_group; +struct btrfs_trans_handle; +struct btrfs_zoned_device_info; + #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) +/* + * Arbitratry maximum size of one discard request to limit potentially long time + * spent in blkdev_issue_discard(). + */ +#define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G) + extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K @@ -77,7 +98,10 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) -struct btrfs_zoned_device_info; +/* Special value encoding failure to write primary super block. */ +#define BTRFS_SUPER_PRIMARY_WRITE_ERROR (INT_MAX / 2) + +struct btrfs_fs_devices; struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -90,7 +114,7 @@ struct btrfs_device { u64 generation; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; @@ -127,6 +151,12 @@ struct btrfs_device { /* type and info about this device */ u64 type; + /* + * Counter of super block write errors, values larger than + * BTRFS_SUPER_PRIMARY_WRITE_ERROR encode primary super block write failure. + */ + atomic_t sb_write_errors; + /* minimal io size for this device */ u32 sector_size; @@ -266,6 +296,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -273,9 +306,34 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, + /* Read from a specific device. */ + BTRFS_READ_POLICY_DEVID, +#endif BTRFS_NR_READ_POLICY, }; +#ifdef CONFIG_BTRFS_EXPERIMENTAL +/* + * Checksum mode - offload it to workqueues or do it synchronously in + * btrfs_submit_chunk(). + */ +enum btrfs_offload_csum_mode { + /* + * Choose offloading checksum or do it synchronously automatically. + * Do it synchronously if the checksum is fast, or offload to workqueues + * otherwise. + */ + BTRFS_OFFLOAD_CSUM_AUTO, + /* Always offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_ON, + /* Never offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_OFF, +}; +#endif + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -368,6 +426,8 @@ struct btrfs_fs_devices { bool seeding; /* The mount needs to use a randomly generated fsid. */ bool temp_fsid; + /* Enable/disable the filesystem stats tracking. */ + bool collect_fs_stats; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -380,6 +440,20 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + + /* Device to be used for reading in case of RAID1. */ + u64 read_devid; + + /* Checksum mode - offload it or do it synchronously. */ + enum btrfs_offload_csum_mode offload_csum_mode; +#endif }; #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ @@ -396,7 +470,7 @@ struct btrfs_io_stripe { /* Block mapping. */ u64 physical; u64 length; - bool is_scrub; + bool rst_search_commit_root; /* For the endio handler. */ struct btrfs_io_context *bioc; }; @@ -431,6 +505,7 @@ struct btrfs_io_context { struct bio *orig_bio; atomic_t error; u16 max_errors; + bool use_rst; u64 logical; u64 size; @@ -557,8 +632,6 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) } } -struct btrfs_balance_args; -struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_balance_args data; struct btrfs_balance_args meta; @@ -661,7 +734,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle); + struct file **bdev_file); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -679,8 +752,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, @@ -691,8 +762,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); @@ -706,7 +775,6 @@ struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp); int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); #endif -struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp); struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, @@ -780,9 +848,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device); enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); int btrfs_bg_type_to_factor(u64 flags); @@ -791,6 +857,11 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); -u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); +const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes); +#endif #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 6287763fdccc..3e0edbcf73e1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -24,7 +24,7 @@ #include "accessors.h" #include "dir-item.h" -int btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(const struct inode *inode, const char *name, void *buffer, size_t size) { struct btrfs_dir_item *di; @@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; @@ -120,7 +119,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(inode_is_locked(inode)); + btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) @@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; @@ -205,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is @@ -451,7 +449,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_set_prop(trans, inode, name, value, size, flags); + ret = btrfs_set_prop(trans, BTRFS_I(inode), name, value, size, flags); if (!ret) { inode_inc_iversion(inode); inode_set_ctime_current(inode); @@ -504,7 +502,7 @@ static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr; unsigned int nofs_flag; char *name; - int err = 0; + int ret = 0; /* * We're holding a transaction handle, so use a NOFS memory allocation @@ -515,7 +513,7 @@ static int btrfs_initxattrs(struct inode *inode, name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { - err = -ENOMEM; + ret = -ENOMEM; break; } strcpy(name, XATTR_SECURITY_PREFIX); @@ -524,14 +522,14 @@ static int btrfs_initxattrs(struct inode *inode, if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); - err = btrfs_setxattr(trans, inode, name, xattr->value, + ret = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); - if (err < 0) + if (ret < 0) break; } memalloc_nofs_restore(nofs_flag); - return err; + return ret; } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 118118ca3e1d..8dc4cf49f6f0 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,11 +6,15 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H -#include <linux/xattr.h> +struct dentry; +struct inode; +struct qstr; +struct xattr_handler; +struct btrfs_trans_handle; extern const struct xattr_handler * const btrfs_xattr_handlers[]; -int btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(const struct inode *inode, const char *name, void *buffer, size_t size); int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8da66ea699e8..c9e92c6941ec 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -18,7 +18,10 @@ #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/refcount.h> +#include "btrfs_inode.h" #include "compression.h" +#include "fs.h" +#include "subpage.h" /* workspace buffer size for s390 zlib hardware support */ #define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) @@ -91,29 +94,35 @@ fail: return ERR_PTR(-ENOMEM); } -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; char *data_in = NULL; - char *cpage_out; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; + char *cfolio_out; + int nr_folios = 0; + struct folio *in_folio = NULL; + struct folio *out_folio = NULL; unsigned long bytes_left; - unsigned int in_buf_pages; + unsigned int in_buf_folios; unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; + unsigned long nr_dest_folios = *out_folios; + const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u64 orig_end = start + len; - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { - pr_warn("BTRFS: deflateInit failed\n"); + ret = zlib_deflateInit(&workspace->strm, workspace->level); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_err(inode->root->fs_info, + "zlib compression init failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } @@ -121,18 +130,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[0] = out_page; - nr_pages = 1; + cfolio_out = folio_address(out_folio); + folios[0] = out_folio; + nr_folios = 1; workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; workspace->strm.avail_out = PAGE_SIZE; while (workspace->strm.total_in < len) { @@ -142,43 +151,63 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, */ if (workspace->strm.avail_in == 0) { bytes_left = len - workspace->strm.total_in; - in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_pages > 1) { + in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_folios > 1) { int i; - for (i = 0; i < in_buf_pages; i++) { + /* S390 hardware acceleration path, not subpage. */ + ASSERT(!btrfs_is_subpage( + inode_to_fs_info(mapping->host), + mapping)); + for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + data_in = kmap_local_folio(in_folio, 0); copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { + unsigned int pg_off; + unsigned int cur_len; + if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); - start += PAGE_SIZE; + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + data_in = kmap_local_folio(in_folio, pg_off); + start += cur_len; workspace->strm.next_in = data_in; + workspace->strm.avail_in = cur_len; } - workspace->strm.avail_in = min(bytes_left, - (unsigned long) workspace->buf_size); } ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); - if (ret != Z_OK) { - pr_debug("BTRFS: deflate in loop returned %d\n", - ret); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_warn(inode->root->fs_info, + "zlib compression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + start); zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; @@ -196,20 +225,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } /* we're all done */ if (workspace->strm.total_in >= len) @@ -231,21 +260,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -EIO; goto out; } else if (workspace->strm.avail_out == 0) { - /* get another page for the stream end */ - if (nr_pages == nr_dest_pages) { + /* Get another folio for the stream end. */ + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } } zlib_deflateEnd(&workspace->strm); @@ -259,10 +288,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = workspace->strm.total_out; *total_in = workspace->strm.total_in; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); } return ret; @@ -275,13 +304,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - unsigned long page_in_index = 0; + unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -301,9 +330,14 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { - pr_warn("BTRFS: inflateInit failed\n"); + ret = zlib_inflateInit2(&workspace->strm, wbits); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = cb->bbio.inode; + kunmap_local(data_in); + btrfs_err(inode->root->fs_info, + "zlib decompression init failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -331,21 +365,26 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap_local(data_in); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { data_in = NULL; break; } - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); } } - if (ret != Z_STREAM_END) + if (unlikely(ret != Z_STREAM_END)) { + btrfs_err(cb->bbio.inode->root->fs_info, + "zlib decompression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(cb->bbio.inode->root), + btrfs_ino(cb->bbio.inode), cb->start); ret = -EIO; - else + } else { ret = 0; + } done: zlib_inflateEnd(&workspace->strm); if (data_in) @@ -354,7 +393,7 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -380,8 +419,14 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { - pr_warn("BTRFS: inflateInit failed\n"); + ret = zlib_inflateInit2(&workspace->strm, wbits); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(inode->root->fs_info, + "zlib decompression init failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(dest_folio)); return -EIO; } @@ -394,12 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, if (ret != Z_STREAM_END) goto out; - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy); out: if (unlikely(to_copy != destlen)) { - pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", - to_copy, destlen); + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(inode->root->fs_info, +"zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(dest_folio), to_copy, destlen); ret = -EIO; } else { ret = 0; @@ -408,7 +457,7 @@ out: zlib_inflateEnd(&workspace->strm); if (unlikely(to_copy < destlen)) - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5f750fa53a2b..73e0aa9fc08a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,10 +12,8 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" -#include "transaction.h" #include "dev-replace.h" #include "space-info.h" -#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -89,9 +87,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, bool empty[BTRFS_NR_SB_LOG_ZONES]; bool full[BTRFS_NR_SB_LOG_ZONES]; sector_t sector; - int i; - for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); full[i] = sb_zone_is_full(&zones[i]); @@ -120,12 +117,11 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, return -ENOENT; } else if (full[0] && full[1]) { /* Compare two super blocks */ - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; struct page *page[BTRFS_NR_SB_LOG_ZONES]; struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; - int i; - for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT; u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) - BTRFS_SUPER_INFO_SIZE; @@ -146,7 +142,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, else sector = zones[0].start; - for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) + for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) btrfs_release_disk_super(super[i]); } else if (!full[0] && (empty[1] || full[1])) { sector = zones[0].wp; @@ -291,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; @@ -308,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? Not good */ - if (ret > 0) { - ret = -EUCLEAN; - goto out; - } + if (ret > 0) + return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); - ret = 0; - -out: - btrfs_free_path(path); - - return ret; + return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -654,8 +643,7 @@ out: return NULL; } -int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, - struct blk_zone *zone) +static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { unsigned int nr_zones = 1; int ret; @@ -719,11 +707,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * zoned mode. In this case, we don't have a valid max zone * append size. */ - if (bdev_is_zoned(device->bdev)) { - blk_stack_limits(lim, - &bdev_get_queue(device->bdev)->limits, - 0); - } + if (bdev_is_zoned(device->bdev)) + blk_stack_limits(lim, bdev_limits(device->bdev), 0); + } + + ret = blk_validate_limits(lim); + if (ret) { + btrfs_err(fs_info, "zoned: failed to validate queue limits"); + return ret; } /* @@ -757,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned @@ -772,7 +764,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt) +int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, + unsigned long long *mount_opt) { if (!btrfs_is_zoned(info)) return 0; @@ -824,11 +817,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + unsigned int nofs_flags; + ASSERT(sb_zone_is_full(reset)); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_NOFS); + reset->start, reset->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -974,11 +970,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) * explicit ZONE_FINISH is not necessary. */ if (zone->wp != zone->start + zone->capacity) { + unsigned int nofs_flags; int ret; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_NOFS); + zone->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; } @@ -996,11 +995,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { + unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; + int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); @@ -1011,9 +1012,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) if (sb_zone + 1 >= nr_zones) return -ENOENT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES); + memalloc_nofs_restore(nofs_flags); + return ret; } /* @@ -1124,12 +1128,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { + unsigned int nofs_flags; int ret; *bytes = 0; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_NOFS); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -1202,7 +1208,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; int ret; @@ -1237,7 +1243,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!ret) ret = -EUCLEAN; if (ret < 0) - goto out; + return ret; ret = btrfs_previous_extent_item(root, path, cache->start); if (ret) { @@ -1245,7 +1251,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, ret = 0; *offset_ret = 0; } - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -1257,15 +1263,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!(found_key.objectid >= cache->start && found_key.objectid + length <= cache->start + cache->length)) { - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; - ret = 0; - -out: - btrfs_free_path(path); - return ret; + return 0; } struct zone_info { @@ -1279,7 +1280,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct btrfs_chunk_map *map) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - struct btrfs_device *device = map->stripes[zone_idx].dev; + struct btrfs_device *device; int dev_replace_is_ongoing = 0; unsigned int nofs_flag; struct blk_zone zone; @@ -1287,7 +1288,11 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, info->physical = map->stripes[zone_idx].physical; + down_read(&dev_replace->rwsem); + device = map->stripes[zone_idx].dev; + if (!device->bdev) { + up_read(&dev_replace->rwsem); info->alloc_offset = WP_MISSING_DEV; return 0; } @@ -1297,6 +1302,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, __set_bit(zone_idx, active); if (!btrfs_dev_is_sequential(device, info->physical)) { + up_read(&dev_replace->rwsem); info->alloc_offset = WP_CONVENTIONAL; return 0; } @@ -1304,11 +1310,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); - down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical); - up_read(&dev_replace->rwsem); /* * The group is mapped to a sequential zone. Get the zone write pointer @@ -1319,6 +1323,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); if (ret) { + up_read(&dev_replace->rwsem); if (ret != -EIO && ret != -EOPNOTSUPP) return ret; info->alloc_offset = WP_MISSING_DEV; @@ -1330,6 +1335,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), device->devid); + up_read(&dev_replace->rwsem); return -EIO; } @@ -1338,7 +1344,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: - btrfs_err(fs_info, + btrfs_err_in_rcu(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", (info->physical >> device->zone_info->zone_size_shift), rcu_str_deref(device->name), device->devid); @@ -1357,6 +1363,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, break; } + up_read(&dev_replace->rwsem); + return 0; } @@ -1390,6 +1398,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, return -EINVAL; } + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", @@ -1416,7 +1426,6 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } bg->alloc_offset = zone_info[0].alloc_offset; - bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); return 0; } @@ -1434,6 +1443,9 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, return -EINVAL; } + /* In case a device is missing we have a cap of 0, so don't use it. */ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + for (i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV || zone_info[i].alloc_offset == WP_CONVENTIONAL) @@ -1455,9 +1467,6 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } - /* In case a device is missing we have a cap of 0, so don't use it. */ - bg->zone_capacity = min_not_zero(zone_info[0].capacity, - zone_info[1].capacity); } if (zone_info[0].alloc_offset != WP_MISSING_DEV) @@ -1547,6 +1556,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; + u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; @@ -1563,11 +1573,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (!map) return -EINVAL; - cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); - if (!cache->physical_map) { - ret = -ENOMEM; - goto out; - } + cache->physical_map = map; zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); if (!zone_info) { @@ -1611,7 +1617,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } - switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + switch (profile) { case 0: /* single */ ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; @@ -1638,6 +1645,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && + profile != BTRFS_BLOCK_GROUP_RAID10) { + /* + * Detected broken write pointer. Make this block group + * unallocatable by setting the allocation pointer at the end of + * allocatable region. Relocating this block group will fix the + * mismatch. + * + * Currently, we cannot handle RAID0 or RAID10 case like this + * because we don't have a proper zone_capacity value. But, + * reading from this block group won't work anyway by a missing + * stripe. + */ + cache->alloc_offset = cache->zone_capacity; + ret = 0; + } + out: /* Reject non SINGLE data profiles without RST */ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && @@ -1679,7 +1703,6 @@ out: } bitmap_free(active); kfree(zone_info); - btrfs_free_chunk_map(map); return ret; } @@ -1713,14 +1736,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) if (!btrfs_is_zoned(fs_info)) return false; - if (!inode || !is_data_inode(&inode->vfs_inode)) + if (!inode || !is_data_inode(inode)) return false; if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) return false; /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the * extent layout the relocation code has. * Furthermore we have set aside own block-group from which only the * relocation "process" can allocate and make sure only one process at a @@ -1755,7 +1778,7 @@ void btrfs_record_physical_zoned(struct btrfs_bio *bbio) static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, u64 logical) { - struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree; + struct extent_map_tree *em_tree = &ordered->inode->extent_tree; struct extent_map *em; ordered->disk_bytenr = logical; @@ -1763,7 +1786,9 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); - em->block_start = logical; + /* The em should be a new COW extent, thus it should not have an offset. */ + ASSERT(em->offset == 0); + em->disk_bytenr = logical; free_extent_map(em); write_unlock(&em_tree->lock); } @@ -1774,7 +1799,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, struct btrfs_ordered_extent *new; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset, + split_extent_map(ordered->inode, ordered->file_offset, ordered->num_bytes, len, logical)) return false; @@ -1788,7 +1813,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *sum; u64 logical, len; @@ -1832,7 +1857,7 @@ out: * here so that we don't attempt to log the csums later. */ if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { + test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) { while ((sum = list_first_entry_or_null(&ordered->list, typeof(*sum), list))) { list_del(&sum->list); @@ -1952,7 +1977,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group->meta_write_pointer > eb->start) return -EBUSY; - /* If for_sync, this hole will be filled with trasnsaction commit. */ + /* If for_sync, this hole will be filled with transaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; @@ -2164,6 +2189,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret = 0; int i; @@ -2201,8 +2227,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* Ensure all writes in this block group finish */ btrfs_wait_block_group_reservations(block_group); /* No need to wait for NOCOW writers. Zoned mode does not allow that */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, - block_group->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group); /* Wait for extent buffers to be written. */ if (is_metadata) wait_eb_writebacks(block_group); @@ -2239,27 +2264,33 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); + down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + unsigned int nofs_flags; if (zinfo->max_active_zones == 0) continue; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); - if (ret) + if (ret) { + up_read(&dev_replace->rwsem); return ret; + } if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } + up_read(&dev_replace->rwsem); if (!fully_written) btrfs_dec_block_group_ro(block_group); @@ -2420,7 +2451,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); } -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; @@ -2621,3 +2652,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->zone_active_bgs_lock); } + +/* + * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. + * + * @space_info: the space to work on + * @num_bytes: targeting reclaim bytes + * + * This one resets the zones of a block group, so we can reuse the region + * without removing the block group. On the other hand, btrfs_delete_unused_bgs() + * just removes a block group and frees up the underlying zones. So, we still + * need to allocate a new block group to reuse the zones. + * + * Resetting is faster than deleting/recreating a block group. It is similar + * to freeing the logical space on the regular mode. However, we cannot change + * the block group's profile with this operation. + */ +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + while (num_bytes > 0) { + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg = NULL; + bool found = false; + u64 reclaimed = 0; + + /* + * Here, we choose a fully zone_unusable block group. It's + * technically possible to reset a partly zone_unusable block + * group, which still has some free space left. However, + * handling that needs to cope with the allocation side, which + * makes the logic more complex. So, let's handle the easy case + * for now. + */ + spin_lock(&fs_info->unused_bgs_lock); + list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { + if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) + continue; + + /* + * Use trylock to avoid locking order violation. In + * btrfs_reclaim_bgs_work(), the lock order is + * &bg->lock -> &fs_info->unused_bgs_lock. We skip a + * block group if we cannot take its lock. + */ + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + found = true; + break; + } + if (!found) { + spin_unlock(&fs_info->unused_bgs_lock); + return 0; + } + + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * Since the block group is fully zone_unusable and we cannot + * allocate from this block group anymore, we don't need to set + * this block group read-only. + */ + + down_read(&fs_info->dev_replace.rwsem); + map = bg->physical_map; + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + unsigned int nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, + stripe->physical >> SECTOR_SHIFT, + zone_size_sectors); + memalloc_nofs_restore(nofs_flags); + + if (ret) { + up_read(&fs_info->dev_replace.rwsem); + return ret; + } + } + up_read(&fs_info->dev_replace.rwsem); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + ASSERT(!btrfs_is_block_group_used(bg)); + if (bg->ro) { + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + continue; + } + + reclaimed = bg->alloc_offset; + bg->zone_unusable = bg->length - bg->zone_capacity; + bg->alloc_offset = 0; + /* + * This holds because we currently reset fully used then freed + * block group. + */ + ASSERT(reclaimed == bg->zone_capacity); + bg->free_space_ctl->free_space += reclaimed; + space_info->bytes_zone_unusable -= reclaimed; + spin_unlock(&bg->lock); + btrfs_return_free_space(space_info, reclaimed); + spin_unlock(&space_info->lock); + + if (num_bytes <= reclaimed) + break; + num_bytes -= reclaimed; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f573bda496fb..9672bf4c3335 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,12 +4,27 @@ #define BTRFS_ZONED_H #include <linux/types.h> +#include <linux/atomic.h> #include <linux/blkdev.h> +#include <linux/blkzoned.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> #include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" +#include "fs.h" + +struct block_device; +struct extent_buffer; +struct btrfs_bio; +struct btrfs_ordered_extent; +struct btrfs_fs_info; +struct btrfs_space_info; +struct btrfs_eb_write_context; +struct btrfs_fs_devices; #define BTRFS_DEFAULT_RECLAIM_THRESH (75) @@ -38,14 +53,13 @@ struct btrfs_zoned_device_info { void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered); #ifdef CONFIG_BLK_DEV_ZONED -int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, - struct blk_zone *zone); int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt); +int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, + unsigned long long *mount_opt); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, @@ -75,19 +89,15 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); #else /* CONFIG_BLK_DEV_ZONED */ -static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, - struct blk_zone *zone) -{ - return 0; -} static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) { @@ -121,8 +131,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) return -EOPNOTSUPP; } -static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, - unsigned long *mount_opt) +static inline int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, + unsigned long long *mount_opt) { return 0; } @@ -233,7 +243,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } -static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { return false; } @@ -256,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } +static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, + u64 num_bytes) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1d4..5232b56d5892 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -18,8 +18,10 @@ #include <linux/slab.h> #include <linux/zstd.h> #include "misc.h" +#include "fs.h" +#include "btrfs_inode.h" #include "compression.h" -#include "ctree.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -109,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; + ASSERT(timer == &wsm.timer); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { @@ -373,51 +377,63 @@ fail: return ERR_PTR(-ENOMEM); } -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); zstd_cstream *stream; int ret = 0; - int nr_pages = 0; - struct page *in_page = NULL; /* The current page to read */ - struct page *out_page = NULL; /* The current page to write to */ + int nr_folios = 0; + struct folio *in_folio = NULL; /* The current folio to read. */ + struct folio *out_folio = NULL; /* The current folio to write to. */ unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; - const unsigned long nr_dest_pages = *out_pages; - unsigned long max_out = nr_dest_pages * PAGE_SIZE; + const unsigned long nr_dest_folios = *out_folios; + const u64 orig_end = start + len; + unsigned long max_out = nr_dest_folios * PAGE_SIZE; + unsigned int pg_off; + unsigned int cur_len; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ stream = zstd_init_cstream(¶ms, len, workspace->mem, workspace->size); - if (!stream) { - pr_warn("BTRFS: zstd_init_cstream failed\n"); + if (unlikely(!stream)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_err(inode->root->fs_info, + "zstd compression init level %d failed, root %llu inode %llu offset %llu", + workspace->req_level, btrfs_root_id(inode->root), + btrfs_ino(inode), start); ret = -EIO; goto out; } /* map in the first page of input data */ - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); + if (ret < 0) + goto out; + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -426,9 +442,14 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_compress_stream returned %d\n", - zstd_get_error_code(ret2)); + if (unlikely(zstd_is_error(ret2))) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_warn(inode->root->fs_info, +"zstd compression level %d failed, error %d root %llu inode %llu offset %llu", + workspace->req_level, zstd_get_error_code(ret2), + btrfs_root_id(inode->root), btrfs_ino(inode), + start); ret = -EIO; goto out; } @@ -452,17 +473,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -476,24 +497,34 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += PAGE_SIZE; + tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); - put_page(in_page); - start += PAGE_SIZE; - len -= PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap_local_page(in_page); + workspace->in_buf.src = NULL; + folio_put(in_folio); + start += cur_len; + len -= cur_len; + ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); + if (ret < 0) + goto out; + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; } } while (1) { size_t ret2; ret2 = zstd_end_stream(stream, &workspace->out_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_end_stream returned %d\n", - zstd_get_error_code(ret2)); + if (unlikely(zstd_is_error(ret2))) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_err(inode->root->fs_info, +"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", + workspace->req_level, zstd_get_error_code(ret2), + btrfs_root_id(inode->root), btrfs_ino(inode), + start); ret = -EIO; goto out; } @@ -509,17 +540,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -533,10 +564,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = tot_in; *total_out = tot_out; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (workspace->in_buf.src) { kunmap_local(workspace->in_buf.src); - put_page(in_page); + folio_put(in_folio); } return ret; } @@ -544,24 +575,28 @@ out: int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; - unsigned long page_in_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long folio_in_index = 0; + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); - if (!stream) { - pr_debug("BTRFS: zstd_init_dstream failed\n"); + if (unlikely(!stream)) { + struct btrfs_inode *inode = cb->bbio.inode; + + btrfs_err(inode->root->fs_info, + "zstd decompression init failed, root %llu inode %llu offset %llu", + btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); ret = -EIO; goto done; } - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -574,9 +609,13 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); + if (unlikely(zstd_is_error(ret2))) { + struct btrfs_inode *inode = cb->bbio.inode; + + btrfs_err(inode->root->fs_info, + "zstd decompression failed, error %d root %llu inode %llu offset %llu", + zstd_get_error_code(ret2), btrfs_root_id(inode->root), + btrfs_ino(inode), cb->start); ret = -EIO; goto done; } @@ -598,14 +637,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = + kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -618,80 +658,58 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); - if (!stream) { - pr_warn("BTRFS: zstd_init_dstream failed\n"); + if (unlikely(!stream)) { + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(inode->root->fs_info, + "zstd decompression init failed, root %llu inode %llu offset %llu", + btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(dest_folio)); ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n"); - ret = -EIO; - goto finish; - } - ret2 = zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (unlikely(zstd_is_error(ret))) { + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(inode->root->fs_info, + "zstd decompression failed, error %d root %llu inode %llu offset %llu", + zstd_get_error_code(ret), btrfs_root_id(inode->root), + btrfs_ino(inode), folio_pos(dest_folio)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. */ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); } return ret; } |