Diffstat (limited to 'fs/btrfs')
75 files changed, 2789 insertions, 2723 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index c352f3ae0385..ea95c90c8474 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -114,6 +114,8 @@ config BTRFS_EXPERIMENTAL - extent tree v2 - complex rework of extent tracking + - large folio support + If unsure, say N. config BTRFS_FS_REF_VERIFY diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index e3716516ca38..861c7d92c437 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -9,27 +9,24 @@ #include "fs.h" #include "accessors.h" -static bool check_setget_bounds(const struct extent_buffer *eb, - const void *ptr, unsigned off, int size) +static void __cold report_setget_bounds(const struct extent_buffer *eb, + const void *ptr, unsigned off, int size) { - const unsigned long member_offset = (unsigned long)ptr + off; + unsigned long member_offset = (unsigned long)ptr + off; - if (unlikely(member_offset + size > eb->len)) { - btrfs_warn(eb->fs_info, - "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", - (member_offset > eb->len ? "start" : "end"), - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - - return true; + btrfs_warn(eb->fs_info, + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), + (unsigned long)ptr, eb->start, member_offset, size); } -void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +/* Copy bytes from @src1 and @src2 to @dest. */ +static __always_inline void memcpy_split_src(char *dest, const char *src1, + const char *src2, const size_t len1, + const size_t total) { - token->eb = eb; - token->kaddr = folio_address(eb->folios[0]); - token->offset = 0; + memcpy(dest, src1, len1); + memcpy(dest + len1, src2, total - len1); } /* @@ -41,11 +38,6 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e * - btrfs_set_8 (for 8/16/32/64) * - btrfs_get_8 (for 8/16/32/64) * - * Generic helpers with a token (cached address of the most recently accessed - * page): - * - btrfs_set_token_8 (for 8/16/32/64) - * - btrfs_get_token_8 (for 8/16/32/64) - * * The set/get functions handle data spanning two pages transparently, in case * metadata block size is larger than page. Every pointer to metadata items is * an offset into the extent buffer page array, cast to a specific type. 
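The accessors.c hunk above replaces the token-based accessors with direct reads: a value whose bytes straddle two folios is staged in a small on-stack buffer (the new memcpy_split_src() helper) and then decoded as little-endian. A minimal userspace sketch of that split-source read, with plain byte arrays standing in for extent-buffer folios (names here are illustrative, not btrfs code):

/* Copy a value whose bytes live partly in one buffer and partly in the next
 * (cf. memcpy_split_src() in accessors.c), then decode it as little-endian. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void memcpy_split_src(char *dest, const char *src1, const char *src2,
                             size_t len1, size_t total)
{
        memcpy(dest, src1, len1);                /* part held by the first buffer */
        memcpy(dest + len1, src2, total - len1); /* remainder from the second buffer */
}

static uint32_t get_le32(const uint8_t *p)
{
        return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
               (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

int main(void)
{
        /* Pretend a u32 item member crosses a folio boundary: 3 bytes at the
         * end of one folio, 1 byte at the start of the next. */
        const uint8_t folio_a[] = { 0x78, 0x56, 0x34 };
        const uint8_t folio_b[] = { 0x12 };
        uint8_t lebytes[4];

        memcpy_split_src((char *)lebytes, (const char *)folio_a,
                         (const char *)folio_b, sizeof(folio_a), sizeof(lebytes));
        printf("value = 0x%08x\n", get_le32(lebytes));   /* 0x12345678 */
        return 0;
}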
This @@ -57,118 +49,66 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ - const unsigned long oil = get_eb_offset_in_folio(token->eb, \ - member_offset);\ - const int unit_size = token->eb->folio_size; \ - const int unit_shift = token->eb->folio_shift; \ - const int size = sizeof(u##bits); \ - u8 lebytes[sizeof(u##bits)]; \ - const int part = unit_size - oil; \ - \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + unit_size) { \ - return get_unaligned_le##bits(token->kaddr + oil); \ - } \ - token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << unit_shift; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ - return get_unaligned_le##bits(token->kaddr + oil); \ - \ - memcpy(lebytes, token->kaddr + oil, part); \ - token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << unit_shift; \ - memcpy(lebytes + part, token->kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ - const unsigned long oil = get_eb_offset_in_folio(eb, \ - member_offset);\ - const int unit_size = eb->folio_size; \ - char *kaddr = folio_address(eb->folios[idx]); \ - const int size = sizeof(u##bits); \ - const int part = unit_size - oil; \ - u8 lebytes[sizeof(u##bits)]; \ - \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ - return get_unaligned_le##bits(kaddr + oil); \ - \ - memcpy(lebytes, kaddr + oil, part); \ - kaddr = folio_address(eb->folios[idx + 1]); \ - memcpy(lebytes + part, kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ - const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + const unsigned long oif = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = token->eb->folio_size; \ - const int unit_shift = token->eb->folio_shift; \ - const int size = sizeof(u##bits); \ + char *kaddr = folio_address(eb->folios[idx]) + oif; \ + const int part = eb->folio_size - oif; \ u8 lebytes[sizeof(u##bits)]; \ - const int part = unit_size - oil; \ \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + unit_size) { \ - put_unaligned_le##bits(val, token->kaddr + oil); \ - return; \ + if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \ + report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \ + return 0; \ } \ - token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << unit_shift; \ - if 
(INLINE_EXTENT_BUFFER_PAGES == 1 || \ - oil + size <= unit_size) { \ - put_unaligned_le##bits(val, token->kaddr + oil); \ - return; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \ + likely(sizeof(u##bits) <= part)) \ + return get_unaligned_le##bits(kaddr); \ + \ + if (sizeof(u##bits) == 2) { \ + lebytes[0] = *kaddr; \ + kaddr = folio_address(eb->folios[idx + 1]); \ + lebytes[1] = *kaddr; \ + } else { \ + memcpy_split_src(lebytes, kaddr, \ + folio_address(eb->folios[idx + 1]), \ + part, sizeof(u##bits)); \ } \ - put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oil, lebytes, part); \ - token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << unit_shift; \ - memcpy(token->kaddr, lebytes + part, size - part); \ + return get_unaligned_le##bits(lebytes); \ } \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ - const unsigned long oil = get_eb_offset_in_folio(eb, \ + const unsigned long oif = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = eb->folio_size; \ - char *kaddr = folio_address(eb->folios[idx]); \ - const int size = sizeof(u##bits); \ - const int part = unit_size - oil; \ + char *kaddr = folio_address(eb->folios[idx]) + oif; \ + const int part = eb->folio_size - oif; \ u8 lebytes[sizeof(u##bits)]; \ \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ - oil + size <= unit_size) { \ - put_unaligned_le##bits(val, kaddr + oil); \ + if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \ + report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \ + return; \ + } \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \ + likely(sizeof(u##bits) <= part)) { \ + put_unaligned_le##bits(val, kaddr); \ return; \ } \ - \ put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oil, lebytes, part); \ - kaddr = folio_address(eb->folios[idx + 1]); \ - memcpy(kaddr, lebytes + part, size - part); \ + if (sizeof(u##bits) == 2) { \ + *kaddr = lebytes[0]; \ + kaddr = folio_address(eb->folios[idx + 1]); \ + *kaddr = lebytes[1]; \ + } else { \ + memcpy(kaddr, lebytes, part); \ + kaddr = folio_address(eb->folios[idx + 1]); \ + memcpy(kaddr, lebytes + part, sizeof(u##bits) - part); \ + } \ } DEFINE_BTRFS_SETGET_BITS(8) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 15ea6348800b..99b3ced12805 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -16,14 +16,6 @@ struct extent_buffer; -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - -void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); - /* * Some macros to generate set/get functions for the struct fields. 
This * assumes there is a lefoo_to_cpu for every type, so lets make a simple one @@ -56,11 +48,6 @@ static inline void put_unaligned_le8(u8 val, void *p) sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off); \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ @@ -83,18 +70,6 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ @@ -479,18 +454,6 @@ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ } BTRFS_ITEM_SETGET_FUNCS(offset) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ed497f5f8d1b..6a450be293b1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -733,7 +733,6 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, struct preftrees *preftrees, struct share_check *sc) { - int err; int ret = 0; struct ulist *parents; struct ulist_node *node; @@ -752,6 +751,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, */ while ((rnode = rb_first_cached(&preftrees->indirect.root))) { struct prelim_ref *ref; + int ret2; ref = rb_entry(rnode, struct prelim_ref, rbnode); if (WARN(ref->parent, @@ -773,18 +773,18 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); + ret2 = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. 
*/ - if (err == -ENOENT) { + if (ret2 == -ENOENT) { prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; - } else if (err) { + } else if (ret2) { free_pref(ref); - ret = err; + ret = ret2; goto out; } @@ -2201,7 +2201,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, int ret; u64 flags; u64 size = 0; - u32 item_size; const struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; @@ -2244,7 +2243,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } eb = path->nodes[0]; - item_size = btrfs_item_size(eb, path->slots[0]); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); @@ -2252,7 +2250,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_debug(fs_info, "logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u", logical, logical - found_key->objectid, found_key->objectid, - found_key->offset, flags, item_size); + found_key->offset, flags, btrfs_item_size(eb, path->slots[0])); WARN_ON(!flags_ret); if (flags_ret) { @@ -2548,17 +2546,20 @@ static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *c } int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, void *ctx, bool ignore_offset) { struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; u64 flags = 0; struct btrfs_key found_key; - int search_commit_root = path->search_commit_root; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); - btrfs_release_path(path); + btrfs_free_path(path); if (ret < 0) return ret; if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) @@ -2571,8 +2572,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, walk_ctx.extent_item_pos = logical - found_key.objectid; walk_ctx.fs_info = fs_info; - return iterate_extent_inodes(&walk_ctx, search_commit_root, - build_ino_list, ctx); + return iterate_extent_inodes(&walk_ctx, false, build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, @@ -3161,18 +3161,14 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) ASSERT(!cache->nr_edges); } -void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which) +static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper) { ASSERT(upper && lower && upper->level == lower->level + 1); edge->node[LOWER] = lower; edge->node[UPPER] = upper; - if (link_which & LINK_LOWER) - list_add_tail(&edge->list[LOWER], &lower->upper); - if (link_which & LINK_UPPER) - list_add_tail(&edge->list[UPPER], &upper->lower); + list_add_tail(&edge->list[LOWER], &lower->upper); } /* * Handle direct tree backref @@ -3242,7 +3238,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } - btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER); + btrfs_backref_link_edge(edge, cur, upper); return 0; } @@ -3412,7 +3408,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, if (!upper->owner) upper->owner = btrfs_header_owner(eb); } - btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER); + btrfs_backref_link_edge(edge, lower, upper); if (rb_node) { 
btrfs_put_root(root); @@ -3570,7 +3566,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, ASSERT(start->checked); - rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); + rb_node = rb_simple_insert(&cache->rb_root, &start->simple_node); if (rb_node) btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); @@ -3621,8 +3617,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, return -EUCLEAN; } - rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); + rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node); if (unlikely(rb_node)) { btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 953637115956..34b0193a181c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -226,8 +226,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, void *ctx, - bool ignore_offset); + void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); @@ -313,10 +312,15 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); * Represent a tree block in the backref cache */ struct btrfs_backref_node { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simple_node for search/insert */ + union{ + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; /* * This is a sanity check, whenever we COW a block we will update @@ -423,13 +427,6 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1U << 0) -#define LINK_UPPER (1U << 1) - -void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which); void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f7d8958b7327..50b5fc1c06d7 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -27,12 +27,12 @@ struct btrfs_failed_bio { }; /* Is this a data path I/O that needs storage layer checksum and repair? 
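The backref.h change above overlays the existing (rb_node, bytenr) pair of struct btrfs_backref_node with struct rb_simple_node through an anonymous union, so rb_simple_insert() can be handed the embedded node directly while the existing field accesses keep working. A standalone sketch of that layout trick, using stand-in types rather than the kernel's:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct rb_node_stub {                   /* stand-in for struct rb_node */
        struct rb_node_stub *left;
        struct rb_node_stub *right;
};

struct simple_node {                    /* stand-in for struct rb_simple_node */
        struct rb_node_stub rb_node;
        uint64_t bytenr;
};

struct backref_node_like {
        union {
                struct {                        /* field-wise access: n.rb_node, n.bytenr */
                        struct rb_node_stub rb_node;
                        uint64_t bytenr;
                };
                struct simple_node simple_node; /* handed as one unit to the rb helpers */
        };
        int level;
};

int main(void)
{
        struct backref_node_like n = { .bytenr = 4096, .level = 1 };

        /* Both views must (and do) alias the same storage, so no conversion
         * is needed when switching between them. */
        printf("named field:     %llu (offset %zu)\n",
               (unsigned long long)n.bytenr,
               offsetof(struct backref_node_like, bytenr));
        printf("via simple_node: %llu (offset %zu)\n",
               (unsigned long long)n.simple_node.bytenr,
               offsetof(struct backref_node_like, simple_node.bytenr));
        return 0;
}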
*/ -static inline bool is_data_bbio(struct btrfs_bio *bbio) +static inline bool is_data_bbio(const struct btrfs_bio *bbio) { return bbio->inode && is_data_inode(bbio->inode); } -static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) +static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio) { return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE; } @@ -134,14 +134,14 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) } } -static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror) { if (cur_mirror == fbio->num_copies) return cur_mirror + 1 - fbio->num_copies; return cur_mirror + 1; } -static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror) { if (cur_mirror == 1) return fbio->num_copies; @@ -165,12 +165,6 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); int mirror = repair_bbio->mirror_num; - /* - * We can only trigger this for data bio, which doesn't support larger - * folios yet. - */ - ASSERT(folio_order(page_folio(bv->bv_page)) == 0); - if (repair_bbio->bio.bi_status || !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); @@ -301,7 +295,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de btrfs_bio_end_io(bbio, bbio->bio.bi_status); } -static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) return; @@ -316,8 +310,8 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); } -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, - struct bio *bio) +static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info, + const struct bio *bio) { if (bio->bi_opf & REQ_META) return fs_info->endio_meta_workers; @@ -439,7 +433,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) ASSERT(btrfs_dev_is_sequential(dev, physical)); bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } - btrfs_debug_in_rcu(dev->fs_info, + btrfs_debug(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), @@ -845,7 +839,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, goto out_bio_uninit; } - btrfs_info_rl_in_rcu(fs_info, + btrfs_info_rl(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, btrfs_dev_name(smap.dev), smap.physical >> SECTOR_SHIFT); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5b0cb04b2b93..9bf282d2453c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group } #endif +static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group) +{ + /* The meta_write_pointer is available only on the zoned setup. 
*/ + if (!btrfs_is_zoned(block_group->fs_info)) + return false; + + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + return false; + + return block_group->start + block_group->alloc_offset > + block_group->meta_write_pointer; +} + /* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress @@ -832,8 +845,8 @@ out: static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { - btrfs_clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_DIRTY); + btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_DIRTY, NULL); } static noinline void caching_thread(struct btrfs_work *work) @@ -877,7 +890,7 @@ static noinline void caching_thread(struct btrfs_work *work) */ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) - ret = load_free_space_tree(caching_ctl); + ret = btrfs_load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); done: @@ -1235,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * another task to attempt to create another block group with the same * item key (and failing with -EEXIST and a transaction abort). */ - ret = remove_block_group_free_space(trans, block_group); + ret = btrfs_remove_block_group_free_space(trans, block_group); if (ret) goto out; @@ -1244,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; spin_lock(&block_group->lock); + /* + * Hitting this WARN means we removed a block group with an unwritten + * region. It will cause "unable to find chunk map for logical" errors. + */ + if (WARN_ON(has_unwritten_metadata(block_group))) + btrfs_warn(fs_info, + "block group %llu is removed before metadata write out", + block_group->start); + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); /* @@ -1403,7 +1425,7 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false); } return ret; } @@ -1436,14 +1458,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { - ret = btrfs_clear_extent_bits(&prev_trans->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY, NULL); if (ret) goto out; } - ret = btrfs_clear_extent_bits(&trans->transaction->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY, NULL); out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) @@ -1586,8 +1608,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. 
*/ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used && - block_group->zone_unusable < block_group->length) { + if ((space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) || + has_unwritten_metadata(block_group)) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the @@ -1616,8 +1639,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) ret = btrfs_zone_finish(block_group); if (ret < 0) { btrfs_dec_block_group_ro(block_group); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + btrfs_link_bg_list(block_group, &retry_list); ret = 0; + } goto next; } @@ -1843,7 +1868,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { - u64 zone_unusable; u64 used; u64 reserved; int ret = 0; @@ -1910,16 +1934,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the block group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. We also - * cache it before unlocking the block group, to prevent races - * (reports from KCSAN and such tools) with tasks updating it. - */ - zone_unusable = bg->zone_unusable; - spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1963,14 +1977,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) reserved = bg->reserved; spin_unlock(&bg->lock); - btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable", - bg->start, - div64_u64(used * 100, bg->length), - div64_u64(reserved * 100, bg->length), - div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); - ret = btrfs_relocate_chunk(fs_info, bg->start); + ret = btrfs_relocate_chunk(fs_info, bg->start, false); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", @@ -2372,7 +2380,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); - set_free_space_tree_thresholds(cache); + btrfs_set_free_space_tree_thresholds(cache); if (need_clear) { /* @@ -2791,7 +2799,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) block_group->length); if (ret) btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, block_group); + btrfs_add_block_group_free_space(trans, block_group); /* * If we restriped during balance, we may have added a new raid @@ -2889,7 +2897,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); cache->length = size; - set_free_space_tree_thresholds(cache); + btrfs_set_free_space_tree_thresholds(cache); cache->flags = type; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); @@ -3636,9 +3644,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); ret = update_block_group_item(trans, path, cache); - } - if (ret) + if (ret) + btrfs_abort_transaction(trans, ret); + } else if (ret) { btrfs_abort_transaction(trans, ret); + } } /* If its not 
on the io list, we need to put the block group */ @@ -4298,7 +4308,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, bytes, type); - btrfs_dump_space_info(fs_info, info, 0, 0); + btrfs_dump_space_info(fs_info, info, 0, false); } if (left < bytes) { @@ -4443,7 +4453,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) * indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); + btrfs_dump_space_info(info, space_info, 0, false); /* * If there was a failure to cleanup a log tree, very likely due to an @@ -4454,7 +4464,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); + btrfs_dump_space_info(info, space_info, 0, false); } WARN_ON(space_info->reclaim_size > 0); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9de356bcb411..a8bb8429c966 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -83,6 +83,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Set after we add a new block group to the free space tree. */ + BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, /* @@ -244,6 +246,11 @@ struct btrfs_block_group { /* Lock for free space tree operations. */ struct mutex free_space_lock; + /* Protected by @free_space_lock. */ + bool using_free_space_bitmaps; + /* Protected by @free_space_lock. */ + bool using_free_space_bitmaps_cached; + /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a79fa0726f1d..b99fb0273292 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -525,6 +525,19 @@ static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) mapping_set_stable_writes(inode->vfs_inode.i_mapping); } +static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) +{ + /* Metadata inode should not reach here. */ + ASSERT(is_data_inode(inode)); + + /* We only allow BITS_PER_LONGS blocks for each bitmap. 
*/ +#ifdef CONFIG_BTRFS_EXPERIMENTAL + mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0, + ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits) + >> PAGE_SHIFT))); +#endif +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 48d07939fee4..d09d622016ef 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -282,8 +282,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) { struct inode *inode = &cb->bbio.inode->vfs_inode; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - unsigned long index = cb->start >> PAGE_SHIFT; - unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; + pgoff_t index = cb->start >> PAGE_SHIFT; + const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; int i; int ret; @@ -415,7 +415,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, int *memstall, unsigned long *pflags) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - unsigned long end_index; + pgoff_t end_index; struct bio *orig_bio = &cb->orig_bbio->bio; u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); @@ -446,8 +446,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (cur < compressed_end) { - u64 page_end; - u64 pg_index = cur >> PAGE_SHIFT; + pgoff_t page_end; + pgoff_t pg_index = cur >> PAGE_SHIFT; u32 add_size; if (pg_index > end_index) @@ -789,8 +789,8 @@ static void btrfs_init_workspace_manager(int type) */ workspace = alloc_workspace(type, 0); if (IS_ERR(workspace)) { - pr_warn( - "BTRFS: cannot preallocate compression workspace, will try later\n"); + btrfs_warn(NULL, + "cannot preallocate compression workspace, will try later"); } else { atomic_set(&wsm->total_ws, 1); wsm->free_ws = 1; @@ -888,9 +888,9 @@ again: /* once per minute */ 60 * HZ, /* no burst */ 1); - if (__ratelimit(&_rs)) { - pr_warn("BTRFS: no compression workspaces, low memory, retrying\n"); - } + if (__ratelimit(&_rs)) + btrfs_warn(NULL, + "no compression workspaces, low memory, retrying"); } goto again; } @@ -975,7 +975,7 @@ static int btrfs_compress_set_level(unsigned int type, int level) if (level == 0) level = ops->default_level; else - level = min(max(level, ops->min_level), ops->max_level); + level = clamp(level, ops->min_level, ops->max_level); return level; } @@ -1482,7 +1482,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, struct heuristic_ws *ws) { struct page *page; - u64 index, index_end; + pgoff_t index, index_end; u32 i, curr_sample_pos; u8 *in_data; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d34c4341eaf4..1b38e707bbd9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -13,6 +13,7 @@ #include <linux/wait.h> #include <linux/pagemap.h> #include "bio.h" +#include "fs.h" #include "messages.h" struct address_space; @@ -77,12 +78,10 @@ struct compressed_bio { /* @range_end must be exclusive. */ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { - const u64 folio_end = folio_pos(folio) + folio_size(folio); - /* @cur must be inside the folio. 
*/ ASSERT(folio_pos(folio) <= cur); - ASSERT(cur < folio_end); - return min(range_end, folio_end) - cur; + ASSERT(cur < folio_end(folio)); + return min(range_end, folio_end(folio)) - cur; } int __init btrfs_init_compress(void); @@ -114,6 +113,8 @@ enum btrfs_compression_type { BTRFS_COMPRESS_LZO = 2, BTRFS_COMPRESS_ZSTD = 3, BTRFS_NR_COMPRESS_TYPES = 4, + + BTRFS_DEFRAG_DONT_COMPRESS, }; struct workspace_manager { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a2e7979372cc..74e6d7f3d266 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ - if (atomic_inc_not_zero(&eb->refs)) { + if (refcount_inc_not_zero(&eb->refs)) { rcu_read_unlock(); break; } @@ -283,15 +283,26 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + if (unlikely(btrfs_header_generation(buf) > trans->transid)) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_inc_ref(trans, root, cow, 1); - else + if (ret) + btrfs_abort_transaction(trans, ret); + } else { ret = btrfs_inc_ref(trans, root, cow, 0); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); - btrfs_abort_transaction(trans, ret); return ret; } @@ -303,9 +314,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { const u64 buf_gen = btrfs_header_generation(buf); @@ -549,7 +560,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } - atomic_inc(&cow->refs); + refcount_inc(&cow->refs); rcu_assign_pointer(root->node, cow); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, @@ -602,9 +613,9 @@ error_unlock_cow: return ret; } -static inline int should_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +static inline int should_cow_block(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) return 0; @@ -724,7 +735,7 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. 
*/ -int btrfs_bin_search(struct extent_buffer *eb, int first_slot, +int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot) { unsigned long p; @@ -1081,7 +1092,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - atomic_inc(&left->refs); + refcount_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -1268,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * to the block in 'slot', and triggering ra on them. */ static void reada_for_search(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, + const struct btrfs_path *path, int level, int slot, u64 objectid) { struct extent_buffer *node; @@ -1350,7 +1361,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, } } -static noinline void reada_for_balance(struct btrfs_path *path, int level) +static noinline void reada_for_balance(const struct btrfs_path *path, int level) { struct extent_buffer *parent; int slot; @@ -1446,8 +1457,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, u64 blocknr; struct extent_buffer *tmp = NULL; int ret = 0; + int ret2; int parent_level; - int err; bool read_tmp = false; bool tmp_locked = false; bool path_released = false; @@ -1505,9 +1516,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } /* Now we're allowed to do a blocking uptodate check. */ - err = btrfs_read_extent_buffer(tmp, &check); - if (err) { - ret = err; + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; goto out; } @@ -1548,9 +1559,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } /* Now we're allowed to do a blocking uptodate check. 
*/ - err = btrfs_read_extent_buffer(tmp, &check); - if (err) { - ret = err; + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; goto out; } @@ -1685,7 +1696,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, if (p->search_commit_root) { b = root->commit_root; - atomic_inc(&b->refs); + refcount_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1794,7 +1805,7 @@ static int finish_need_commit_sem_search(struct btrfs_path *path) return 0; } -static inline int search_for_key_slot(struct extent_buffer *eb, +static inline int search_for_key_slot(const struct extent_buffer *eb, int search_low_slot, const struct btrfs_key *key, int prev_cmp, @@ -1928,15 +1939,14 @@ static int search_leaf(struct btrfs_trans_handle *trans, ASSERT(leaf_free_space >= 0); if (leaf_free_space < ins_len) { - int err; - - err = split_leaf(trans, root, key, path, ins_len, - (ret == 0)); - ASSERT(err <= 0); - if (WARN_ON(err > 0)) - err = -EUCLEAN; - if (err) - ret = err; + int ret2; + + ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0)); + ASSERT(ret2 <= 0); + if (WARN_ON(ret2 > 0)) + ret2 = -EUCLEAN; + if (ret2) + ret = ret2; } } @@ -1982,7 +1992,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; /* everything at write_lock_level or lower must be write locked */ @@ -2053,6 +2062,7 @@ again: while (b) { int dec = 0; + int ret2; level = btrfs_header_level(b); @@ -2081,16 +2091,15 @@ again: } if (last_level) - err = btrfs_cow_block(trans, root, b, NULL, 0, - &b, - BTRFS_NESTING_COW); + ret2 = btrfs_cow_block(trans, root, b, NULL, 0, + &b, BTRFS_NESTING_COW); else - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b, - BTRFS_NESTING_COW); - if (err) { - ret = err; + ret2 = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b, + BTRFS_NESTING_COW); + if (ret2) { + ret = ret2; goto done; } } @@ -2138,12 +2147,12 @@ cow_done: slot--; } p->slots[level] = slot; - err = setup_nodes_for_search(trans, root, p, b, level, ins_len, - &write_lock_level); - if (err == -EAGAIN) + ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, + &write_lock_level); + if (ret2 == -EAGAIN) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } b = p->nodes[level]; @@ -2169,11 +2178,11 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, slot, key); - if (err == -EAGAIN && !p->nowait) + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } @@ -2236,7 +2245,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; @@ -2261,6 +2269,7 @@ again: while (b) { int dec = 0; + int ret2; level = btrfs_header_level(b); p->nodes[level] = b; @@ -2296,11 +2305,11 @@ again: goto done; } - err = read_block_for_search(root, p, &b, slot, key); - if (err == -EAGAIN && !p->nowait) + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } @@ -2872,6 +2881,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (ret < 0) { int ret2; + 
btrfs_clear_buffer_dirty(trans, c); ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); if (ret2 < 0) btrfs_abort_transaction(trans, ret2); @@ -2885,7 +2895,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, free_extent_buffer(old); add_root_to_dirty_list(root); - atomic_inc(&c->refs); + refcount_inc(&c->refs); path->nodes[level] = c; path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; @@ -3100,7 +3110,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = right->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; - struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -3174,13 +3183,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ - btrfs_init_map_token(&token, right); right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - push_space -= btrfs_token_item_size(&token, i); - btrfs_set_token_item_offset(&token, i, push_space); + push_space -= btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } left_nritems -= push_items; @@ -3323,7 +3331,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, int ret = 0; u32 this_item_size; u32 old_left_item_size; - struct btrfs_map_token token; if (empty) nr = min(right_nritems, max_slot); @@ -3371,13 +3378,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); - btrfs_init_map_token(&token, left); old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, + ioff = btrfs_item_offset(left, i); + btrfs_set_item_offset(left, i, ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -3398,13 +3404,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_header_nritems(right) - push_items); } - btrfs_init_map_token(&token, right); right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - push_space = push_space - btrfs_token_item_size(&token, i); - btrfs_set_token_item_offset(&token, i, push_space); + push_space = push_space - btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } btrfs_mark_buffer_dirty(trans, left); @@ -3518,7 +3523,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, int i; int ret; struct btrfs_disk_key disk_key; - struct btrfs_map_token token; nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -3531,12 +3535,11 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); - btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); + ioff = btrfs_item_offset(right, i); + btrfs_set_item_offset(right, i, ioff + rt_data_off); } 
btrfs_set_header_nritems(l, mid); @@ -4002,7 +4005,6 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; - struct btrfs_map_token token; leaf = path->nodes[0]; slot = path->slots[0]; @@ -4025,12 +4027,11 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + size_diff); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + size_diff); } /* shift the data */ @@ -4093,7 +4094,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; - struct btrfs_map_token token; leaf = path->nodes[0]; @@ -4119,12 +4119,11 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff - data_size); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff - data_size); } /* shift the data */ @@ -4164,7 +4163,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct extent_buffer *leaf; int slot; - struct btrfs_map_token token; u32 total_size; /* @@ -4192,7 +4190,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, BUG(); } - btrfs_init_map_token(&token, leaf); if (slot != nritems) { unsigned int old_data = btrfs_item_data_end(leaf, slot); @@ -4210,8 +4207,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff - batch->total_data_size); } /* shift the items */ @@ -4228,8 +4225,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); data_end -= batch->data_sizes[i]; - btrfs_set_token_item_offset(&token, slot + i, data_end); - btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); + btrfs_set_item_offset(leaf, slot + i, data_end); + btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]); } btrfs_set_header_nritems(leaf, nritems + batch->nr); @@ -4442,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used_bytes(root); - atomic_inc(&leaf->refs); + refcount_inc(&leaf->refs); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); if (ret < 0) @@ -4469,7 +4466,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); const int data_end = leaf_data_end(leaf); - struct btrfs_map_token token; u32 dsize = 0; int i; @@ -4479,12 +4475,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, memmove_leaf_data(leaf, data_end + dsize, data_end, last_off - data_end); - btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { u32 ioff; - ioff = 
btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + dsize); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + dsize); } memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); @@ -4527,7 +4522,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; - atomic_inc(&leaf->refs); + refcount_inc(&leaf->refs); /* * We want to be able to at least push one item to the * left neighbour leaf, and that's the first item. @@ -4585,16 +4580,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are have a minimum transaction id. + * for leaves that have a minimum transaction id. * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the * key and get a writable path. * - * This honors path->lowest_level to prevent descent past a given level - * of the tree. - * * min_trans indicates the oldest transaction that you are interested * in walking through. Any nodes or leaves older than min_trans are * skipped over (without reading them). @@ -4615,6 +4607,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int keep_locks = path->keep_locks; ASSERT(!path->nowait); + ASSERT(path->lowest_level == 0); path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); @@ -4636,8 +4629,8 @@ again: goto out; } - /* at the lowest level, we're done, setup the path and exit */ - if (level == path->lowest_level) { + /* At level 0 we're done, setup the path and exit. */ + if (level == 0) { if (slot >= nritems) goto find_next_key; ret = 0; @@ -4678,12 +4671,6 @@ find_next_key: goto out; } } - if (level == path->lowest_level) { - ret = 0; - /* Save our key for returning back. */ - btrfs_node_key_to_cpu(cur, min_key, slot); - goto out; - } cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); @@ -4699,7 +4686,7 @@ find_next_key: out: path->keep_locks = keep_locks; if (ret == 0) - btrfs_unlock_up_safe(path, path->lowest_level + 1); + btrfs_unlock_up_safe(path, 1); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 71fa42ca04fe..fe70b593c7cd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -224,16 +224,10 @@ struct btrfs_root { struct list_head root_list; - /* - * Xarray that keeps track of in-memory inodes, protected by the lock - * @inode_lock. - */ + /* Xarray that keeps track of in-memory inodes. */ struct xarray inodes; - /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by @inode_lock. - */ + /* Xarray that keeps track of delayed nodes of every inode. 
*/ struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid @@ -508,7 +502,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -int btrfs_bin_search(struct extent_buffer *eb, int first_slot, +int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); @@ -576,9 +570,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf); +bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, @@ -727,13 +721,18 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(const struct extent_buffer *leaf); -static inline int is_fstree(u64 rootid) +static inline bool btrfs_is_fstree(u64 rootid) { - if (rootid == BTRFS_FS_TREE_OBJECTID || - ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && - !btrfs_qgroup_level(rootid))) - return 1; - return 0; + if (rootid == BTRFS_FS_TREE_OBJECTID) + return true; + + if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID) + return false; + + if (btrfs_qgroup_level(rootid) != 0) + return false; + + return true; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 1831618579cb..738179a5e170 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -60,6 +60,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1, return 0; } +static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node); + const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node); + + return compare_inode_defrag(new_defrag, existing_defrag); +} + /* * Insert a record for an inode into the defrag tree. The lock must be held * already. @@ -71,37 +79,23 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; + struct rb_node *node; - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); + node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp); + if (node) { + struct inode_defrag *entry; - ret = compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* - * If we're reinserting an entry for an old defrag run, - * make sure to lower the transid of our existing - * record. 
- */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } + entry = rb_entry(node, struct inode_defrag, rb_node); + /* + * If we're reinserting an entry for an old defrag run, make + * sure to lower the transid of our existing record. + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh); + return -EEXIST; } set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); return 0; } @@ -854,8 +848,8 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 folio_start; - u64 folio_end; + u64 lock_start; + u64 lock_end; struct extent_state *cached_state = NULL; struct folio *folio; int ret; @@ -891,15 +885,15 @@ again: return ERR_PTR(ret); } - folio_start = folio_pos(folio); - folio_end = folio_pos(folio) + folio_size(folio) - 1; + lock_start = folio_pos(folio); + lock_end = folio_end(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; - btrfs_lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, folio_start, folio_size(folio)); - btrfs_unlock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio)); + btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); if (!ordered) break; @@ -953,7 +947,7 @@ struct defrag_target_range { * @extent_thresh: file extent size threshold, any extent size >= this value * will be ignored * @newer_than: only defrag extents newer than this value - * @do_compress: whether the defrag is doing compression + * @do_compress: whether the defrag is doing compression or no-compression * if true, @extent_thresh will be ignored and all regular * file extents meeting @newer_than will be targets. 
* @locked: if the range has already held extent lock @@ -1184,8 +1178,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, if (!folio) break; - if (start >= folio_pos(folio) + folio_size(folio) || - start + len <= folio_pos(folio)) + if (start >= folio_end(folio) || start + len <= folio_pos(folio)) continue; btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); @@ -1226,7 +1219,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, folios[i] = NULL; goto free_folios; } - cur = folio_pos(folios[i]) + folio_size(folios[i]); + cur = folio_end(folios[i]); } for (int i = 0; i < nr_pages; i++) { if (!folios[i]) @@ -1371,6 +1364,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); + bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; int compress_level = 0; int ret = 0; @@ -1401,6 +1395,9 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, if (range->compress_type) compress_type = range->compress_type; } + } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) { + compress_type = BTRFS_DEFRAG_DONT_COMPRESS; + compress_level = 1; } if (extent_thresh == 0) @@ -1451,13 +1448,14 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, btrfs_inode_unlock(inode, 0); break; } - if (do_compress) { + if (do_compress || no_compress) { inode->defrag_compress = compress_type; inode->defrag_compress_level = compress_level; } ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, &sectors_defragged, + newer_than, do_compress || no_compress, + &sectors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) @@ -1496,7 +1494,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); ret = sectors_defragged; } - if (do_compress) { + if (do_compress || no_compress) { btrfs_inode_lock(inode, 0); inode->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8c597fa60523..0f8d8e275143 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -334,6 +334,20 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, return item; } +static int delayed_item_index_cmp(const void *key, const struct rb_node *node) +{ + const u64 *index = key; + const struct btrfs_delayed_item *delayed_item = rb_entry(node, + struct btrfs_delayed_item, rb_node); + + if (delayed_item->index < *index) + return 1; + else if (delayed_item->index > *index) + return -1; + + return 0; +} + /* * Look up the delayed item by key.
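The defrag and delayed-inode hunks above replace open-coded red-black tree walks with the generic rb_find()/rb_find_add() helpers, keeping one key-based comparator as the source of truth and wrapping it for insertion. A minimal, hypothetical sketch of that pattern follows; the demo_item structure and the demo_* names are invented for illustration and are not part of the btrfs changes:

#include <linux/errno.h>
#include <linux/rbtree.h>
#include <linux/types.h>

struct demo_item {
	struct rb_node rb_node;
	u64 index;
};

/* Key-vs-node comparator, usable directly with rb_find(). */
static int demo_index_cmp(const void *key, const struct rb_node *node)
{
	const u64 *index = key;
	const struct demo_item *item = rb_entry(node, struct demo_item, rb_node);

	if (item->index < *index)
		return 1;
	if (item->index > *index)
		return -1;
	return 0;
}

/* Node-vs-node comparator for rb_find_add(), reusing the key comparator. */
static int demo_item_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct demo_item *new_item = rb_entry(new, struct demo_item, rb_node);

	return demo_index_cmp(&new_item->index, existing);
}

static struct demo_item *demo_lookup(struct rb_root *root, u64 index)
{
	struct rb_node *node = rb_find(&index, root, demo_index_cmp);

	return rb_entry_safe(node, struct demo_item, rb_node);
}

/* Returns 0 on insertion, -EEXIST if an item with the same index is present. */
static int demo_insert(struct rb_root *root, struct demo_item *item)
{
	struct rb_node *node = rb_find_add(&item->rb_node, root, demo_item_cmp);

	return node ? -EEXIST : 0;
}

The comparator convention matches rb_find(): a negative return walks left (the key sorts before the node), a positive return walks right, and zero is a match, which is why delayed_item_index_cmp() above returns 1 when the existing node's index is smaller than the key.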
* @@ -347,21 +361,10 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( struct rb_root *root, u64 index) { - struct rb_node *node = root->rb_node; - struct btrfs_delayed_item *delayed_item = NULL; - - while (node) { - delayed_item = rb_entry(node, struct btrfs_delayed_item, - rb_node); - if (delayed_item->index < index) - node = node->rb_right; - else if (delayed_item->index > index) - node = node->rb_left; - else - return delayed_item; - } + struct rb_node *node; - return NULL; + node = rb_find(&index, root, delayed_item_index_cmp); + return rb_entry_safe(node, struct btrfs_delayed_item, rb_node); } static int btrfs_delayed_item_cmp(const struct rb_node *new, @@ -369,14 +372,8 @@ static int btrfs_delayed_item_cmp(const struct rb_node *new, { const struct btrfs_delayed_item *new_item = rb_entry(new, struct btrfs_delayed_item, rb_node); - const struct btrfs_delayed_item *exist_item = - rb_entry(exist, struct btrfs_delayed_item, rb_node); - if (new_item->index < exist_item->index) - return -1; - if (new_item->index > exist_item->index) - return 1; - return 0; + return delayed_item_index_cmp(&new_item->index, exist); } static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, @@ -1008,8 +1005,16 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, ret = btrfs_lookup_inode(trans, root, path, &key, mod); if (ret > 0) ret = -ENOENT; - if (ret < 0) + if (ret < 0) { + /* + * If we fail to update the delayed inode we need to abort the + * transaction, because we could leave the inode with the + * improper counts behind. + */ + if (ret != -ENOENT) + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], @@ -1034,8 +1039,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto err_out; + } ASSERT(ret > 0); ASSERT(path->slots[0] > 0); ret = 0; @@ -1057,21 +1064,14 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * in the same item doesn't exist. */ ret = btrfs_del_item(trans, root, path); + if (ret < 0) + btrfs_abort_transaction(trans, ret); out: btrfs_release_delayed_iref(node); btrfs_release_path(path); err_out: btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); - - /* - * If we fail to update the delayed inode we need to abort the - * transaction, because we could leave the inode with the improper - * counts behind. 
- */ - if (ret && ret != -ENOENT) - btrfs_abort_transaction(trans, ret); - return ret; } @@ -1540,8 +1540,8 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, - u64 index) +static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, + u64 index) { struct btrfs_delayed_item *item; @@ -1549,7 +1549,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); if (!item) { mutex_unlock(&node->mutex); - return 1; + return false; } /* @@ -1584,7 +1584,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, } mutex_unlock(&node->mutex); - return 0; + return true; } int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, @@ -1598,9 +1598,10 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(node, index); - if (!ret) + if (btrfs_delete_delayed_insertion_item(node, index)) { + ret = 0; goto end; + } item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); if (!item) { @@ -1617,7 +1618,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, */ if (ret < 0) { btrfs_err(trans->fs_info, -"metadata reservation failed for delayed dir item deltiona, should have been reserved"); +"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d", + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_release_delayed_item(item); goto end; } @@ -1626,9 +1628,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, - "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, btrfs_root_id(node->root), - node->inode_id, ret); +"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d", + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_delayed_item_release_metadata(dir->root, item); btrfs_release_delayed_item(item); } @@ -1733,17 +1734,16 @@ void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, downgrade_write(&inode->vfs_inode.i_rwsem); } -int btrfs_should_delete_dir_index(const struct list_head *del_list, - u64 index) +bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index) { struct btrfs_delayed_item *curr; - int ret = 0; + bool ret = false; list_for_each_entry(curr, del_list, readdir_list) { if (curr->index > index) break; if (curr->index == index) { - ret = 1; + ret = true; break; } } @@ -1753,15 +1753,14 @@ int btrfs_should_delete_dir_index(const struct list_head *del_list, /* * Read dir info stored in the delayed tree. */ -int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - const struct list_head *ins_list) +bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + const struct list_head *ins_list) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; struct btrfs_key location; char *name; int name_len; - int over = 0; unsigned char d_type; /* @@ -1770,6 +1769,8 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, * directory, nobody can delete any directory indexes now. 
*/ list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + bool over; + list_del(&curr->readdir_list); if (curr->index < ctx->pos) { @@ -1787,17 +1788,16 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); - over = !dir_emit(ctx, name, name_len, - location.objectid, d_type); + over = !dir_emit(ctx, name, name_len, location.objectid, d_type); if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) - return 1; + return true; ctx->pos++; } - return 0; + return false; } static void fill_stack_inode_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index c4b4ba122beb..e6e763ad2d42 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -150,10 +150,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, struct list_head *ins_list, struct list_head *del_list); -int btrfs_should_delete_dir_index(const struct list_head *del_list, - u64 index); -int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - const struct list_head *ins_list); +bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index); +bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + const struct list_head *ins_list); /* Used during directory logging. */ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 739c9e29aaa3..ca382c5b186f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -928,7 +928,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(generic_ref->ref_root)) + if (btrfs_is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); @@ -958,8 +958,8 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, #endif generic_ref->tree_ref.level = level; generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && - (!mod_root || is_fstree(mod_root)))) + if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && + (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; @@ -976,8 +976,8 @@ void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, generic_ref->data_ref.objectid = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && - (!mod_root || is_fstree(mod_root)))) + if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && + (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 78cc23837610..552ec4fa645d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -420,7 +420,7 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); -static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) @@ -428,7 
+428,7 @@ static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) return node->tree_ref.level; } -static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) @@ -436,7 +436,7 @@ static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) return 0; } -static inline u8 btrfs_ref_type(struct btrfs_ref *ref) +static inline u8 btrfs_ref_type(const struct btrfs_ref *ref) { ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2decb9fff445..4675bcd5f92e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -250,7 +250,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); + fs_info->sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev_file); @@ -327,7 +327,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - fput(bdev_file); + bdev_fput(bdev_file); return ret; } @@ -600,7 +600,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(src_device); if (btrfs_pinned_by_swapfile(fs_info, src_device)) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "cannot replace device %s (devid %llu) due to active swapfile", btrfs_dev_name(src_device), src_device->devid); return -ETXTBSY; @@ -647,7 +647,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->srcdev = src_device; dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, @@ -943,7 +943,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, tgt_device); } else { if (scrub_ret != -ECANCELED) - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, @@ -961,7 +961,7 @@ error: return scrub_ret; } - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, @@ -1109,7 +1109,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) * btrfs_dev_replace_finishing() will handle the * cleanup part */ - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s canceled", btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); @@ -1143,7 +1143,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); WARN_ON(ret); - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "suspended dev_replace from %s (devid %llu) to %s canceled", btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); @@ -1247,7 +1247,7 @@ static int btrfs_dev_replace_kthread(void *data) progress = btrfs_dev_replace_progress(fs_info); progress = div_u64(progress, 10); - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "continuing dev_replace from %s (devid %llu) to target %s @%u%%", btrfs_dev_name(dev_replace->srcdev), dev_replace->srcdev->devid, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index b29cc31a7c4a..69863e398e22 100644 --- 
a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return di; } -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, const struct fscrypt_str *name) { int ret; @@ -242,7 +242,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (!path) return -ENOMEM; - key.objectid = dir; + key.objectid = dir_ino; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 8462579a95f4..e52174a8baf9 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -14,7 +14,7 @@ struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0d6ad7512f21..70fc4e7cc5a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -884,7 +884,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - if (is_fstree(objectid)) + if (btrfs_is_fstree(objectid)) generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); @@ -1104,7 +1104,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(btrfs_root_id(root))) { + btrfs_is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1113,7 +1113,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(btrfs_root_id(root)) && + if (btrfs_is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); @@ -1246,6 +1246,8 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + if (fs_info->fs_devices) + btrfs_close_devices(fs_info->fs_devices); percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); @@ -1315,7 +1317,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, * This is namely for free-space-tree and quota tree, which can change * at runtime and should only be grabbed from fs_info. 
*/ - if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) return ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, objectid); @@ -1947,7 +1949,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; - fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); @@ -2028,14 +2029,10 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; - /* - * Check if the checksum implementation is a fast accelerated one. - * As-is this is a bit of a hack and should be replaced once the csum - * implementations provide that information themselves. - */ + /* Check if the checksum implementation is a fast accelerated one. */ switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: - if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) + if (crc32_optimizations() & CRC32C_OPTIMIZATION) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; case BTRFS_CSUM_TYPE_XXHASH: @@ -3396,6 +3393,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); fs_info->nodesize = nodesize; + fs_info->nodesize_bits = ilog2(nodesize); fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; @@ -3561,6 +3559,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + btrfs_zoned_reserve_data_reloc_bg(fs_info); btrfs_free_zone_cache(fs_info); btrfs_check_active_zone_reservation(fs_info); @@ -3681,7 +3680,6 @@ fail_alloc: iput(fs_info->btree_inode); fail: - btrfs_close_devices(fs_info->fs_devices); ASSERT(ret < 0); return ret; } @@ -3694,7 +3692,7 @@ static void btrfs_end_super_write(struct bio *bio) bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { - btrfs_warn_rl_in_rcu(device->fs_info, + btrfs_warn_rl(device->fs_info, "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); @@ -3992,7 +3990,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) } if (min_tolerated == INT_MAX) { - pr_warn("BTRFS: unknown raid flag: %llu", flags); + btrfs_warn(NULL, "unknown raid flag: %llu", flags); min_tolerated = 0; } @@ -4428,7 +4426,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); btrfs_mapping_tree_free(fs_info); - btrfs_close_devices(fs_info->fs_devices); } void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, @@ -4640,7 +4637,7 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, mark, NULL)) { - btrfs_clear_extent_bits(dirty_pages, start, end, mark); + btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index b1b96eb5f64e..66361325f6dc 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -43,7 +43,8 @@ static inline void btrfs_extent_state_leak_debug_check(void) while (!list_empty(&states)) { state 
= list_first_entry(&states, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + btrfs_err(NULL, + "state leak: start %llu end %llu state %u in tree %d refs %d", state->start, state->end, state->state, extent_state_in_tree(state), refcount_read(&state->refs)); @@ -1882,12 +1883,11 @@ int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 e bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached) { - int err; + int ret; u64 failed_start; - err = set_extent_bit(tree, start, end, bits, &failed_start, NULL, - cached, NULL); - if (err == -EEXIST) { + ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); + if (ret == -EEXIST) { if (failed_start > start) btrfs_clear_extent_bit(tree, start, failed_start - 1, bits, cached); @@ -1904,21 +1904,21 @@ int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 struct extent_state **cached_state) { struct extent_state *failed_state = NULL; - int err; + int ret; u64 failed_start; - err = set_extent_bit(tree, start, end, bits, &failed_start, + ret = set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); - while (err == -EEXIST) { + while (ret == -EEXIST) { if (failed_start != start) btrfs_clear_extent_bit(tree, start, failed_start - 1, bits, cached_state); wait_extent_bit(tree, failed_start, end, bits, &failed_state); - err = set_extent_bit(tree, start, end, bits, &failed_start, + ret = set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } - return err; + return ret; } /* diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 0a18ca9c59c3..36facca37973 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -19,7 +19,8 @@ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_LOCKED), ENUM_BIT(EXTENT_DIO_LOCKED), - ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DIRTY_LOG1), + ENUM_BIT(EXTENT_DIRTY_LOG2), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), ENUM_BIT(EXTENT_BOUNDARY), @@ -191,12 +192,6 @@ static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u6 cached, NULL); } -static inline int btrfs_clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return btrfs_clear_extent_bit(tree, start, end, bits, NULL); -} - int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cb6128778a83..97d517cdf2df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -46,7 +46,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -56,12 +56,12 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int 
find_next_key(struct btrfs_path *path, int level, +static int find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key); -static int block_group_bits(struct btrfs_block_group *cache, u64 bits) +static int block_group_bits(const struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } @@ -329,7 +329,7 @@ search_again: * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, + const struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -401,16 +401,16 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) return ((u64)high_crc << 31) ^ (u64)low_crc; } -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) +static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref) { return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), btrfs_extent_data_ref_objectid(leaf, ref), btrfs_extent_data_ref_offset(leaf, ref)); } -static bool match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, +static bool match_extent_data_ref(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref, u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || @@ -497,7 +497,7 @@ fail: static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); @@ -617,13 +617,13 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) +static noinline u32 extent_data_ref_count(const struct btrfs_path *path, + const struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; + const struct btrfs_extent_data_ref *ref1; + const struct btrfs_shared_data_ref *ref2; u32 num_refs = 0; int type; @@ -638,10 +638,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); ASSERT(type != BTRFS_REF_TYPE_INVALID); if (type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + ref2 = (const struct btrfs_shared_data_ref *)(iref + 1); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { @@ -684,7 +684,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); @@ -722,7 +722,7 @@ static inline int extent_ref_type(u64 parent, u64 owner) return type; } -static int find_next_key(struct btrfs_path *path, int level, +static int 
find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key) { @@ -1480,7 +1480,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { BTRFS_PATH_AUTO_FREE(path); @@ -1522,19 +1522,21 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) + if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = insert_tree_block_ref(trans, path, node, bytenr); - else + if (ret) + btrfs_abort_transaction(trans, ret); + } else { ret = insert_extent_data_ref(trans, path, node, bytenr); - - if (ret) - btrfs_abort_transaction(trans, ret); + if (ret) + btrfs_abort_transaction(trans, ret); + } return ret; } static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_head *href) + const struct btrfs_delayed_ref_head *href) { u64 root = href->owning_root; @@ -1543,7 +1545,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, * where it has already been unset. */ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || - !href->is_data || !is_fstree(root)) + !href->is_data || !btrfs_is_fstree(root)) return; btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, @@ -1552,7 +1554,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -1620,7 +1622,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head, + const struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1707,7 +1709,7 @@ again: static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -1754,7 +1756,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -2998,7 +3000,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } - ret = add_to_free_space_tree(trans, bytenr, num_bytes); + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3079,7 +3081,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -3649,6 +3651,21 @@ 
btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + /* * Helper function for find_free_extent(). * @@ -3670,7 +3687,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (!cluster_bg) goto refill_cluster; if (cluster_bg != bg && (cluster_bg->ro || - !block_group_bits(cluster_bg, ffe_ctl->flags))) + !block_group_bits(cluster_bg, ffe_ctl->flags) || + !find_free_extent_check_size_class(ffe_ctl, cluster_bg))) goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, @@ -4227,21 +4245,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } -static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group *bg) -{ - if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) - return true; - if (!btrfs_block_group_should_use_size_class(bg)) - return true; - if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) - return true; - if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && - bg->size_class == BTRFS_BG_SZ_NONE) - return true; - return ffe_ctl->size_class == bg->size_class; -} - static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4782,7 +4785,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes); if (ret) return ret; @@ -4873,7 +4876,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -4961,7 +4964,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); - if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) + if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root)) generic_ref.owning_root = root->relocation_src_root; btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); @@ -4983,7 +4986,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; - struct btrfs_squota_delta delta = { + const struct btrfs_squota_delta delta = { .root = root_objectid, .num_bytes = ins->offset, .generation = trans->transid, @@ -5111,11 +5114,11 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (buf->log_index == 0) btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY_LOG1, NULL); else btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, - EXTENT_NEW, NULL); + 
EXTENT_DIRTY_LOG2, NULL); } else { buf->log_index = -1; btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, @@ -5552,7 +5555,7 @@ again: goto again; } - exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent); mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); @@ -5872,15 +5875,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ret = btrfs_dec_ref(trans, root, eb, 1); - else + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } else { ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { - btrfs_abort_transaction(trans, ret); - return ret; + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } } - if (is_fstree(btrfs_root_id(root))) { + if (btrfs_is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -6341,7 +6349,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); - atomic_inc(&parent->refs); + refcount_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); @@ -6442,7 +6450,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { DEBUG_WARN(); - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, btrfs_dev_name(device), diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 72914074c304..82d3a82dc712 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -97,7 +97,7 @@ enum btrfs_inline_ref_type { }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, + const struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1dc931c4937f..835b0deef9bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -75,9 +75,9 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); - pr_err( - "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_err(fs_info, + "buffer leak start %llu len %u refs %d bflags %lu owner %llu", + eb->start, eb->len, refcount_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); WARN_ON_ONCE(1); @@ -110,6 +110,7 @@ struct btrfs_bio_ctrl { * This is to avoid touching ranges covered by compression/inline. 
*/ unsigned long submit_bitmap; + struct readahead_control *ractl; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -266,8 +267,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, goto out; } range_start = max_t(u64, folio_pos(folio), start); - range_len = min_t(u64, folio_pos(folio) + folio_size(folio), - end + 1) - range_start; + range_len = min_t(u64, folio_end(folio), end + 1) - range_start; btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; @@ -321,7 +321,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, ASSERT(orig_end > orig_start); /* The range should at least cover part of the folio */ - ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + ASSERT(!(orig_start >= folio_end(locked_folio) || orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ @@ -419,7 +419,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + folio_size(folio)); + start + len <= folio_end(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); @@ -782,7 +782,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, static int attach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio, - struct btrfs_subpage *prealloc) + struct btrfs_folio_state *prealloc) { struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -806,7 +806,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, /* Already mapped, just free prealloc */ if (folio_test_private(folio)) { - btrfs_free_subpage(prealloc); + btrfs_free_folio_state(prealloc); return 0; } @@ -815,7 +815,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); + ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); return ret; } @@ -831,7 +831,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); if (btrfs_is_subpage(fs_info, folio)) - return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); + return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; @@ -848,7 +848,7 @@ void clear_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); if (btrfs_is_subpage(fs_info, folio)) - return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); + return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -882,6 +882,25 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, return em; } + +static void btrfs_readahead_expand(struct readahead_control *ractl, + const struct extent_map *em) +{ + const u64 ra_pos = readahead_pos(ractl); + const u64 ra_end = ra_pos + readahead_length(ractl); + const u64 em_end = em->start + em->ram_bytes; + + /* No expansion for holes and inline extents. 
*/ + if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) + return; + + ASSERT(em_end >= ra_pos, + "extent_map %llu %llu ends before current readahead position %llu", + em->start, em->len, ra_pos); + if (em_end > ra_end) + readahead_expand(ractl, ra_pos, em_end - ra_pos); +} + /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io @@ -945,6 +964,16 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, compress_type = btrfs_extent_map_compression(em); + /* + * Only expand readahead for extents which are already creating + * the pages anyway in add_ra_bio_pages, which is compressed + * extents in the non subpage case. + */ + if (bio_ctrl->ractl && + !btrfs_is_subpage(fs_info, folio) && + compress_type != BTRFS_COMPRESS_NONE) + btrfs_readahead_expand(bio_ctrl->ractl, em); + if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else @@ -1086,7 +1115,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * finished our folio read and unlocked the folio. */ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_pos(folio) + folio_size(folio), + u64 range_len = min(folio_end(folio), ordered->file_offset + ordered->num_bytes) - cur; ret = true; @@ -1108,7 +1137,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * So we return true and update @next_ret to the OE/folio boundary. */ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_pos(folio) + folio_size(folio), + u64 range_len = min(folio_end(folio), ordered->file_offset + ordered->num_bytes) - cur; /* @@ -1663,7 +1692,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl int ret; size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); - unsigned long end_index = i_size >> PAGE_SHIFT; + const pgoff_t end_index = i_size >> PAGE_SHIFT; const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1704,7 +1733,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs", - inode->root->root_key.objectid, + btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio)); ret = -EUCLEAN; goto done; @@ -1774,7 +1803,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e */ spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsigned long flags; set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); @@ -1874,7 +1903,7 @@ static void set_btree_ioerr(struct extent_buffer *eb) static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) { struct btrfs_fs_info *fs_info = eb->fs_info; - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsigned long flags; xas_lock_irqsave(&xas, flags); @@ -1886,7 +1915,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) { struct btrfs_fs_info *fs_info = 
eb->fs_info; - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsigned long flags; xas_lock_irqsave(&xas, flags); @@ -1961,7 +1990,7 @@ retry: if (!eb) return NULL; - if (!atomic_inc_not_zero(&eb->refs)) { + if (!refcount_inc_not_zero(&eb->refs)) { xas_reset(xas); goto retry; } @@ -1986,7 +2015,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, rcu_read_lock(); while ((eb = find_get_eb(&xas, end, tag)) != NULL) { if (!eb_batch_add(batch, eb)) { - *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits); + *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); goto out; } } @@ -2008,11 +2037,11 @@ static struct extent_buffer *find_extent_buffer_nolock( struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; - unsigned long index = (start >> fs_info->sectorsize_bits); + unsigned long index = (start >> fs_info->nodesize_bits); rcu_read_lock(); eb = xa_load(&fs_info->buffer_tree, index); - if (eb && !atomic_inc_not_zero(&eb->refs)) + if (eb && !refcount_inc_not_zero(&eb->refs)) eb = NULL; rcu_read_unlock(); return eb; @@ -2031,10 +2060,7 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio) } buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); - + clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); bio_put(&bbio->bio); } @@ -2085,7 +2111,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + u32 range_len = min_t(u64, folio_end(folio), eb->start + eb->len) - range_start; folio_lock(folio); @@ -2114,8 +2140,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { struct eb_batch batch; - unsigned long start_index = (start >> fs_info->sectorsize_bits); - unsigned long end_index = (end >> fs_info->sectorsize_bits); + unsigned long start_index = (start >> fs_info->nodesize_bits); + unsigned long end_index = (end >> fs_info->nodesize_bits); eb_batch_init(&batch); while (start_index <= end_index) { @@ -2151,7 +2177,7 @@ int btree_write_cache_pages(struct address_space *mapping, eb_batch_init(&batch); if (wbc->range_cyclic) { - index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits); + index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); end = -1; /* @@ -2160,8 +2186,8 @@ int btree_write_cache_pages(struct address_space *mapping, */ scanned = (index == 0); } else { - index = (wbc->range_start >> fs_info->sectorsize_bits); - end = (wbc->range_end >> fs_info->sectorsize_bits); + index = (wbc->range_start >> fs_info->nodesize_bits); + end = (wbc->range_end >> fs_info->nodesize_bits); scanned = 1; } @@ -2489,7 +2515,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f continue; } - cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_end = min_t(u64, folio_end(folio) - 1, end); cur_len = cur_end + 1 - cur; ASSERT(folio_test_locked(folio)); @@ -2541,7 +2567,10 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { - struct btrfs_bio_ctrl bio_ctrl = { .opf 
= REQ_OP_READ | REQ_RAHEAD }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ | REQ_RAHEAD, + .ractl = rac + }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); const u64 start = readahead_pos(rac); @@ -2731,13 +2760,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) static bool folio_range_has_eb(struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; lockdep_assert_held(&folio->mapping->i_private_lock); if (folio_test_private(folio)) { - subpage = folio_get_private(folio); - if (atomic_read(&subpage->eb_refs)) + bfs = folio_get_private(folio); + if (atomic_read(&bfs->eb_refs)) return true; } return false; @@ -2787,7 +2816,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); + btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2798,7 +2827,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. */ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); + btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); spin_unlock(&mapping->i_private_lock); } @@ -2842,7 +2871,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info btrfs_leak_debug_add_eb(eb); spin_lock_init(&eb->refs_lock); - atomic_set(&eb->refs, 1); + refcount_set(&eb->refs, 1); ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); @@ -2975,13 +3004,13 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * once io is initiated, TREE_REF can no longer be cleared, so that is * the moment at which any such race is best fixed. */ - refs = atomic_read(&eb->refs); + refs = refcount_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) return; spin_lock(&eb->refs_lock); if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); spin_unlock(&eb->refs_lock); } @@ -3038,7 +3067,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb->fs_info = fs_info; again: xa_lock_irq(&fs_info->buffer_tree); - exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits, NULL, eb, GFP_NOFS); if (xa_is_err(exists)) { ret = xa_err(exists); @@ -3047,7 +3076,7 @@ again: return ERR_PTR(ret); } if (exists) { - if (!atomic_inc_not_zero(&exists->refs)) { + if (!refcount_inc_not_zero(&exists->refs)) { /* The extent buffer is being freed, retry. */ xa_unlock_irq(&fs_info->buffer_tree); goto again; @@ -3092,7 +3121,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, * just overwrite folio private. */ exists = folio_get_private(folio); - if (atomic_inc_not_zero(&exists->refs)) + if (refcount_inc_not_zero(&exists->refs)) return exists; WARN_ON(folio_test_dirty(folio)); @@ -3141,13 +3170,13 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. 
*/ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, - struct btrfs_subpage *prealloc, + struct btrfs_folio_state *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; - const unsigned long index = eb->start >> PAGE_SHIFT; + const pgoff_t index = eb->start >> PAGE_SHIFT; struct folio *existing_folio; int ret; @@ -3224,7 +3253,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct btrfs_subpage *prealloc = NULL; + struct btrfs_folio_state *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; int uptodate = 1; @@ -3269,7 +3298,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * manually if we exit earlier. */ if (btrfs_meta_is_subpage(fs_info)) { - prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); + prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3280,7 +3309,7 @@ reallocate: /* Allocate all pages first. */ ret = alloc_eb_folio_array(eb, true); if (ret < 0) { - btrfs_free_subpage(prealloc); + btrfs_free_folio_state(prealloc); goto out; } @@ -3354,7 +3383,7 @@ reallocate: again: xa_lock_irq(&fs_info->buffer_tree); existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, - start >> fs_info->sectorsize_bits, NULL, eb, + start >> fs_info->nodesize_bits, NULL, eb, GFP_NOFS); if (xa_is_err(existing_eb)) { ret = xa_err(existing_eb); @@ -3362,7 +3391,7 @@ again: goto out; } if (existing_eb) { - if (!atomic_inc_not_zero(&existing_eb->refs)) { + if (!refcount_inc_not_zero(&existing_eb->refs)) { xa_unlock_irq(&fs_info->buffer_tree); goto again; } @@ -3391,7 +3420,7 @@ again: return eb; out: - WARN_ON(!atomic_dec_and_test(&eb->refs)); + WARN_ON(!refcount_dec_and_test(&eb->refs)); /* * Any attached folios need to be detached before we unlock them. This @@ -3437,8 +3466,7 @@ static int release_extent_buffer(struct extent_buffer *eb) { lockdep_assert_held(&eb->refs_lock); - WARN_ON(atomic_read(&eb->refs) == 0); - if (atomic_dec_and_test(&eb->refs)) { + if (refcount_dec_and_test(&eb->refs)) { struct btrfs_fs_info *fs_info = eb->fs_info; spin_unlock(&eb->refs_lock); @@ -3458,7 +3486,7 @@ static int release_extent_buffer(struct extent_buffer *eb) * in this case. */ xa_cmpxchg_irq(&fs_info->buffer_tree, - eb->start >> fs_info->sectorsize_bits, eb, NULL, + eb->start >> fs_info->nodesize_bits, eb, NULL, GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); @@ -3484,22 +3512,26 @@ void free_extent_buffer(struct extent_buffer *eb) if (!eb) return; - refs = atomic_read(&eb->refs); + refs = refcount_read(&eb->refs); while (1) { - if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) - || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && - refs == 1)) + if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) { + if (refs == 1) + break; + } else if (refs <= 3) { break; - if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) + } + + /* Optimization to avoid locking eb->refs_lock. 
*/ + if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1)) return; } spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) == 2 && + if (refcount_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); + refcount_dec(&eb->refs); /* * I know this is terrible, but it's temporary until we stop tracking @@ -3516,9 +3548,9 @@ void free_extent_buffer_stale(struct extent_buffer *eb) spin_lock(&eb->refs_lock); set_bit(EXTENT_BUFFER_STALE, &eb->bflags); - if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); + refcount_dec(&eb->refs); release_extent_buffer(eb); } @@ -3576,7 +3608,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } - WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(refcount_read(&eb->refs) == 0); } void set_extent_buffer_dirty(struct extent_buffer *eb) @@ -3587,7 +3619,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(refcount_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); @@ -3646,9 +3678,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) static void clear_extent_buffer_reading(struct extent_buffer *eb) { - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); } static void end_bbio_meta_read(struct btrfs_bio *bbio) @@ -3713,7 +3743,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, eb->read_mirror = 0; check_buffer_tree_ref(eb); - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, @@ -3725,7 +3755,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + u32 range_len = min_t(u64, folio_end(folio), eb->start + eb->len) - range_start; bio_add_folio_nofail(&bbio->bio, folio, range_len, @@ -4104,8 +4134,8 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * @start: offset of the bitmap item in the extent buffer * @nr: bit number to test */ -int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, - unsigned long nr) +bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long nr) { unsigned long i; size_t offset; @@ -4296,9 +4326,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct extent_buffer *eb; - unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); + unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); unsigned long index = start; - unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; + unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; int ret; 
xa_lock_irq(&fs_info->buffer_tree); @@ -4308,7 +4338,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * won't disappear out from under us. */ spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); continue; } @@ -4374,7 +4404,7 @@ int try_release_extent_buffer(struct folio *folio) * this page. */ spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); spin_unlock(&folio->mapping->i_private_lock); return 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e36e8d6a00bc..61130786b9a3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -98,7 +98,7 @@ struct extent_buffer { void *addr; spinlock_t refs_lock; - atomic_t refs; + refcount_t refs; int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; @@ -345,8 +345,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long len); void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, unsigned long len); -int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, - unsigned long pos); +bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long pos); void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); void extent_buffer_bitmap_clear(const struct extent_buffer *eb, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 02bfdb976e40..57f52585a6dd 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -84,7 +84,7 @@ static void remove_em(struct btrfs_inode *inode, struct extent_map *em) rb_erase(&em->rb_node, &inode->extent_tree.root); RB_CLEAR_NODE(&em->rb_node); - if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) + if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -502,7 +502,7 @@ static int add_extent_mapping(struct btrfs_inode *inode, setup_extent_mapping(inode, em, modified); - if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) + if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root))) percpu_counter_inc(&fs_info->evictable_extent_maps); return 0; @@ -1337,7 +1337,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) if (!root) continue; - if (is_fstree(btrfs_root_id(root))) + if (btrfs_is_fstree(btrfs_root_id(root))) nr_dropped += btrfs_scan_root(root, &ctx); btrfs_put_root(root); diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 43bf0979fd53..7935586a9dbd 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p * the cost of allocating a new one. 
*/ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); - atomic_inc(&clone->refs); + refcount_inc(&clone->refs); ret = btrfs_next_leaf(inode->root, path); if (ret != 0) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54d523d4f421..c09fbc257634 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -427,7 +427,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) memset(csum_dst, 0, csum_size); count = 1; - if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(inode->root)) { u64 file_offset = bbio->file_offset + bio_offset; btrfs_set_extent_bit(&inode->io_tree, file_offset, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8ce6f45f45e0..204674934795 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -89,8 +89,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); - ASSERT(folio_pos(folio) <= pos && - folio_pos(folio) + folio_size(folio) >= pos + write_bytes); + ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -801,7 +800,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); - u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_end(folio)); const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; @@ -857,7 +856,7 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ loff_t pos, size_t write_bytes, bool nowait) { - unsigned long index = pos >> PAGE_SHIFT; + const pgoff_t index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | fgf_set_order(write_bytes); @@ -963,6 +962,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * @pos: File offset. * @write_bytes: The length to write, will be updated to the nocow writeable * range. + * @nowait: Indicate if we can block or not (non-blocking IO context). * * This function will flush ordered extents in the range to ensure proper * nocow checks. @@ -971,7 +971,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. * -EAGAIN If we can't do a nocow write because snapshoting of the inode's - * root is in progress. + * root is in progress or because we are in a non-blocking IO + * context and need to block (@nowait is true). * < 0 If an error happened. * * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. 
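[Editorial aside, not part of the patch: the return contract documented above implies a specific caller pattern. Below is a minimal sketch of that pattern, assuming only the signatures visible in this diff; the function name nocow_write_sketch and the elided write step are illustrative, not code from the kernel tree.]

	/*
	 * Illustrative only: how a caller is expected to consume
	 * btrfs_check_nocow_lock() according to the comment above.
	 */
	static ssize_t nocow_write_sketch(struct btrfs_inode *inode, loff_t pos,
					  size_t len, bool nowait)
	{
		size_t write_bytes = len;
		int ret;

		ret = btrfs_check_nocow_lock(inode, pos, &write_bytes, nowait);
		if (ret <= 0)
			return ret;	/* 0: fall back to COW; -EAGAIN or < 0: error */

		/*
		 * ret > 0: @write_bytes was clamped to the NOCOW-able range,
		 * which may be shorter than @len.  The actual NOCOW write of
		 * @write_bytes bytes at @pos would go here.
		 */

		/* Per the NOTE above, a return > 0 requires the matching unlock. */
		btrfs_check_nocow_unlock(inode);

		return write_bytes;
	}

[The hunk that follows replaces the single can_nocow_extent() check with a loop over the locked range, so the length reported back through @write_bytes can now cover several contiguous NOCOW extents rather than only the first one.]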
@@ -983,8 +984,8 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL; u64 lockstart, lockend; - u64 num_bytes; - int ret; + u64 cur_offset; + int ret = 0; if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; @@ -995,7 +996,6 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - num_bytes = lockend - lockstart + 1; if (nowait) { if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, @@ -1007,14 +1007,36 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); - if (ret <= 0) - btrfs_drew_write_unlock(&root->snapshot_lock); - else - *write_bytes = min_t(size_t, *write_bytes , - num_bytes - pos + lockstart); + + cur_offset = lockstart; + while (cur_offset < lockend) { + u64 num_bytes = lockend - cur_offset + 1; + + ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait); + if (ret <= 0) { + /* + * If cur_offset == lockstart it means we haven't found + * any extent against which we can NOCOW, so unlock the + * snapshot lock. + */ + if (cur_offset == lockstart) + btrfs_drew_write_unlock(&root->snapshot_lock); + break; + } + cur_offset += num_bytes; + } + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + /* + * cur_offset > lockstart means there's at least a partial range we can + * NOCOW, and that range can cover one or more extents. + */ + if (cur_offset > lockstart) { + *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos); + return 1; + } + return ret; } @@ -1233,8 +1255,8 @@ again: * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary. 
*/ - if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) { - const u64 last_block = folio_pos(folio) + folio_size(folio); + if (reserved_start + reserved_len > folio_end(folio)) { + const u64 last_block = folio_end(folio); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, @@ -1832,9 +1854,9 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct folio *folio = page_folio(page); - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; @@ -1842,6 +1864,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) loff_t size; size_t fsize = folio_size(folio); int ret; + bool only_release_metadata = false; u64 reserved_space; u64 page_start; u64 page_end; @@ -1849,7 +1872,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) reserved_space = fsize; - sb_start_pagefault(inode->i_sb); + sb_start_pagefault(inode->vfs_inode.i_sb); page_start = folio_pos(folio); page_end = page_start + folio_size(folio) - 1; end = page_end; @@ -1862,20 +1885,43 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (ret < 0) + ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, + reserved_space, false); + if (ret < 0) { + size_t write_bytes = reserved_space; + + if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) + goto out_noreserve; + + only_release_metadata = true; + + /* + * Can't write the whole range, there may be shared extents or + * holes in the range, bail out with @only_release_metadata set + * to true so that we unlock the nocow lock before returning the + * error. + */ + if (write_bytes < reserved_space) + goto out_noreserve; + } + ret = btrfs_delalloc_reserve_metadata(inode, reserved_space, + reserved_space, false); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + page_start, reserved_space); goto out_noreserve; + } ret = file_update_time(vmf->vma->vm_file); if (ret < 0) goto out; again: - down_read(&BTRFS_I(inode)->i_mmap_lock); + down_read(&inode->i_mmap_lock); folio_lock(folio); - size = i_size_read(inode); + size = i_size_read(&inode->vfs_inode); - if ((folio->mapping != inode->i_mapping) || + if ((folio->mapping != inode->vfs_inode.i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; @@ -1893,11 +1939,11 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. 
*/ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); + ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); if (ordered) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -1906,10 +1952,14 @@ again: if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < fsize) { + const u64 to_free = fsize - reserved_space; + end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, end + 1, - fsize - reserved_space, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, to_free, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + end + 1, to_free, true); } } @@ -1920,12 +1970,11 @@ again: * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ - btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + btrfs_clear_extent_bit(io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); if (ret < 0) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; @@ -1944,26 +1993,38 @@ again: btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + btrfs_set_inode_last_sub_trans(inode); + + if (only_release_metadata) + btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE, + &cached_state); btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); - sb_end_pagefault(inode->i_sb); + btrfs_delalloc_release_extents(inode, fsize); + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); + sb_end_pagefault(inode->vfs_inode.i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, true); + btrfs_delalloc_release_extents(inode, fsize); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, reserved_space, true); + else + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space, true); extent_changeset_free(data_reserved); out_noreserve: - sb_end_pagefault(inode->i_sb); + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); + + sb_end_pagefault(inode->vfs_inode.i_sb); if (ret < 0) return vmf_error(ret); @@ -1978,15 +2039,16 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .page_mkwrite = btrfs_page_mkwrite, }; -static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; 
file_accessed(filp); - vma->vm_ops = &btrfs_file_vm_ops; + desc->vm_ops = &btrfs_file_vm_ops; return 0; } @@ -2195,7 +2257,7 @@ static bool check_range_has_page(struct inode *inode, u64 start, u64 end) if (folio->index < start_index) continue; /* A large folio extends beyond the end. Not a target. */ - if (folio->index + folio_nr_pages(folio) > end_index) + if (folio_next_index(folio) > end_index) continue; /* A folio doesn't cover the head/tail index. Found a target. */ ret = true; @@ -2341,7 +2403,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv rsv; unsigned int rsv_count; u64 cur_offset; u64 len = end - start; @@ -2350,13 +2412,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, if (end <= start) return -EINVAL; - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - ret = -ENOMEM; - goto out; - } - rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1); + rsv.failfast = true; /* * 1 - update the inode @@ -2373,14 +2431,14 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_free; + goto out_release; } - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); if (WARN_ON(ret)) goto out_trans; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; cur_offset = start; drop_args.path = path; @@ -2496,10 +2554,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); + &rsv, min_size, false); if (WARN_ON(ret)) break; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; cur_offset = drop_args.drop_end; len = end - cur_offset; @@ -2576,16 +2634,15 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, out_trans: if (!trans) - goto out_free; + goto out_release; trans->block_rsv = &fs_info->trans_block_rsv; if (ret) btrfs_end_transaction(trans); else *trans_out = trans; -out_free: - btrfs_free_block_rsv(fs_info, rsv); -out: +out_release: + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); return ret; } @@ -3765,7 +3822,7 @@ const struct file_operations btrfs_file_operations = { .splice_read = filemap_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, - .mmap = btrfs_file_mmap, + .mmap_prepare = btrfs_file_mmap_prepare, .open = btrfs_file_open, .release = btrfs_release_file, .get_unmapped_area = thp_get_unmapped_area, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4b34ea1f01c2..5d8d1570a5c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -366,7 +366,7 @@ fail: static void readahead_cache(struct inode *inode) { struct file_ra_state ra; - unsigned long last_index; + pgoff_t last_index; file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; @@ -3192,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - int err; + int ret2; u64 search_start = 
cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; @@ -3200,8 +3200,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); - if (err) { + ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true); + if (ret2) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); return 0; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a3e2a2a81461..eba7f22ae49c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -35,7 +35,7 @@ static struct btrfs_root *btrfs_free_space_root( return btrfs_global_root(block_group->fs_info, &key); } -void set_free_space_tree_thresholds(struct btrfs_block_group *cache) +void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; size_t bitmap_size; @@ -82,22 +82,19 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; info = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - - ret = 0; -out: btrfs_release_path(path); - return ret; + return 0; } EXPORT_FOR_TESTS -struct btrfs_free_space_info *search_free_space_info( +struct btrfs_free_space_info *btrfs_search_free_space_info( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, int cow) @@ -201,9 +198,9 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) } EXPORT_FOR_TESTS -int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path) +int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); @@ -281,7 +278,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, block_group, path, 1); + info = btrfs_search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); btrfs_abort_transaction(trans, ret); @@ -290,6 +287,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + block_group->using_free_space_bitmaps = true; + block_group->using_free_space_bitmaps_cached = true; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); @@ -343,9 +342,9 @@ out: } EXPORT_FOR_TESTS -int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path) +int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); @@ -409,12 +408,12 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, data_size = free_space_bitmap_size(fs_info, found_key.offset); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); 
+ path->slots[0]--; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); read_extent_buffer(leaf, bitmap_cursor, ptr, data_size); nr++; - path->slots[0]--; } else { ASSERT(0); } @@ -428,7 +427,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, block_group, path, 1); + info = btrfs_search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); btrfs_abort_transaction(trans, ret); @@ -437,20 +436,22 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + block_group->using_free_space_bitmaps = false; + block_group->using_free_space_bitmaps_cached = true; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); - nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; + nrbits = block_group->length >> fs_info->sectorsize_bits; start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < nrbits) { end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit); ASSERT(start_bit < end_bit); - key.objectid = start + start_bit * block_group->fs_info->sectorsize; + key.objectid = start + start_bit * fs_info->sectorsize; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; - key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; + key.offset = (end_bit - start_bit) * fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) { @@ -493,11 +494,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (new_extents == 0) return 0; - info = search_free_space_info(trans, block_group, path, 1); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + info = btrfs_search_free_space_info(trans, block_group, path, 1); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); extent_count = btrfs_free_space_extent_count(path->nodes[0], info); @@ -507,19 +507,18 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count > block_group->bitmap_high_thresh) { - ret = convert_free_space_to_bitmaps(trans, block_group, path); + ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path); } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count < block_group->bitmap_low_thresh) { - ret = convert_free_space_to_extents(trans, block_group, path); + ret = btrfs_convert_free_space_to_extents(trans, block_group, path); } -out: return ret; } EXPORT_FOR_TESTS -int free_space_test_bit(struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 offset) +bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset) { struct extent_buffer *leaf; struct btrfs_key key; @@ -537,13 +536,13 @@ int free_space_test_bit(struct btrfs_block_group *block_group, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); i = div_u64(offset - found_start, block_group->fs_info->sectorsize); - return !!extent_buffer_test_bit(leaf, ptr, i); + return extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 *start, u64 *size, - int bit) +static void free_space_modify_bits(struct btrfs_trans_handle *trans, + struct btrfs_block_group 
*block_group, + struct btrfs_path *path, u64 *start, u64 *size, + bool set_bits) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_buffer *leaf; @@ -567,7 +566,7 @@ static void free_space_set_bits(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); first = (*start - found_start) >> fs_info->sectorsize_bits; last = (end - found_start) >> fs_info->sectorsize_bits; - if (bit) + if (set_bits) extent_buffer_bitmap_set(leaf, ptr, first, last - first); else extent_buffer_bitmap_clear(leaf, ptr, first, last - first); @@ -611,13 +610,14 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, - u64 start, u64 size, int remove) + u64 start, u64 size, bool remove) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; - int prev_bit, next_bit; + bool prev_bit_set = false; + bool next_bit_set = false; int new_extents; int ret; @@ -634,16 +634,16 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) - goto out; + return ret; - prev_bit = free_space_test_bit(block_group, path, prev_block); + prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block); /* The previous block may have been in the previous bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (start >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } } else { key.objectid = start; @@ -652,9 +652,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) - goto out; - - prev_bit = -1; + return ret; } /* @@ -664,13 +662,13 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, cur_start = start; cur_size = size; while (1) { - free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, - !remove); + free_space_modify_bits(trans, block_group, path, &cur_start, + &cur_size, !remove); if (cur_size == 0) break; ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } /* @@ -683,42 +681,36 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, if (end >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } - next_bit = free_space_test_bit(block_group, path, end); - } else { - next_bit = -1; + next_bit_set = btrfs_free_space_test_bit(block_group, path, end); } if (remove) { new_extents = -1; - if (prev_bit == 1) { + if (prev_bit_set) { /* Leftover on the left. */ new_extents++; } - if (next_bit == 1) { + if (next_bit_set) { /* Leftover on the right. */ new_extents++; } } else { new_extents = 1; - if (prev_bit == 1) { + if (prev_bit_set) { /* Merging with neighbor on the left. */ new_extents--; } - if (next_bit == 1) { + if (next_bit_set) { /* Merging with neighbor on the right. 
*/ new_extents--; } } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } static int remove_free_space_extent(struct btrfs_trans_handle *trans, @@ -739,7 +731,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -771,7 +763,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, /* Delete the existing key (cases 1-4). */ ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; /* Add a key for leftovers at the beginning (cases 3 and 4). */ if (start > found_start) { @@ -782,7 +774,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) - goto out; + return ret; new_extents++; } @@ -795,50 +787,58 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) - goto out; + return ret; new_extents++; } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } -EXPORT_FOR_TESTS -int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size) +static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path) { struct btrfs_free_space_info *info; u32 flags; - int ret; - if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { - ret = __add_block_group_free_space(trans, block_group, path); - if (ret) - return ret; - } + if (bg->using_free_space_bitmaps_cached) + return bg->using_free_space_bitmaps; - info = search_free_space_info(NULL, block_group, path, 0); + info = btrfs_search_free_space_info(NULL, bg, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); btrfs_release_path(path); - if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS); + bg->using_free_space_bitmaps_cached = true; + + return bg->using_free_space_bitmaps; +} + +EXPORT_FOR_TESTS +int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + int ret; + + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; + + ret = using_bitmaps(block_group, path); + if (ret < 0) + return ret; + + if (ret) return modify_free_space_bitmap(trans, block_group, path, - start, size, 1); - } else { - return remove_free_space_extent(trans, block_group, path, - start, size); - } + start, size, true); + + return remove_free_space_extent(trans, block_group, path, start, size); } -int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size) +int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; @@ -863,8 +863,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, } 
mutex_lock(&block_group->free_space_lock); - ret = __remove_from_free_space_tree(trans, block_group, path, start, - size); + ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); if (ret) btrfs_abort_transaction(trans, ret); @@ -918,7 +917,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -941,7 +940,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, if (found_end == start) { ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; new_key.objectid = found_start; new_key.offset += key.offset; new_extents--; @@ -958,7 +957,7 @@ right: ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -982,7 +981,7 @@ right: if (found_start == end) { ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; new_key.offset += key.offset; new_extents--; } @@ -992,48 +991,36 @@ insert: /* Insert the new key (cases 1-4). */ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); if (ret) - goto out; + return ret; btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } EXPORT_FOR_TESTS -int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size) +int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_free_space_info *info; - u32 flags; int ret; - if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { - ret = __add_block_group_free_space(trans, block_group, path); - if (ret) - return ret; - } + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; - info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) - return PTR_ERR(info); - flags = btrfs_free_space_flags(path->nodes[0], info); - btrfs_release_path(path); + ret = using_bitmaps(block_group, path); + if (ret < 0) + return ret; - if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (ret) return modify_free_space_bitmap(trans, block_group, path, - start, size, 0); - } else { - return add_free_space_extent(trans, block_group, path, start, - size); - } + start, size, false); + + return add_free_space_extent(trans, block_group, path, start, size); } -int add_to_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size) +int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; @@ -1058,7 +1045,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, } mutex_lock(&block_group->free_space_lock); - ret = __add_to_free_space_tree(trans, block_group, path, start, size); + ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); if (ret) btrfs_abort_transaction(trans, ret); @@ -1138,11 +1125,11 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, break; if (start < key.objectid) { - ret = 
__add_to_free_space_tree(trans, - block_group, - path2, start, - key.objectid - - start); + ret = __btrfs_add_to_free_space_tree(trans, + block_group, + path2, start, + key.objectid - + start); if (ret) goto out_locked; } @@ -1161,8 +1148,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, goto out_locked; } if (start < end) { - ret = __add_to_free_space_tree(trans, block_group, path2, - start, end - start); + ret = __btrfs_add_to_free_space_tree(trans, block_group, path2, + start, end - start); if (ret) goto out_locked; } @@ -1241,6 +1228,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; + struct rb_node *node; int nr; int ret; @@ -1269,6 +1257,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, btrfs_release_path(path); } + node = rb_first_cached(&trans->fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *bg; + + bg = rb_entry(node, struct btrfs_block_group, cache_node); + clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags); + node = rb_next(node); + cond_resched(); + } + return 0; } @@ -1358,12 +1356,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); + + if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, + &block_group->runtime_flags)) + goto next; + ret = populate_free_space_tree(trans, block_group); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } +next: if (btrfs_should_end_transaction(trans)) { btrfs_end_transaction(trans); trans = btrfs_start_transaction(free_space_root, 1); @@ -1386,51 +1390,79 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { + bool own_path = false; int ret; - clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); + if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + &block_group->runtime_flags)) + return 0; + + /* + * While rebuilding the free space tree we may allocate new metadata + * block groups while modifying the free space tree. + * + * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we + * can use multiple transactions, every time btrfs_end_transaction() is + * called at btrfs_rebuild_free_space_tree() we finish the creation of + * new block groups by calling btrfs_create_pending_block_groups(), and + * that in turn calls us, through add_block_group_free_space(), to add + * a free space info item and a free space extent item for the block + * group. + * + * Then later btrfs_rebuild_free_space_tree() may find such new block + * groups and processes them with populate_free_space_tree(), which can + * fail with EEXIST since there are already items for the block group in + * the free space tree. Notice that we say "may find" because a new + * block group may be added to the block groups rbtree in a node before + * or after the block group currently being processed by the rebuild + * process. So signal the rebuild process to skip such new block groups + * if it finds them. 
+ */ + set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + btrfs_abort_transaction(trans, -ENOMEM); + return -ENOMEM; + } + own_path = true; + } ret = add_new_free_space_info(trans, block_group, path); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = __btrfs_add_to_free_space_tree(trans, block_group, path, + block_group->start, block_group->length); if (ret) - return ret; + btrfs_abort_transaction(trans, ret); + +out: + if (own_path) + btrfs_free_path(path); - return __add_to_free_space_tree(trans, block_group, path, - block_group->start, - block_group->length); + return ret; } -int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group) +int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path = NULL; - int ret = 0; + int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; mutex_lock(&block_group->free_space_lock); - if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) - goto out; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - ret = __add_block_group_free_space(trans, block_group, path); - -out: - btrfs_free_path(path); + ret = __add_block_group_free_space(trans, block_group, NULL); mutex_unlock(&block_group->free_space_lock); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } -int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group) +int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_path *path; @@ -1451,6 +1483,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -1463,8 +1496,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -1492,16 +1527,16 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } ret = 0; out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1513,7 +1548,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; - int prev_bit = 0, bit; + bool prev_bit_set = false; /* Initialize to silence GCC. 
*/ u64 extent_start = 0; u64 end, offset; @@ -1530,7 +1565,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, while (1) { ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret) break; @@ -1544,10 +1579,12 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, offset = key.objectid; while (offset < key.objectid + key.offset) { - bit = free_space_test_bit(block_group, path, offset); - if (prev_bit == 0 && bit == 1) { + bool bit_set; + + bit_set = btrfs_free_space_test_bit(block_group, path, offset); + if (!prev_bit_set && bit_set) { extent_start = offset; - } else if (prev_bit == 1 && bit == 0) { + } else if (prev_bit_set && !bit_set) { u64 space_added; ret = btrfs_add_new_free_space(block_group, @@ -1555,7 +1592,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, offset, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; @@ -1563,14 +1600,14 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } extent_count++; } - prev_bit = bit; + prev_bit_set = bit_set; offset += fs_info->sectorsize; } } - if (prev_bit == 1) { + if (prev_bit_set) { ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) - goto out; + return ret; extent_count++; } @@ -1580,13 +1617,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, block_group->start, extent_count, expected_extent_count); DEBUG_WARN(); - ret = -EIO; - goto out; + return -EIO; } - ret = 0; -out: - return ret; + return 0; } static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, @@ -1613,7 +1647,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret) break; @@ -1629,7 +1663,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, key.objectid + key.offset, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; @@ -1644,16 +1678,13 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, block_group->start, extent_count, expected_extent_count); DEBUG_WARN(); - ret = -EIO; - goto out; + return -EIO; } - ret = 0; -out: - return ret; + return 0; } -int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; @@ -1674,7 +1705,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->search_commit_root = 1; path->reada = READA_FORWARD; - info = search_free_space_info(NULL, block_group, path, 0); + info = btrfs_search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index e6c6d6f4f221..3d9a5d4477fc 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -22,39 +22,39 @@ struct btrfs_trans_handle; #define BTRFS_FREE_SPACE_BITMAP_SIZE 256 #define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) -void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); +void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct 
btrfs_fs_info *fs_info); int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info); -int load_free_space_tree(struct btrfs_caching_control *caching_ctl); -int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group); -int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group); -int add_to_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size); -int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size); +int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); +int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, int cow); -int __add_to_free_space_tree(struct btrfs_trans_handle *trans, +btrfs_search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size); -int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size); -int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path); -int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path); -int free_space_test_bit(struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 offset); + struct btrfs_path *path, int cow); +int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset); #endif #endif diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 4394de12a767..8cc07cc70b12 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -420,6 +420,8 @@ struct btrfs_commit_stats { u64 last_commit_dur; /* The total commit duration in ns */ u64 total_commit_dur; + /* Start of the last critical section in ns. 
*/ + u64 critical_section_start_time; }; struct btrfs_fs_info { @@ -713,8 +715,6 @@ struct btrfs_fs_info { u32 data_chunk_allocations; u32 metadata_ratio; - void *bdev_holder; - /* Private scrub information */ struct mutex scrub_lock; atomic_t scrubs_running; @@ -739,12 +739,6 @@ struct btrfs_fs_info { spinlock_t qgroup_lock; /* - * Used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* * Protect user change for quota operations. If a transaction is needed, * it must be started before locking this lock. */ @@ -779,7 +773,7 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Entries are eb->start / sectorsize */ + /* Entries are eb->start >> nodesize_bits */ struct xarray buffer_tree; /* Next backup root to be overwritten */ @@ -811,6 +805,7 @@ struct btrfs_fs_info { /* Cached block sizes */ u32 nodesize; + u32 nodesize_bits; u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index a61c3540d67b..f06cf701ae5a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -78,13 +78,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( } /* Returns NULL if no extref found */ -struct btrfs_inode_extref * -btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct fscrypt_str *name, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow) +struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid) { int ret; struct btrfs_key key; @@ -93,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ERR_PTR(ret); if (ret > 0) @@ -720,13 +717,12 @@ delete: } out: if (ret >= 0 && pending_del_nr) { - int err; + int ret2; - err = btrfs_del_items(trans, root, path, pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, err); - ret = err; + ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + if (ret2) { + btrfs_abort_transaction(trans, ret2); + ret = ret2; } } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index c11b97fdccc4..6d9f5ad20646 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -101,13 +101,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); -struct btrfs_inode_extref *btrfs_lookup_inode_extref( - struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct fscrypt_str *name, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow); +struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid); struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 26d6ed170a19..b77dd22b8cdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -308,7 +308,7 @@ static void __cold 
btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); @@ -395,8 +395,8 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, u64 offset, u64 bytes) { - unsigned long index = offset >> PAGE_SHIFT; - unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + pgoff_t index = offset >> PAGE_SHIFT; + const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT; struct folio *folio; while (index <= end_index) { @@ -423,18 +423,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { - int err; + int ret; if (args->default_acl) { - err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ret = __btrfs_set_acl(trans, args->inode, args->default_acl, ACL_TYPE_DEFAULT); - if (err) - return err; + if (ret) + return ret; } if (args->acl) { - err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); - if (err) - return err; + ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (ret) + return ret; } if (!args->default_acl && !args->acl) cache_no_acl(args->inode); @@ -781,12 +781,15 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } + /* Defrag ioctl takes precedence over mount options and properties. */ + if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) + return 0; + if (BTRFS_COMPRESS_NONE < inode->defrag_compress && + inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) + return 1; /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; - /* defrag ioctl */ - if (inode->defrag_compress) - return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; @@ -808,12 +811,11 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { - unsigned long end_index = end >> PAGE_SHIFT; + const pgoff_t end_index = end >> PAGE_SHIFT; struct folio *folio; int ret = 0; - for (unsigned long index = start >> PAGE_SHIFT; - index <= end_index; index++) { + for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) { folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) @@ -943,7 +945,7 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) { + if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { compress_type = inode->defrag_compress; compress_level = inode->defrag_compress_level; } else if (inode->prop_compress) { @@ -1755,7 +1757,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, spin_unlock(&sinfo->lock); if (count > 0) - btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE); + btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, + &cached_state); } btrfs_unlock_extent(io_tree, start, end, &cached_state); @@ -2328,8 +2331,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. 
*/ - ASSERT(!(end <= folio_pos(locked_folio) || - start >= folio_pos(locked_folio) + folio_size(locked_folio))); + ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); @@ -2737,7 +2739,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); - u64 page_end = folio_pos(folio) + folio_size(folio) - 1; + u64 page_end = folio_end(folio) - 1; int ret = 0; bool free_delalloc_space = true; @@ -2881,7 +2883,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) DEBUG_WARN(); btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs", - BTRFS_I(inode)->root->root_key.objectid, + btrfs_root_id(BTRFS_I(inode)->root), btrfs_ino(BTRFS_I(inode)), folio_pos(folio)); return -EUCLEAN; @@ -3375,8 +3377,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, NULL)) { /* Skip the range without csum for data reloc inode */ - btrfs_clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bit(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM, NULL); return true; } @@ -3946,6 +3948,7 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &inode->flags, &inode->ro_flags); btrfs_update_inode_mapping_flags(inode); + btrfs_set_inode_mapping_order(inode); cache_index: /* @@ -4078,45 +4081,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - struct btrfs_map_token token; u64 flags; - btrfs_init_map_token(&token, leaf); - - btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); - btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); - btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_token_inode_mode(&token, item, inode->i_mode); - btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); - - btrfs_set_token_timespec_sec(&token, &item->atime, - inode_get_atime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->atime, - inode_get_atime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->mtime, - inode_get_mtime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode_get_mtime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); - - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); - btrfs_set_token_inode_generation(&token, item, - BTRFS_I(inode)->generation); - btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); - btrfs_set_token_inode_transid(&token, item, trans->transid); - btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + 
btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); - btrfs_set_token_inode_flags(&token, item, flags); - btrfs_set_token_inode_block_group(&token, item, 0); + btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_block_group(leaf, item, 0); } /* @@ -4215,20 +4208,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto err; + btrfs_free_path(path); + return di ? PTR_ERR(di) : -ENOENT; } ret = btrfs_delete_one_dir_name(trans, root, path, di); + /* + * Down the call chains below we'll also need to allocate a path, so no + * need to hold on to this one for longer than necessary. + */ + btrfs_free_path(path); if (ret) - goto err; - btrfs_release_path(path); + return ret; /* * If we don't have dir index, we have to get it by looking up @@ -4254,7 +4249,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, "failed to delete reference to %.*s, root %llu inode %llu parent %llu", name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); - goto err; + return ret; } skip_backref: if (rename_ctx) @@ -4263,7 +4258,7 @@ skip_backref: ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); - goto err; + return ret; } /* @@ -4287,19 +4282,14 @@ skip_backref: * holding. 
*/ btrfs_run_delayed_iput(fs_info, inode); -err: - btrfs_free_path(path); - if (ret) - goto out; btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); - ret = btrfs_update_inode(trans, dir); -out: - return ret; + + return btrfs_update_inode(trans, dir); } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -4704,68 +4694,68 @@ out_up_write: return ret; } -static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_inode *dir = BTRFS_I(vfs_dir); + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) + if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(BTRFS_I(dir), dentry); + return btrfs_delete_subvolume(dir, dentry); } - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(BTRFS_I(dir)); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } - if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (inode->last_unlink_trans >= trans->transid) + btrfs_record_snapshot_destroy(trans, dir); + + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + ret = btrfs_unlink_subvol(trans, dir, dentry); goto out; } - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &fname.disk_name); - if (!ret) { - btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. 
This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } + ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name); + if (!ret) + btrfs_i_size_write(inode, 0); out: btrfs_end_transaction(trans); out_notrans: @@ -4821,9 +4811,9 @@ again: */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = folio_pos(folio) + folio_size(folio) - 1; + zero_end = folio_end(folio); folio_zero_range(folio, zero_start - folio_pos(folio), - zero_end - zero_start + 1); + zero_end - zero_start); out_unlock: folio_unlock(folio); @@ -4861,7 +4851,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); - size_t write_bytes = blocksize; int ret = 0; const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), blocksize); @@ -4913,8 +4902,12 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize, false); if (ret < 0) { + size_t write_bytes = blocksize; + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { - /* For nocow case, no need to reserve data space */ + /* For nocow case, no need to reserve data space. */ + ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u", + write_bytes, blocksize); only_release_metadata = true; } else { goto out; @@ -5001,8 +4994,7 @@ again: * not reach disk, it still affects our page caches. */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, - end); + zero_end = min_t(u64, folio_end(folio) - 1, end); } else { zero_start = max_t(u64, block_start, start); zero_end = min_t(u64, block_end, end); @@ -5014,11 +5006,12 @@ again: block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + EXTENT_NORESERVE, &cached_state); + + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); out_unlock: if (ret) { @@ -5256,7 +5249,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { - int err; + int ret2; /* * Truncate failed, so fix up the in-memory size. We @@ -5264,9 +5257,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. 
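Two hunks above replace an inclusive end (folio_pos + folio_size - 1, paired with a "+ 1" in the length) by the exclusive folio_end(), dropping the off-by-one bookkeeping. A small sketch showing that both formulations yield the same byte count (the offsets are arbitrary example values, not taken from the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* A 16K folio starting at offset 65536, zeroing from 70000 onward. */
	const uint64_t folio_pos = 65536, folio_size = 16384;
	const uint64_t zero_start = 70000;

	/* Old style: inclusive end, so the length needs a "+ 1". */
	uint64_t end_incl = folio_pos + folio_size - 1;
	uint64_t len_old = end_incl - zero_start + 1;

	/* New style: folio_end() is exclusive, no off-by-one adjustment. */
	uint64_t end_excl = folio_pos + folio_size;	/* folio_end() */
	uint64_t len_new = end_excl - zero_start;

	assert(len_old == len_new);
	return 0;
}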
*/ - err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); - if (err) - return err; + ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); + if (ret2) + return ret2; i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } @@ -5279,31 +5272,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; - int err; + int ret; if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(idmap, dentry, attr); - if (err) - return err; + ret = setattr_prepare(idmap, dentry, attr); + if (ret) + return ret; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr); - if (err) - return err; + ret = btrfs_setsize(inode, attr); + if (ret) + return ret; } if (attr->ia_valid) { setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(BTRFS_I(inode)); + ret = btrfs_dirty_inode(BTRFS_I(inode)); - if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(idmap, dentry, inode->i_mode); + if (!ret && attr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(idmap, dentry, inode->i_mode); } - return err; + return ret; } /* @@ -5437,7 +5430,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv = NULL; + struct btrfs_block_rsv rsv; int ret; trace_btrfs_inode_evict(inode); @@ -5485,11 +5478,9 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_kill_delayed_inode_items(BTRFS_I(inode)); - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - goto out; - rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = btrfs_calc_metadata_size(fs_info, 1); + rsv.failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5501,11 +5492,11 @@ void btrfs_evict_inode(struct inode *inode) .min_type = 0, }; - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (IS_ERR(trans)) - goto out; + goto out_release; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; @@ -5517,7 +5508,7 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) - goto out; + goto out_release; else if (!ret) break; } @@ -5531,16 +5522,17 @@ void btrfs_evict_inode(struct inode *inode) * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (!IS_ERR(trans)) { - trans->block_rsv = rsv; + trans->block_rsv = &rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); } +out_release: + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); out: - btrfs_free_block_rsv(fs_info, rsv); /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. 
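The btrfs_evict_inode() hunks above move the temporary block reserve from a heap allocation (btrfs_alloc_block_rsv/btrfs_free_block_rsv) onto the stack (btrfs_init_metadata_block_rsv/btrfs_block_rsv_release), which removes one allocation-failure path. A rough sketch of the shape of that change, using a stand-in struct rather than the real btrfs_block_rsv:

#include <stdint.h>
#include <stdlib.h>

/* Stand-in for btrfs_block_rsv; the real struct lives in block-rsv.h. */
struct rsv_sketch {
	uint64_t size;
	uint64_t reserved;
	int failfast;
};

/* Old pattern: heap allocation that can fail and must be freed. */
static int evict_old_style(void)
{
	struct rsv_sketch *rsv = calloc(1, sizeof(*rsv));

	if (!rsv)
		return -1;	/* extra error path just for the allocation */
	rsv->size = 4096;
	rsv->failfast = 1;
	/* ... use rsv ... */
	free(rsv);
	return 0;
}

/* New pattern: the reserve lives on the stack; init and release, no malloc. */
static int evict_new_style(void)
{
	struct rsv_sketch rsv = { .size = 4096, .failfast = 1 };

	/* ... use &rsv exactly as before ... */
	rsv.reserved = 0;	/* "release" whatever is left at the end */
	return 0;
}

int main(void)
{
	evict_old_style();
	evict_new_style();
	return 0;
}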
Again, we might also want @@ -6173,8 +6165,7 @@ again: if (ret) goto nopos; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); - if (ret) + if (btrfs_readdir_delayed_dir_index(ctx, &ins_list)) goto nopos; /* @@ -6467,6 +6458,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; btrfs_update_inode_mapping_flags(BTRFS_I(inode)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6610,13 +6602,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto discard; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } return 0; @@ -6703,20 +6699,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; - int err; - err = btrfs_del_root_ref(trans, key.objectid, - btrfs_root_id(root), parent_ino, - &local_index, name); - if (err) - btrfs_abort_transaction(trans, err); + int ret2; + + ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root), + parent_ino, &local_index, name); + if (ret2) + btrfs_abort_transaction(trans, ret2); } else if (add_backref) { - u64 local_index; - int err; + int ret2; - err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, - &local_index); - if (err) - btrfs_abort_transaction(trans, err); + ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL); + if (ret2) + btrfs_abort_transaction(trans, ret2); } /* Return the original error code */ @@ -6735,20 +6729,20 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, }; unsigned int trans_num_items; struct btrfs_trans_handle *trans; - int err; + int ret; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (!err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (!ret) d_instantiate_new(dentry, inode); btrfs_end_transaction(trans); @@ -6756,9 +6750,9 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -6799,7 +6793,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; - int err; + int ret; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ @@ -6809,12 +6803,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); - if (err) + ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (ret) goto fail; - err = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (err) + ret = 
btrfs_set_inode_index(BTRFS_I(dir), &index); + if (ret) goto fail; /* @@ -6825,7 +6819,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, */ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; goto fail; } @@ -6838,24 +6832,24 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); - if (err) { + if (ret) { drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (ret) goto fail; if (inode->i_nlink == 1) { /* * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ - err = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (ret) goto fail; } d_instantiate(dentry, inode); @@ -6871,7 +6865,7 @@ fail: iput(inode); } btrfs_btree_balance_dirty(fs_info); - return err; + return ret; } static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, @@ -7364,13 +7358,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, static void wait_subpage_spinlock(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, @@ -7383,8 +7377,8 @@ static void wait_subpage_spinlock(struct folio *folio) * Here we just acquire the spinlock so that all existing callers * should exit and we're safe to release/invalidate the page. */ - spin_lock_irq(&subpage->lock); - spin_unlock_irq(&subpage->lock); + spin_lock_irq(&bfs->lock); + spin_unlock_irq(&bfs->lock); } static int btrfs_launder_folio(struct folio *folio) @@ -7607,7 +7601,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv rsv; int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; @@ -7649,11 +7643,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. 
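A few hunks above, wait_subpage_spinlock() keeps its acquire-then-release idiom while switching from btrfs_subpage to btrfs_folio_state; as the existing comment explains, taking and immediately dropping the lock waits out every holder that entered the critical section earlier. An illustrative userspace analogue with a pthread mutex (state_sketch is a made-up stand-in, not a kernel type):

#include <pthread.h>

struct state_sketch {
	pthread_mutex_t lock;
};

/*
 * Lock-then-unlock acts as a barrier: once this returns, every critical
 * section that acquired the lock before this call has finished.
 */
static void wait_out_existing_holders(struct state_sketch *state)
{
	pthread_mutex_lock(&state->lock);
	pthread_mutex_unlock(&state->lock);
}

int main(void)
{
	struct state_sketch s = { .lock = PTHREAD_MUTEX_INITIALIZER };

	wait_out_existing_holders(&s);
	return 0;
}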
*/ - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - return -ENOMEM; - rsv->size = min_size; - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = min_size; + rsv.failfast = true; /* * 1 for the truncate slack space @@ -7666,7 +7658,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) } /* Migrate the slack space for the truncate to our reserve */ - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the transaction and @@ -7678,7 +7670,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) goto out; } - trans->block_rsv = rsv; + trans->block_rsv = &rsv; while (1) { struct extent_state *cached_state = NULL; @@ -7721,9 +7713,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) break; } - btrfs_block_rsv_release(fs_info, rsv, -1, NULL); + btrfs_block_rsv_release(fs_info, &rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); + &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the * transaction and min_size matches 1 unit, so this should never @@ -7732,7 +7724,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (WARN_ON(ret)) break; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; } /* @@ -7771,7 +7763,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_btree_balance_dirty(fs_info); } out: - btrfs_free_block_rsv(fs_info, rsv); + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to @@ -8026,7 +8018,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; - stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->subvol = btrfs_root_id(BTRFS_I(inode)->root); stat->result_mask |= STATX_SUBVOL; spin_lock(&BTRFS_I(inode)->lock); @@ -8823,7 +8815,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, .dentry = dentry, }; unsigned int trans_num_items; - int err; + int ret; int name_len; int datasize; unsigned long ptr; @@ -8850,26 +8842,26 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, inode_set_bytes(inode, name_len); new_inode_args.inode = inode; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; /* 1 additional item for the inline extent */ trans_num_items++; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (ret) goto out; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; - btrfs_abort_transaction(trans, err); + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); discard_new_inode(inode); inode = NULL; goto out; @@ -8878,10 +8870,9 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); - err = 
btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (err) { - btrfs_abort_transaction(trans, err); + ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); + if (ret) { + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); inode = NULL; @@ -8903,16 +8894,16 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, btrfs_free_path(path); d_instantiate_new(dentry, inode); - err = 0; + ret = 0; out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static struct btrfs_trans_handle *insert_prealloc_file_extent( diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4eda35bdba71..7e13de2bdcbf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -245,7 +245,7 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. */ -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); @@ -254,7 +254,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; @@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_record_new_subvolume(trans, BTRFS_I(dir)); - d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; @@ -841,7 +841,7 @@ free_pending: static int btrfs_may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, int isdir) { - int error; + int ret; if (d_really_is_negative(victim)) return -ENOENT; @@ -851,9 +851,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); - if (error) - return error; + ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); + if (ret) + return ret; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, d_inode(victim)) || @@ -892,39 +892,37 @@ static inline int btrfs_may_create(struct mnt_idmap *idmap, * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. 
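btrfs_mksubvol() just below switches from a separate name/namelen pair to a struct qstr built with QSTR_INIT(name, strlen(name)), so the length travels with the string. A userspace analogue of that pattern, including the "."/".." rejection reused later in __btrfs_ioctl_snap_create() (the _sketch names are invented for this example):

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Userspace stand-in for struct qstr / QSTR_INIT(name, strlen(name)). */
struct qstr_sketch {
	const char *name;
	size_t len;
};

#define QSTR_SKETCH_INIT(n) { .name = (n), .len = strlen(n) }

/* "." and ".." are rejected the same way the ioctl handler does. */
static int is_dot_or_dotdot(const struct qstr_sketch *q)
{
	return q->name[0] == '.' &&
	       (q->len == 1 || (q->name[1] == '.' && q->len == 2));
}

int main(void)
{
	struct qstr_sketch dot = QSTR_SKETCH_INIT(".");
	struct qstr_sketch sub = QSTR_SKETCH_INIT("my-subvol");

	assert(is_dot_or_dotdot(&dot));
	assert(!is_dot_or_dotdot(&sub));
	return 0;
}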
*/ -static noinline int btrfs_mksubvol(const struct path *parent, +static noinline int btrfs_mksubvol(struct dentry *parent, struct mnt_idmap *idmap, - const char *name, int namelen, - struct btrfs_root *snap_src, + struct qstr *qname, struct btrfs_root *snap_src, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = d_inode(parent->dentry); + struct inode *dir = d_inode(parent); struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; - struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); - int error; + struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len); + int ret; - error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (error == -EINTR) - return error; + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) + return ret; - dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry); - error = PTR_ERR(dentry); + dentry = lookup_one(idmap, qname, parent); + ret = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(idmap, dir, dentry); - if (error) + ret = btrfs_may_create(idmap, dir, dentry); + if (ret) goto out_dput; /* * even if this name doesn't exist, we may get hash collisions. * check for them now when we can safely fail */ - error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, &name_str); - if (error) + ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str); + if (ret) goto out_dput; down_read(&fs_info->subvol_sem); @@ -933,11 +931,11 @@ static noinline int btrfs_mksubvol(const struct path *parent, goto out_up_read; if (snap_src) - error = create_snapshot(snap_src, dir, dentry, readonly, inherit); + ret = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(idmap, dir, dentry, inherit); + ret = create_subvol(idmap, dir, dentry, inherit); - if (!error) + if (!ret) fsnotify_mkdir(dir, dentry); out_up_read: up_read(&fs_info->subvol_sem); @@ -945,12 +943,12 @@ out_dput: dput(dentry); out_unlock: btrfs_inode_unlock(BTRFS_I(dir), 0); - return error; + return ret; } -static noinline int btrfs_mksnapshot(const struct path *parent, +static noinline int btrfs_mksnapshot(struct dentry *parent, struct mnt_idmap *idmap, - const char *name, int namelen, + struct qstr *qname, struct btrfs_root *root, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -977,8 +975,8 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, NULL); - ret = btrfs_mksubvol(parent, idmap, name, namelen, - root, readonly, inherit); + ret = btrfs_mksubvol(parent, idmap, qname, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: btrfs_drew_read_unlock(&root->snapshot_lock); @@ -1169,7 +1167,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } /* equal, nothing need to do */ if (ret == 0 && new_size != old_size) - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "resize device %s (devid %llu) from %llu to %llu", btrfs_dev_name(device), device->devid, old_size, new_size); @@ -1184,12 +1182,12 @@ out_drop: static noinline int __btrfs_ioctl_snap_create(struct file *file, struct mnt_idmap *idmap, - const char *name, unsigned long fd, int subvol, + const char *name, unsigned long fd, bool subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) { - int namelen; int ret = 0; + struct qstr qname = QSTR_INIT(name, strlen(name)); if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; @@ -1198,21 
+1196,20 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, if (ret) goto out; - namelen = strlen(name); if (strchr(name, '/')) { ret = -EINVAL; goto out_drop_write; } - if (name[0] == '.' && - (namelen == 1 || (name[1] == '.' && namelen == 2))) { + if (qname.name[0] == '.' && + (qname.len == 1 || (qname.name[1] == '.' && qname.len == 2))) { ret = -EEXIST; goto out_drop_write; } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, idmap, name, - namelen, NULL, readonly, inherit); + ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL, + readonly, inherit); } else { CLASS(fd, src)(fd); struct inode *src_inode; @@ -1242,8 +1239,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, */ ret = -EINVAL; } else { - ret = btrfs_mksnapshot(&file->f_path, idmap, - name, namelen, + ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname, BTRFS_I(src_inode)->root, readonly, inherit); } @@ -1280,7 +1276,7 @@ out: } static noinline int btrfs_ioctl_snap_create_v2(struct file *file, - void __user *arg, int subvol) + void __user *arg, bool subvol) { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; @@ -2558,8 +2554,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EOPNOTSUPP; goto out; } - /* compression requires us to start the IO */ - if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) && + (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) { + ret = -EINVAL; + goto out; + } + /* Compression or no-compression require to start the IO. */ + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) || + (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; range.extent_thresh = (u32)-1; } @@ -2700,7 +2702,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2751,7 +2753,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); out_free: @@ -2890,7 +2892,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(btrfs_root_id(new_root))) { + if (!btrfs_is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } @@ -3357,7 +3359,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; - struct btrfs_path *path = NULL; bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) @@ -3391,14 +3392,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, goto out_loi; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - inodes, ignore_offset); - btrfs_free_path(path); + ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -3715,22 +3709,6 @@ drop_write: return ret; } -/* - * Quick check for ioctl handlers if quotas are enabled. Proper locking must be - * done before any operations. 
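The defrag ioctl hunk above rejects a request that sets both the compress and no-compress range flags, since they contradict each other, and only then forces the start-IO behaviour. A minimal sketch of that mutual-exclusion check with illustrative bit values (the real BTRFS_DEFRAG_RANGE_* flags come from the btrfs uapi header):

#include <assert.h>
#include <errno.h>
#include <stdint.h>

#define DEFRAG_RANGE_COMPRESS_SKETCH   (1U << 0)
#define DEFRAG_RANGE_NOCOMPRESS_SKETCH (1U << 1)

/* Asking for compression and no-compression at once makes no sense. */
static int check_defrag_flags(uint32_t flags)
{
	if ((flags & DEFRAG_RANGE_COMPRESS_SKETCH) &&
	    (flags & DEFRAG_RANGE_NOCOMPRESS_SKETCH))
		return -EINVAL;
	return 0;
}

int main(void)
{
	assert(check_defrag_flags(DEFRAG_RANGE_COMPRESS_SKETCH) == 0);
	assert(check_defrag_flags(DEFRAG_RANGE_COMPRESS_SKETCH |
				  DEFRAG_RANGE_NOCOMPRESS_SKETCH) == -EINVAL);
	return 0;
}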
- */ -static bool qgroup_enabled(struct btrfs_fs_info *fs_info) -{ - bool ret = true; - - mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!fs_info->quota_root) - ret = false; - mutex_unlock(&fs_info->qgroup_ioctl_lock); - - return ret; -} - static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); @@ -3745,7 +3723,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3815,7 +3793,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3833,7 +3811,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - if (sa->create && is_fstree(sa->qgroupid)) { + if (sa->create && btrfs_is_fstree(sa->qgroupid)) { ret = -EINVAL; goto out; } @@ -3874,7 +3852,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3922,7 +3900,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(fs_info)) + if (!btrfs_qgroup_enabled(fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -4200,7 +4178,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) } spin_lock(&fs_info->super_lock); - strcpy(super_block->label, label); + strscpy(super_block->label, label); spin_unlock(&fs_info->super_lock); ret = btrfs_commit_transaction(trans); @@ -4629,6 +4607,13 @@ out_acct: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + /* * Context that's attached to an encoded read io_uring command, in cmd->pdu. 
It * contains the fields in btrfs_uring_read_extent that are necessary to finish @@ -4650,6 +4635,7 @@ struct btrfs_uring_priv { }; struct io_btrfs_cmd { + struct btrfs_uring_encoded_data *data; struct btrfs_uring_priv *priv; }; @@ -4659,7 +4645,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; - unsigned long index; + pgoff_t index; u64 cur; size_t page_offset; ssize_t ret; @@ -4708,6 +4694,7 @@ out: kfree(priv->pages); kfree(priv->iov); kfree(priv); + kfree(bc->data); } void btrfs_uring_read_extent_endio(void *ctx, int err) @@ -4791,13 +4778,6 @@ out_fail: return ret; } -struct btrfs_uring_encoded_data { - struct btrfs_ioctl_encoded_io_args args; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; - struct iov_iter iter; -}; - static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); @@ -4813,7 +4793,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; - struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4829,7 +4813,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); #else - return -ENOTTY; + ret = -ENOTTY; + goto out_acct; #endif } else { copy_end = copy_end_kernel; @@ -4842,7 +4827,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue goto out_acct; } - io_uring_cmd_get_async_data(cmd)->op_data = data; + bc->data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -4940,6 +4925,9 @@ out_acct: add_rchar(current, ret); inc_syscr(current); + if (ret != -EIOCBQUEUED && ret != -EAGAIN) + kfree(data); + return ret; } @@ -4950,7 +4938,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu struct file *file; ssize_t ret; void __user *sqe_addr; - struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4972,7 +4964,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu goto out_acct; } - io_uring_cmd_get_async_data(cmd)->op_data = data; + bc->data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -5062,6 +5054,9 @@ out_acct: if (ret > 0) add_wchar(current, ret); inc_syscw(current); + + if (ret != -EAGAIN) + kfree(data); return ret; } diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e08ea446cf48..ccf6bed9cc24 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -8,7 +8,7 @@ struct file; struct dentry; struct mnt_idmap; -struct fileattr; +struct file_kattr; struct io_uring_cmd; struct 
btrfs_inode; struct btrfs_fs_info; @@ -16,9 +16,9 @@ struct btrfs_ioctl_balance_args; long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 6abf81bb00c2..022ebc89af85 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -37,106 +37,46 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); btrfs_no_printk(fs_info, fmt, ##args) #endif -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - /* - * Wrappers that use printk_in_rcu + * Print a message with filesystem info, enclosed in RCU protection. */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ +#define btrfs_crit(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ +#define btrfs_err(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ +#define btrfs_warn(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ +#define btrfs_info(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) /* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) 
\ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* * Wrappers that use a ratelimited printk */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) #define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #else -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +/* When printk() is no_printk(), expand to no-op. */ +#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0) +#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0) #endif #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ @@ -146,26 +86,15 @@ do { \ rcu_read_unlock(); \ } while (0) -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ + \ + rcu_read_lock(); \ if (__ratelimit(&_rs)) \ btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) 
\ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 0d599fd847c9..ff5eac84d819 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -7,6 +7,8 @@ #include <linux/bitmap.h> #include <linux/sched.h> #include <linux/wait.h> +#include <linux/mm.h> +#include <linux/pagemap.h> #include <linux/math64.h> #include <linux/rbtree.h> @@ -119,28 +121,23 @@ static inline struct rb_node *rb_simple_search_first(const struct rb_root *root, return ret; } -static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) +static int rb_simple_node_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct rb_simple_node *entry; + struct rb_simple_node *new_entry = rb_entry(new, struct rb_simple_node, rb_node); + struct rb_simple_node *existing_entry = rb_entry(existing, struct rb_simple_node, rb_node); - while (*p) { - parent = *p; - entry = rb_entry(parent, struct rb_simple_node, rb_node); + if (new_entry->bytenr < existing_entry->bytenr) + return -1; + else if (new_entry->bytenr > existing_entry->bytenr) + return 1; - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } + return 0; +} - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; +static inline struct rb_node *rb_simple_insert(struct rb_root *root, + struct rb_simple_node *simple_node) +{ + return rb_find_add(&simple_node->rb_node, root, rb_simple_node_bytenr_cmp); } static inline bool bitmap_test_range_all_set(const unsigned long *addr, @@ -163,4 +160,9 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr, return (found_set == start + nbits); } +static inline u64 folio_end(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9212ce110cde..2829f20d7bb5 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (folio) { ASSERT(folio->mapping); ASSERT(folio_pos(folio) <= file_offset); - ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); + ASSERT(file_offset + len <= folio_end(folio)); /* * Ordered flag indicates whether we still have diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fc821aa446f0..74e38da9bd39 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -190,7 +190,7 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("BTRFS: uuid item with illegal size %lu!\n", + btrfs_warn(l->fs_info, "uuid item with illegal size %lu", (unsigned long)item_size); return; } @@ -223,7 +223,7 @@ static void print_eb_refs_lock(const struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", - atomic_read(&eb->refs), eb->lock_owner, current->pid); + refcount_read(&eb->refs), eb->lock_owner, current->pid); #endif } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b3176edbde82..1a5972178b3a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -160,23 +160,34 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void 
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); +static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *qgroupid = key; + const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < *qgroupid) + return -1; + else if (qgroup->qgroupid > *qgroupid) + return 1; + + return 0; +} + /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info, u64 qgroupid) { - struct rb_node *n = fs_info->qgroup_tree.rb_node; - struct btrfs_qgroup *qgroup; + struct rb_node *node; - while (n) { - qgroup = rb_entry(n, struct btrfs_qgroup, node); - if (qgroup->qgroupid < qgroupid) - n = n->rb_left; - else if (qgroup->qgroupid > qgroupid) - n = n->rb_right; - else - return qgroup; - } - return NULL; + node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp); + return rb_entry_safe(node, struct btrfs_qgroup, node); +} + +static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node); + + return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing); } /* @@ -191,39 +202,25 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc, u64 qgroupid) { - struct rb_node **p = &fs_info->qgroup_tree.rb_node; - struct rb_node *parent = NULL; - struct btrfs_qgroup *qgroup; + struct rb_node *node; /* Caller must have pre-allocated @prealloc. */ ASSERT(prealloc); - while (*p) { - parent = *p; - qgroup = rb_entry(parent, struct btrfs_qgroup, node); - - if (qgroup->qgroupid < qgroupid) { - p = &(*p)->rb_left; - } else if (qgroup->qgroupid > qgroupid) { - p = &(*p)->rb_right; - } else { - kfree(prealloc); - return qgroup; - } + prealloc->qgroupid = qgroupid; + node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp); + if (node) { + kfree(prealloc); + return rb_entry(node, struct btrfs_qgroup, node); } - qgroup = prealloc; - qgroup->qgroupid = qgroupid; - INIT_LIST_HEAD(&qgroup->groups); - INIT_LIST_HEAD(&qgroup->members); - INIT_LIST_HEAD(&qgroup->dirty); - INIT_LIST_HEAD(&qgroup->iterator); - INIT_LIST_HEAD(&qgroup->nested_iterator); + INIT_LIST_HEAD(&prealloc->groups); + INIT_LIST_HEAD(&prealloc->members); + INIT_LIST_HEAD(&prealloc->dirty); + INIT_LIST_HEAD(&prealloc->iterator); + INIT_LIST_HEAD(&prealloc->nested_iterator); - rb_link_node(&qgroup->node, parent, p); - rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); - - return qgroup; + return prealloc; } static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) @@ -349,13 +346,27 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid } #endif -static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) +__printf(2, 3) +static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...) 
{ + const u64 old_flags = fs_info->qgroup_flags; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); + if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf); + va_end(args); + } } static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, @@ -386,12 +397,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (!fs_info->quota_root) return 0; - fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { - ret = -ENOMEM; - goto out; - } - path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -434,13 +439,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) qgroup_read_enable_gen(fs_info, l, slot, ptr); - } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { - qgroup_mark_inconsistent(fs_info); - btrfs_err(fs_info, - "qgroup generation mismatch, marked as inconsistent"); - } + else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) + qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch"); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -451,10 +453,8 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || - (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - btrfs_err(fs_info, "inconsistent qgroup config"); - qgroup_mark_inconsistent(fs_info); - } + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) + qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config"); if (!qgroup) { struct btrfs_qgroup *prealloc; struct btrfs_root *tree_root = fs_info->tree_root; @@ -476,7 +476,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) * during mount before we start doing things like creating * subvolumes. */ - if (is_fstree(qgroup->qgroupid) && + if (btrfs_is_fstree(qgroup->qgroupid) && qgroup->qgroupid > tree_root->free_objectid) /* * Don't need to check against BTRFS_LAST_FREE_OBJECTID, @@ -581,8 +581,6 @@ out: if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } else { - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; btrfs_sysfs_del_qgroups(fs_info); } @@ -630,29 +628,30 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info) /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), - * first two are in single-threaded paths.And for the third one, we have set - * quota_root to be null with qgroup_lock held before, so it is safe to clean - * up the in-memory structures without qgroup_lock held. + * first two are in single-threaded paths. */ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; + /* + * btrfs_quota_disable() can be called concurrently with + * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the + * lock. 
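qgroup_mark_inconsistent() above now takes a printf-style reason and emits a single warning through struct va_format and the kernel-only %pV specifier, and only when the inconsistent flag was not already set. A plain userspace analogue simply forwards the va_list to vfprintf (mark_inconsistent_sketch is illustrative, not the kernel function):

#include <stdarg.h>
#include <stdio.h>

__attribute__((format(printf, 1, 2)))
static void mark_inconsistent_sketch(const char *fmt, ...)
{
	va_list args;

	/* Fixed prefix, caller-supplied reason, one message in total. */
	fprintf(stderr, "qgroup marked inconsistent, ");
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fprintf(stderr, "\n");
}

int main(void)
{
	mark_inconsistent_sketch("qgroup item update error %d", -5);
	return 0;
}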
+ */ + spin_lock(&fs_info->qgroup_lock); while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); __del_qgroup_rb(qgroup); + spin_unlock(&fs_info->qgroup_lock); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); + spin_lock(&fs_info->qgroup_lock); } - /* - * We call btrfs_free_qgroup_config() when unmounting - * filesystem and disabling quota, so we set qgroup_ulist - * to be null here to avoid double free. - */ - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; + spin_unlock(&fs_info->qgroup_lock); + btrfs_sysfs_del_qgroups(fs_info); } @@ -998,7 +997,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup = NULL; struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; - struct ulist *ulist = NULL; const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1021,12 +1019,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, if (fs_info->quota_root) goto out; - ulist = ulist_alloc(GFP_KERNEL); - if (!ulist) { - ret = -ENOMEM; - goto out; - } - ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; @@ -1066,9 +1058,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, if (fs_info->quota_root) goto out; - fs_info->qgroup_ulist = ulist; - ulist = NULL; - /* * initially create the quota tree */ @@ -1155,11 +1144,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -1272,17 +1256,13 @@ out_free_root: if (ret) btrfs_put_root(quota_root); out: - if (ret) { - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; + if (ret) btrfs_sysfs_del_qgroups(fs_info); - } mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); - ulist_free(ulist); kfree(prealloc); return ret; } @@ -1354,11 +1334,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) /* * We have nothing held here and no trans handle, just return the error - * if there is one. + * if there is one and set back the quota enabled bit since we didn't + * actually disable quotas. 
*/ ret = flush_reservations(fs_info); - if (ret) + if (ret) { + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); return ret; + } /* * 1 For the root item @@ -1679,9 +1662,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) struct btrfs_qgroup *prealloc = NULL; int ret = 0; - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) - return 0; - mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1844,13 +1824,12 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr) { DEBUG_WARN(); - btrfs_warn_rl(fs_info, -"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", - btrfs_qgroup_level(qgroup->qgroupid), - btrfs_qgroup_subvolid(qgroup->qgroupid), - qgroup->rfer, qgroup->rfer_cmpr, - qgroup->excl, qgroup->excl_cmpr); - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rfer, qgroup->rfer_cmpr, + qgroup->excl, qgroup->excl_cmpr); } } del_qgroup_rb(fs_info, qgroupid); @@ -1873,7 +1852,8 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su struct btrfs_trans_handle *trans; int ret; - if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root) + if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || + !fs_info->quota_root) return 0; /* @@ -1968,11 +1948,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_limit_item(trans, qgroup); - if (ret) { - qgroup_mark_inconsistent(fs_info); - btrfs_info(fs_info, "unable to update quota limit for %llu", - qgroupid); - } + if (ret) + qgroup_mark_inconsistent(fs_info, "qgroup item update error %d", ret); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); @@ -2027,7 +2004,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret)); return xa_err(ret); } @@ -2094,10 +2071,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(fs_info); - btrfs_warn(fs_info, -"error accounting new delayed refs extent (err code: %d), quota inconsistent", - ret); + qgroup_mark_inconsistent(fs_info, + "error accounting new delayed refs extent: %d", ret); return 0; } @@ -2341,7 +2316,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); /* For src_path */ - atomic_inc(&src_eb->refs); + refcount_inc(&src_eb->refs); src_path->nodes[root_level] = src_eb; src_path->slots[root_level] = dst_path->slots[root_level]; src_path->locks[root_level] = 0; @@ -2574,7 +2549,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, goto out; } /* For dst_path */ - atomic_inc(&dst_eb->refs); + refcount_inc(&dst_eb->refs); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; @@ -2589,7 +2564,7 @@ static int qgroup_trace_subtree_swap(struct 
btrfs_trans_handle *trans, out: btrfs_free_path(dst_path); if (ret < 0) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret); return ret; } @@ -2633,7 +2608,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * mark qgroup inconsistent. */ if (root_level >= drop_subptree_thres) { - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "subtree level reached threshold"); return 0; } @@ -2666,7 +2641,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process. */ - atomic_inc(&root_eb->refs); /* For path */ + refcount_inc(&root_eb->refs); /* For path */ path->nodes[root_level] = root_eb; path->slots[root_level] = 0; path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ @@ -2932,7 +2907,7 @@ static int maybe_fs_roots(struct ulist *roots) * trees. * If it contains a non-fs tree, it won't be shared with fs/subvol trees. */ - return is_fstree(unode->val); + return btrfs_is_fstree(unode->val); } int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, @@ -3133,10 +3108,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup info item update error %d", ret); ret = update_qgroup_limit_item(trans, qgroup); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup limit item update error %d", ret); spin_lock(&fs_info->qgroup_lock); } if (btrfs_qgroup_enabled(fs_info)) @@ -3147,7 +3124,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) ret = update_qgroup_status_item(trans); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup status item update error %d", ret); return ret; } @@ -3329,6 +3307,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u32 level_size = 0; u64 nums; + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) return -ENOMEM; @@ -3352,8 +3333,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!btrfs_qgroup_enabled(fs_info)) - goto out; quota_root = fs_info->quota_root; if (!quota_root) { @@ -3554,7 +3533,7 @@ out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan"); if (qlist_prealloc) { for (int i = 0; i < inherit->num_qgroups; i++) kfree(qlist_prealloc[i]); @@ -3588,7 +3567,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, int ret = 0; LIST_HEAD(qgroup_list); - if (!is_fstree(ref_root)) + if (!btrfs_is_fstree(ref_root)) return 0; if (num_bytes == 0) @@ -3648,7 +3627,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); - if (!is_fstree(ref_root)) + if (!btrfs_is_fstree(ref_root)) return; if (num_bytes == 0) @@ -4036,12 +4015,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = true; - btrfs_queue_work(fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + /* + * The rescan 
worker is only for full accounting qgroups, check if it's + * enabled as it is pointless to queue it otherwise. A concurrent quota + * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED. + */ + if (btrfs_qgroup_full_accounting(fs_info)) { + fs_info->qgroup_rescan_running = true; + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } else { + ret = -ENOTCONN; + } mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + return ret; } int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, @@ -4128,8 +4116,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ - clear_ret = btrfs_clear_extent_bits(&inode->io_tree, entry_start, - entry_end, EXTENT_QGROUP_RESERVED); + clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start, entry_end, + EXTENT_QGROUP_RESERVED, NULL); if (!ret && clear_ret < 0) ret = clear_ret; @@ -4216,7 +4204,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root)) || len == 0) + !btrfs_is_fstree(btrfs_root_id(root)) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -4469,7 +4457,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root)) || num_bytes == 0) + !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -4514,7 +4502,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* TODO: Update trace point to handle such free */ @@ -4530,7 +4518,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* @@ -4589,7 +4577,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, @@ -4673,6 +4661,28 @@ out: spin_unlock(&swapped_blocks->lock); } +static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *bytenr = key; + const struct btrfs_qgroup_swapped_block *block = rb_entry(node, + struct btrfs_qgroup_swapped_block, node); + + if (block->subvol_bytenr < *bytenr) + return -1; + else if (block->subvol_bytenr > *bytenr) + return 1; + + return 0; +} + +static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new, + struct btrfs_qgroup_swapped_block, node); + + return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing); +} + /* * Add subtree roots record into @subvol_root. 
* @@ -4692,8 +4702,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_fs_info *fs_info = subvol_root->fs_info; struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; - struct rb_node **cur; - struct rb_node *parent = NULL; + struct rb_node *node; int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; @@ -4742,46 +4751,32 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, /* Insert @block into @blocks */ spin_lock(&blocks->lock); - cur = &blocks->blocks[level].rb_node; - while (*cur) { + node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp); + if (node) { struct btrfs_qgroup_swapped_block *entry; - parent = *cur; - entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, - node); + entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node); - if (entry->subvol_bytenr < block->subvol_bytenr) { - cur = &(*cur)->rb_left; - } else if (entry->subvol_bytenr > block->subvol_bytenr) { - cur = &(*cur)->rb_right; - } else { - if (entry->subvol_generation != - block->subvol_generation || - entry->reloc_bytenr != block->reloc_bytenr || - entry->reloc_generation != - block->reloc_generation) { - /* - * Duplicated but mismatch entry found. - * Shouldn't happen. - * - * Marking qgroup inconsistent should be enough - * for end users. - */ - DEBUG_WARN("duplicated but mismatched entry found"); - ret = -EEXIST; - } - kfree(block); - goto out_unlock; + if (entry->subvol_generation != block->subvol_generation || + entry->reloc_bytenr != block->reloc_bytenr || + entry->reloc_generation != block->reloc_generation) { + /* + * Duplicated but mismatch entry found. Shouldn't happen. + * Marking qgroup inconsistent should be enough for end + * users. 
+ */ + DEBUG_WARN("duplicated but mismatched entry found"); + ret = -EEXIST; } + kfree(block); + goto out_unlock; } - rb_link_node(&block->node, parent, cur); - rb_insert_color(&block->node, &blocks->blocks[level]); blocks->swapped = true; out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret); return ret; } @@ -4801,7 +4796,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; struct rb_node *node; - bool found = false; bool swapped = false; int level = btrfs_header_level(subvol_eb); int ret = 0; @@ -4809,7 +4803,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) + if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0; spin_lock(&blocks->lock); @@ -4817,23 +4811,14 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, spin_unlock(&blocks->lock); return 0; } - node = blocks->blocks[level].rb_node; - - while (node) { - block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); - if (block->subvol_bytenr < subvol_eb->start) { - node = node->rb_left; - } else if (block->subvol_bytenr > subvol_eb->start) { - node = node->rb_right; - } else { - found = true; - break; - } - } - if (!found) { + node = rb_find(&subvol_eb->start, &blocks->blocks[level], + qgroup_swapped_block_bytenr_key_cmp); + if (!node) { spin_unlock(&blocks->lock); goto out; } + block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); + /* Found one, remove it from @blocks first and update blocks->swapped */ rb_erase(&block->node, &blocks->blocks[level]); for (i = 0; i < BTRFS_MAX_LEVEL; i++) { @@ -4869,10 +4854,9 @@ free_out: free_extent_buffer(reloc_eb); out: if (ret < 0) { - btrfs_err_rl(fs_info, - "failed to account subtree at bytenr %llu: %d", - subvol_eb->start, ret); - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "failed to account subtree at bytenr %llu: %d", + subvol_eb->start, ret); } return ret; } @@ -4903,7 +4887,7 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) return 0; - if (!is_fstree(root)) + if (!btrfs_is_fstree(root)) return 0; /* If the extent predates enabling quotas, don't count it. */ diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 1834011ccc49..cab0b291088c 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -329,11 +329,14 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); - if (ret == -EEXIST) + if (ret == -EEXIST) { ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, item_size); - if (ret) + if (ret) + btrfs_abort_transaction(trans, ret); + } else if (ret) { btrfs_abort_transaction(trans, ret); + } kfree(stripe_extent); diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h deleted file mode 100644 index 1c2d7cb1fe6f..000000000000 --- a/fs/btrfs/rcu-string.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. 
- */ - -#ifndef BTRFS_RCU_STRING_H -#define BTRFS_RCU_STRING_H - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> -#include <linux/printk.h> - -struct rcu_string { - struct rcu_head rcu; - char str[]; -}; - -static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) -{ - size_t len = strlen(src) + 1; - struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + - (len * sizeof(char)), mask); - if (!ret) - return ret; - /* Warn if the source got unexpectedly truncated. */ - if (WARN_ON(strscpy(ret->str, src, len) < 0)) { - kfree(ret); - return NULL; - } - return ret; -} - -static inline void rcu_string_free(struct rcu_string *str) -{ - if (str) - kfree_rcu(str, rcu); -} - -#define printk_in_rcu(fmt, ...) do { \ - rcu_read_lock(); \ - printk(fmt, __VA_ARGS__); \ - rcu_read_unlock(); \ -} while (0) - -#define printk_ratelimited_in_rcu(fmt, ...) do { \ - rcu_read_lock(); \ - printk_ratelimited(fmt, __VA_ARGS__); \ - rcu_read_unlock(); \ -} while (0) - -#define rcu_str_deref(rcu_str) ({ \ - struct rcu_string *__str = rcu_dereference(rcu_str); \ - __str->str; \ -}) - -#endif diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 2928abf7eb82..3871c3a6c743 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -75,69 +75,70 @@ struct block_entry { struct list_head actions; }; +static int block_entry_bytenr_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *bytenr = key; + const struct block_entry *entry = rb_entry(node, struct block_entry, node); + + if (entry->bytenr < *bytenr) + return 1; + else if (entry->bytenr > *bytenr) + return -1; + + return 0; +} + +static int block_entry_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct block_entry *new_entry = rb_entry(new, struct block_entry, node); + + return block_entry_bytenr_key_cmp(&new_entry->bytenr, existing); +} + static struct block_entry *insert_block_entry(struct rb_root *root, struct block_entry *be) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct block_entry *entry; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct block_entry, node); - if (entry->bytenr > be->bytenr) - p = &(*p)->rb_left; - else if (entry->bytenr < be->bytenr) - p = &(*p)->rb_right; - else - return entry; - } + struct rb_node *node; - rb_link_node(&be->node, parent_node, p); - rb_insert_color(&be->node, root); - return NULL; + node = rb_find_add(&be->node, root, block_entry_bytenr_cmp); + return rb_entry_safe(node, struct block_entry, node); } static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) { - struct rb_node *n; - struct block_entry *entry = NULL; + struct rb_node *node; - n = root->rb_node; - while (n) { - entry = rb_entry(n, struct block_entry, node); - if (entry->bytenr < bytenr) - n = n->rb_right; - else if (entry->bytenr > bytenr) - n = n->rb_left; - else - return entry; - } - return NULL; + node = rb_find(&bytenr, root, block_entry_bytenr_key_cmp); + return rb_entry_safe(node, struct block_entry, node); +} + +static int root_entry_root_objectid_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *objectid = key; + const struct root_entry *entry = rb_entry(node, struct root_entry, node); + + if (entry->root_objectid < *objectid) + return 1; + else if (entry->root_objectid > *objectid) + return -1; + + return 0; +} + +static int root_entry_root_objectid_cmp(struct rb_node *new, const struct rb_node 
*existing) +{ + const struct root_entry *new_entry = rb_entry(new, struct root_entry, node); + + return root_entry_root_objectid_key_cmp(&new_entry->root_objectid, existing); } static struct root_entry *insert_root_entry(struct rb_root *root, struct root_entry *re) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct root_entry *entry; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct root_entry, node); - if (entry->root_objectid > re->root_objectid) - p = &(*p)->rb_left; - else if (entry->root_objectid < re->root_objectid) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(&re->node, parent_node, p); - rb_insert_color(&re->node, root); - return NULL; + struct rb_node *node; + node = rb_find_add(&re->node, root, root_entry_root_objectid_cmp); + return rb_entry_safe(node, struct root_entry, node); } static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) @@ -161,48 +162,29 @@ static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) return 0; } +static int ref_entry_cmp(struct rb_node *new, const struct rb_node *existing) +{ + struct ref_entry *new_entry = rb_entry(new, struct ref_entry, node); + struct ref_entry *existing_entry = rb_entry(existing, struct ref_entry, node); + + return comp_refs(new_entry, existing_entry); +} + static struct ref_entry *insert_ref_entry(struct rb_root *root, struct ref_entry *ref) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct ref_entry *entry; - int cmp; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct ref_entry, node); - cmp = comp_refs(entry, ref); - if (cmp > 0) - p = &(*p)->rb_left; - else if (cmp < 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(&ref->node, parent_node, p); - rb_insert_color(&ref->node, root); - return NULL; + struct rb_node *node; + node = rb_find_add(&ref->node, root, ref_entry_cmp); + return rb_entry_safe(node, struct ref_entry, node); } static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) { - struct rb_node *n; - struct root_entry *entry = NULL; + struct rb_node *node; - n = root->rb_node; - while (n) { - entry = rb_entry(n, struct root_entry, node); - if (entry->root_objectid < objectid) - n = n->rb_right; - else if (entry->root_objectid > objectid) - n = n->rb_left; - else - return entry; - } - return NULL; + node = rb_find(&objectid, root, root_entry_root_objectid_key_cmp); + return rb_entry_safe(node, struct root_entry, node); } #ifdef CONFIG_STACKTRACE @@ -668,7 +650,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, * our sanity checks pass as they are no longer needed. 
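The open-coded red-black tree walks above are converted to the generic rb_find()/rb_find_add() helpers from <linux/rbtree.h>: the caller provides comparators, rb_find_add() returns the already-present node on collision (NULL once inserted) and rb_find() does a keyed lookup. A self-contained sketch of the same pattern on a hypothetical u64-keyed entry type:

#include <linux/rbtree.h>
#include <linux/types.h>

struct demo_entry {			/* hypothetical example type */
	struct rb_node node;
	u64 key;
};

static int demo_key_cmp(const void *key, const struct rb_node *node)
{
	const u64 *k = key;
	const struct demo_entry *e = rb_entry(node, struct demo_entry, node);

	if (e->key > *k)
		return -1;	/* key sorts before this entry: go left */
	if (e->key < *k)
		return 1;	/* key sorts after this entry: go right */
	return 0;
}

static int demo_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct demo_entry *e = rb_entry(new, struct demo_entry, node);

	return demo_key_cmp(&e->key, existing);
}

/* Insert @e; returns an already-present entry on collision, NULL otherwise. */
static struct demo_entry *demo_insert(struct rb_root *root, struct demo_entry *e)
{
	struct rb_node *node = rb_find_add(&e->node, root, demo_cmp);

	return rb_entry_safe(node, struct demo_entry, node);
}

/* Keyed lookup; returns NULL when nothing matches. */
static struct demo_entry *demo_lookup(struct rb_root *root, u64 key)
{
	return rb_entry_safe(rb_find(&key, root, demo_key_cmp),
			     struct demo_entry, node);
}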
*/ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref) + const struct btrfs_ref *generic_ref) { struct ref_entry *ref = NULL, *exist; struct ref_action *ra = NULL; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 3511e1a5c96b..559bd25a2b7a 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -19,7 +19,7 @@ struct btrfs_ref; int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref); + const struct btrfs_ref *generic_ref); void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, u64 len); @@ -39,7 +39,7 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) } static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref) + const struct btrfs_ref *generic_ref) { return 0; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 62161beca559..ce25ab7f0e99 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -46,11 +46,9 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); - goto out; + return ret; } - ret = btrfs_end_transaction(trans); -out: - return ret; + return btrfs_end_transaction(trans); } static int copy_inline_to_page(struct btrfs_inode *inode, @@ -95,8 +93,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; - btrfs_clear_extent_bits(&inode->io_tree, file_offset, range_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG); + btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -270,11 +268,15 @@ copy_inline_extent: drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } write_extent_buffer(path->nodes[0], inline_data, btrfs_item_ptr_offset(path->nodes[0], @@ -283,6 +285,8 @@ copy_inline_extent: btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); btrfs_set_inode_full_sync(inode); ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { /* @@ -297,10 +301,8 @@ out: trans = NULL; } } - if (ret && trans) { - btrfs_abort_transaction(trans, ret); + if (ret && trans) btrfs_end_transaction(trans); - } if (!ret) *trans_out = trans; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 02086191630d..e58151933844 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -90,10 +90,15 @@ * map address of tree root to tree */ struct mapping_node { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simle_node for search/insert */ + union { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; void *data; }; @@ -106,10 +111,15 @@ struct mapping_tree { * present a tree block to process */ struct tree_block { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simple_node for search/insert 
*/ + union { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; u64 owner; struct btrfs_key key; u8 level; @@ -480,8 +490,7 @@ static int __add_reloc_root(struct btrfs_root *root) node->data = root; spin_lock(&rc->reloc_root_tree.lock); - rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { btrfs_err(fs_info, @@ -564,8 +573,7 @@ static int __update_reloc_root(struct btrfs_root *root) spin_lock(&rc->reloc_root_tree.lock); node->bytenr = root->node->start; - rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); @@ -1516,7 +1524,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_root_level(root_item); - atomic_inc(&reloc_root->node->refs); + refcount_inc(&reloc_root->node->refs); path->nodes[level] = reloc_root->node; path->slots[level] = 0; } else { @@ -2617,7 +2625,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, * tree. */ if (block->owner && - (!is_fstree(block->owner) || + (!btrfs_is_fstree(block->owner) || block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { ret = relocate_cowonly_block(trans, rc, block, path); if (ret) @@ -2658,66 +2666,24 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control u64 num_bytes; int nr; int ret = 0; - u64 i_size = i_size_read(&inode->vfs_inode); u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset = prealloc_start; /* - * For subpage case, previous i_size may not be aligned to PAGE_SIZE. - * This means the range [i_size, PAGE_END + 1) is filled with zeros by - * btrfs_do_readpage() call of previously relocated file cluster. + * For blocksize < folio size case (either bs < page size or large folios), + * beyond i_size, all blocks are filled with zero. * - * If the current cluster starts in the above range, btrfs_do_readpage() + * If the current cluster covers the above range, btrfs_do_readpage() * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * - * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + * Here we have to invalidate the cache covering our cluster. */ - if (!PAGE_ALIGNED(i_size)) { - struct address_space *mapping = inode->vfs_inode.i_mapping; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; - struct folio *folio; - - ASSERT(sectorsize < PAGE_SIZE); - ASSERT(IS_ALIGNED(i_size, sectorsize)); - - /* - * Subpage can't handle page with DIRTY but without UPTODATE - * bit as it can lead to the following deadlock: - * - * btrfs_read_folio() - * | Page already *locked* - * |- btrfs_lock_and_flush_ordered_range() - * |- btrfs_start_ordered_extent() - * |- extent_write_cache_pages() - * |- lock_page() - * We try to lock the page we already hold. - * - * Here we just writeback the whole data reloc inode, so that - * we will be ensured to have no dirty range in the page, and - * are safe to clear the uptodate bits. 
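The mapping_node and tree_block changes above keep the long-standing named fields but overlay them with the rb_simple_node helper structure through an anonymous union, so rb_simple_insert() and the search helpers can take a single pointer while existing code keeps dereferencing ->bytenr and ->rb_node. A reduced sketch of the aliasing trick with hypothetical names (the real helper type lives in fs/btrfs/misc.h):

#include <linux/rbtree.h>
#include <linux/types.h>

/* Same shape as btrfs's struct rb_simple_node. */
struct simple_node {
	struct rb_node rb_node;
	u64 bytenr;
};

struct demo_block {
	union {
		/* Named fields used directly by most of the code... */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};
		/* ...viewed as a single object by the search/insert helpers. */
		struct simple_node simple_node;
	};
	void *data;
};

/*
 * Both union members share one layout, so &b->simple_node, &b->rb_node and
 * &b->bytenr all name the same storage; a helper can take a
 * "struct simple_node *" argument without callers copying anything.
 */
static u64 demo_block_bytenr(const struct demo_block *b)
{
	return b->simple_node.bytenr;	/* identical to b->bytenr */
}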
- * - * This shouldn't cause too much overhead, as we need to write - * the data back anyway. - */ - ret = filemap_write_and_wait(mapping); - if (ret < 0) - return ret; - - folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); - /* - * If page is freed we don't need to do anything then, as we - * will re-read the whole page anyway. - */ - if (!IS_ERR(folio)) { - btrfs_subpage_clear_uptodate(fs_info, folio, i_size, - round_up(i_size, PAGE_SIZE) - i_size); - folio_unlock(folio); - folio_put(folio); - } - } + ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start, + prealloc_end); + if (ret < 0) + return ret; BUG_ON(cluster->start != cluster->boundary[0]); ret = btrfs_alloc_data_chunk_ondemand(inode, @@ -2806,13 +2772,15 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, static int relocate_one_folio(struct reloc_control *rc, struct file_ra_state *ra, - int *cluster_nr, unsigned long index) + int *cluster_nr, u64 *file_offset_ret) { const struct file_extent_cluster *cluster = &rc->cluster; struct inode *inode = rc->data_inode; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + const u64 orig_file_offset = *file_offset_ret; u64 offset = BTRFS_I(inode)->reloc_block_group_start; - const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; + const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT; + const pgoff_t index = orig_file_offset >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); struct folio *folio; u64 folio_start; @@ -2845,8 +2813,6 @@ again: return PTR_ERR(folio); } - WARN_ON(folio_order(folio)); - if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); @@ -2875,7 +2841,7 @@ again: goto release_folio; folio_start = folio_pos(folio); - folio_end = folio_start + PAGE_SIZE - 1; + folio_end = folio_start + folio_size(folio) - 1; /* * Start from the cluster, as for subpage case, the cluster can start @@ -2923,7 +2889,8 @@ again: * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. 
*/ - if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, + folio_start, folio_size(folio))) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + @@ -2953,6 +2920,7 @@ again: btrfs_throttle(fs_info); if (btrfs_should_cancel_balance(fs_info)) ret = -ECANCELED; + *file_offset_ret = folio_end + 1; return ret; release_folio: @@ -2966,8 +2934,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) struct inode *inode = rc->data_inode; const struct file_extent_cluster *cluster = &rc->cluster; u64 offset = BTRFS_I(inode)->reloc_block_group_start; - unsigned long index; - unsigned long last_index; + u64 cur_file_offset = cluster->start - offset; struct file_ra_state *ra; int cluster_nr = 0; int ret = 0; @@ -2989,10 +2956,11 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) if (ret) goto out; - last_index = (cluster->end - offset) >> PAGE_SHIFT; - for (index = (cluster->start - offset) >> PAGE_SHIFT; - index <= last_index && !ret; index++) - ret = relocate_one_folio(rc, ra, &cluster_nr, index); + while (cur_file_offset < cluster->end - offset) { + ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); + if (ret) + break; + } if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3155,7 +3123,7 @@ static int add_tree_block(struct reloc_control *rc, block->key_ready = false; block->owner = owner; - rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); + rb_node = rb_simple_insert(blocks, &block->simple_node); if (rb_node) btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, -EEXIST); @@ -3643,7 +3611,7 @@ restart: } btrfs_release_path(path); - btrfs_clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); + btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL); if (trans) { btrfs_end_transaction_throttle(trans); @@ -3880,7 +3848,7 @@ static void free_reloc_control(struct reloc_control *rc) */ static void describe_relocation(struct btrfs_block_group *block_group) { - char buf[128] = {'\0'}; + char buf[128] = "NONE"; btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); @@ -3900,7 +3868,8 @@ static const char *stage_to_string(enum reloc_stage stage) /* * function to relocate all extents in a block group. 
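relocate_one_folio() now reports back the first byte after the folio it handled and the caller iterates on file offsets, so each step advances by whatever size the folio actually has rather than assuming one page per iteration. A simplified sketch of that iteration pattern (function and variable names are illustrative):

#include <linux/pagemap.h>

/*
 * Walk [start, end) one folio at a time, advancing by the size of each
 * folio found (a single page or a large folio).
 */
static int walk_range_by_folio(struct address_space *mapping, u64 start, u64 end)
{
	u64 cur = start;

	while (cur < end) {
		struct folio *folio;

		folio = filemap_lock_folio(mapping, cur >> PAGE_SHIFT);
		if (IS_ERR(folio))
			return PTR_ERR(folio);

		/* ... operate on the byte range covered by this folio ... */

		/* Advance to the first byte past this folio. */
		cur = folio_pos(folio) + folio_size(folio);
		folio_unlock(folio);
		folio_put(folio);
	}
	return 0;
}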
*/ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, + bool verbose) { struct btrfs_block_group *bg; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); @@ -3992,7 +3961,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - describe_relocation(rc->block_group); + if (verbose) + describe_relocation(rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); @@ -4036,8 +4006,10 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (rc->extents_found == 0) break; - btrfs_info(fs_info, "found %llu extents, stage: %s", - rc->extents_found, stage_to_string(finishes_stage)); + if (verbose) + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, + stage_to_string(finishes_stage)); } WARN_ON(rc->block_group->pinned > 0); @@ -4339,7 +4311,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, } btrfs_backref_drop_node_buffer(node); - atomic_inc(&cow->refs); + refcount_inc(&cow->refs); node->eb = cow; node->new_bytenr = cow->start; diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 788c86d8633a..5c36b3f84b57 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -12,7 +12,8 @@ struct btrfs_trans_handle; struct btrfs_ordered_extent; struct btrfs_pending_snapshot; -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, + bool verbose); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7cd5e76a783c..6776e6ab8d10 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -556,7 +556,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), @@ -570,7 +570,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return 0; err: - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), @@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * /* Super block error, no need to search extent tree. */ if (is_super) { - btrfs_warn_in_rcu(fs_info, "scrub: %s on device %s, physical %llu", + btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } @@ -637,7 +637,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * } if (ret > 0) break; - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? 
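Several extent_buffer reference bumps above move from atomic_inc() to refcount_inc(), which goes with extent_buffer::refs becoming a refcount_t in this series; refcount_t warns on use-after-zero and saturates instead of silently wrapping. A minimal sketch of the refcount_t lifecycle on a hypothetical object:

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {			/* hypothetical example type */
	refcount_t refs;
};

static struct demo_obj *demo_obj_alloc(void)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->refs, 1);	/* caller owns one reference */
	return obj;
}

static void demo_obj_get(struct demo_obj *obj)
{
	refcount_inc(&obj->refs);	/* WARNs if the count was already 0 */
}

static void demo_obj_put(struct demo_obj *obj)
{
	if (refcount_dec_and_test(&obj->refs))
		kfree(obj);
}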
"node" : "leaf"), @@ -1045,12 +1045,12 @@ skip: */ if (repaired) { if (dev) { - btrfs_err_rl_in_rcu(fs_info, + btrfs_err_rl(fs_info, "scrub: fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { - btrfs_err_rl_in_rcu(fs_info, + btrfs_err_rl(fs_info, "scrub: fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } @@ -1059,12 +1059,12 @@ skip: /* The remaining are all for unrepaired. */ if (dev) { - btrfs_err_rl_in_rcu(fs_info, + btrfs_err_rl(fs_info, "scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { - btrfs_err_rl_in_rcu(fs_info, + btrfs_err_rl(fs_info, "scrub: unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } @@ -1806,7 +1806,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) struct btrfs_io_context *bioc = NULL; const u64 logical = stripe->logical + (i << fs_info->sectorsize_bits); - int err; + int ret; io_stripe.rst_search_commit_root = true; stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; @@ -1814,11 +1814,11 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) * For RST cases, we need to manually split the bbio to * follow the RST boundary. */ - err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); - if (err < 0) { - if (err != -ENODATA) { + if (ret < 0) { + if (ret != -ENODATA) { /* * Earlier btrfs_get_raid_extent_offset() * returned -ENODATA, which means there's @@ -3057,7 +3057,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "scrub: devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2891ec4056c6..7664025a5af4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4,6 +4,7 @@ */ #include <linux/bsearch.h> +#include <linux/falloc.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/sort.h> @@ -758,7 +759,7 @@ static int send_header(struct send_ctx *sctx) { struct btrfs_stream_header hdr; - strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); + strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); @@ -1804,7 +1805,7 @@ static int gen_unique_name(struct send_ctx *sctx, ino, gen, idx); ASSERT(len < sizeof(tmp)); tmp_name.name = tmp; - tmp_name.len = strlen(tmp); + tmp_name.len = len; di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, @@ -1843,7 +1844,7 @@ static int gen_unique_name(struct send_ctx *sctx, break; } - ret = fs_path_add(dest, tmp, strlen(tmp)); + ret = fs_path_add(dest, tmp, len); out: btrfs_free_path(path); @@ -4628,7 +4629,6 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node) { const struct recorded_ref *data = k; const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); - int result; if (data->dir > ref->dir) return 1; @@ -4642,12 +4642,7 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node) return 1; if (data->name_len < ref->name_len) return -1; - result 
= strcmp(data->name, ref->name); - if (result > 0) - return 1; - if (result < 0) - return -1; - return 0; + return strcmp(data->name, ref->name); } static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) @@ -5411,6 +5406,30 @@ tlv_put_failure: return ret; } +static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len) +{ + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); + + ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); + if (ret < 0) + return ret; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: + return ret; +} + static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; @@ -5419,6 +5438,14 @@ static int send_hole(struct send_ctx *sctx, u64 end) int ret = 0; /* + * Starting with send stream v2 we have fallocate and can use it to + * punch holes instead of sending writes full of zeroes. + */ + if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE)) + return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - offset); + + /* * A hole that starts at EOF or beyond it. Since we do not yet support * fallocate (for extent preallocation and hole punching), sending a * write of zeroes starting at EOF or beyond would later require issuing diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d9087aa81b21..0481c693ac2e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -615,7 +615,7 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) + bool dump_block_groups) { struct btrfs_block_group *cache; u64 total_avail = 0; @@ -1887,7 +1887,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, false); } return ret; } @@ -1918,7 +1918,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, bytes, 0); + btrfs_dump_space_info(fs_info, space_info, bytes, false); } return ret; } @@ -1973,13 +1973,13 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) static u64 calc_pct_ratio(u64 x, u64 y) { - int err; + int ret; if (!y) return 0; again: - err = check_mul_overflow(100, x, &x); - if (err) + ret = check_mul_overflow(100, x, &x); + if (ret) goto lose_precision; return div64_u64(x, y); lose_precision: @@ -2139,7 +2139,7 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool } } -bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) +static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) { bool ret; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 92b7f5e2b850..679f22efb407 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -278,7 +278,7 @@ u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, void btrfs_clear_space_info_full(struct btrfs_fs_info 
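send_hole() can now emit the v2 stream's fallocate command to punch a hole instead of streaming zero-filled writes; on the receiving side such a command maps onto an ordinary punch-hole fallocate. Roughly, in userspace terms (this is not the receiver's actual code):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* Punch a hole of @len bytes at @offset without changing the file size. */
static int punch_hole(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, len);
}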
*info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); + bool dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, @@ -306,7 +306,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); -bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index d4f019233493..c9b3821957f7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -49,7 +49,7 @@ * Implementation: * * - Common - * Both metadata and data will use a new structure, btrfs_subpage, to + * Both metadata and data will use a new structure, btrfs_folio_state, to * record the status of each sector inside a page. This provides the extra * granularity needed. * @@ -63,10 +63,10 @@ * This means a slightly higher tree locking latency. */ -int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio, enum btrfs_subpage_type type) +int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info, + struct folio *folio, enum btrfs_folio_type type) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; /* For metadata we don't support large folio yet. */ if (type == BTRFS_SUBPAGE_METADATA) @@ -87,18 +87,18 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type); - if (IS_ERR(subpage)) - return PTR_ERR(subpage); + bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type); + if (IS_ERR(bfs)) + return PTR_ERR(bfs); - folio_attach_private(folio, subpage); + folio_attach_private(folio, bfs); return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, - enum btrfs_subpage_type type) +void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_folio_type type) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; /* Either not subpage, or the folio already has private attached. 
*/ if (!folio_test_private(folio)) @@ -108,15 +108,15 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; - subpage = folio_detach_private(folio); - ASSERT(subpage); - btrfs_free_subpage(subpage); + bfs = folio_detach_private(folio); + ASSERT(bfs); + btrfs_free_folio_state(bfs); } -struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - size_t fsize, enum btrfs_subpage_type type) +struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info, + size_t fsize, enum btrfs_folio_type type) { - struct btrfs_subpage *ret; + struct btrfs_folio_state *ret; unsigned int real_size; ASSERT(fs_info->sectorsize < fsize); @@ -136,11 +136,6 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ret; } -void btrfs_free_subpage(struct btrfs_subpage *subpage) -{ - kfree(subpage); -} - /* * Increase the eb_refs of current subpage. * @@ -152,7 +147,7 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) */ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; if (!btrfs_meta_is_subpage(fs_info)) return; @@ -160,13 +155,13 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = folio_get_private(folio); - atomic_inc(&subpage->eb_refs); + bfs = folio_get_private(folio); + atomic_inc(&bfs->eb_refs); } void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; if (!btrfs_meta_is_subpage(fs_info)) return; @@ -174,9 +169,9 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = folio_get_private(folio); - ASSERT(atomic_read(&subpage->eb_refs)); - atomic_dec(&subpage->eb_refs); + bfs = folio_get_private(folio); + ASSERT(atomic_read(&bfs->eb_refs)); + atomic_dec(&bfs->eb_refs); } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, @@ -191,8 +186,9 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, * unmapped page like dummy extent buffer pages. 
*/ if (folio->mapping) - ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + folio_size(folio)); + ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio), + "start=%llu len=%u folio_pos=%llu folio_size=%zu", + start, len, folio_pos(folio), folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ @@ -221,14 +217,13 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + folio_size(folio), - orig_start + orig_len) - *start; + *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start; } static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; @@ -238,7 +233,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, btrfs_subpage_assert(fs_info, folio, start, len); - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. @@ -246,18 +241,18 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, * This @locked_page is locked by plain lock_page(), thus its * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->nr_locked) == 0) { - spin_unlock_irqrestore(&subpage->lock, flags); + if (atomic_read(&bfs->nr_locked) == 0) { + spin_unlock_irqrestore(&bfs->lock, flags); return true; } - for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { - clear_bit(bit, subpage->bitmaps); + for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) { + clear_bit(bit, bfs->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->nr_locked) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->nr_locked); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &bfs->nr_locked); + spin_unlock_irqrestore(&bfs->lock, flags); return last; } @@ -280,7 +275,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); ASSERT(folio_test_locked(folio)); @@ -296,7 +291,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->nr_locked) == 0) { + if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). 
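The renamed btrfs_folio_state keeps one bit per block (sectorsize unit) per flag; subpage_calc_start_bit() turns a (start, len) byte range into a bit range by shifting the folio-relative offset by sectorsize_bits. A small standalone illustration of that conversion (not the btrfs macro itself):

/*
 * With 4K blocks (sectorsize_bits == 12) in a 16K folio that starts at file
 * offset 65536, the byte range [73728, 81920) is folio-relative [8192, 16384),
 * which maps to bits 2 and 3 of a per-flag bitmap: bitmap_set(bitmap, 2, 2).
 */
static unsigned int range_to_first_bit(u64 folio_start, u64 start,
				       unsigned int sectorsize_bits)
{
	return (start - folio_start) >> sectorsize_bits;
}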
*/ folio_unlock(folio); return; @@ -310,7 +305,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; @@ -323,42 +318,42 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, return; } - if (atomic_read(&subpage->nr_locked) == 0) { + if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); for_each_set_bit(bit, &bitmap, blocks_per_folio) { - if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + if (test_and_clear_bit(bit + start_bit, bfs->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->nr_locked) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->nr_locked); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &bfs->nr_locked); + spin_unlock_irqrestore(&bfs->lock, flags); if (last) folio_unlock(folio); } #define subpage_test_bitmap_all_set(fs_info, folio, name) \ ({ \ - struct btrfs_subpage *subpage = folio_get_private(folio); \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ const unsigned int blocks_per_folio = \ btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_set(subpage->bitmaps, \ + bitmap_test_range_all_set(bfs->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ }) #define subpage_test_bitmap_all_zero(fs_info, folio, name) \ ({ \ - struct btrfs_subpage *subpage = folio_get_private(folio); \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ const unsigned int blocks_per_folio = \ btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_zero(subpage->bitmaps, \ + bitmap_test_range_all_zero(bfs->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ }) @@ -366,43 +361,43 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, 
len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_uptodate(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_unlock_irqrestore(&bfs->lock, flags); folio_mark_dirty(folio); } @@ -419,17 +414,17 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); return last; } @@ -446,91 +441,91 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (!folio_test_writeback(folio)) folio_start_writeback(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 
start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_set_ordered(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_checked(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -541,16 +536,16 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ - struct btrfs_subpage *subpage = folio_get_private(folio); \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ \ - spin_lock_irqsave(&subpage->lock, flags); \ - ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \ + spin_lock_irqsave(&bfs->lock, flags); \ + ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \ len >> fs_info->sectorsize_bits); \ - 
spin_unlock_irqrestore(&subpage->lock, flags); \ + spin_unlock_irqrestore(&bfs->lock, flags); \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); @@ -662,10 +657,10 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, { \ const unsigned int blocks_per_folio = \ btrfs_blocks_per_folio(fs_info, folio); \ - const struct btrfs_subpage *subpage = folio_get_private(folio); \ + const struct btrfs_folio_state *bfs = folio_get_private(folio); \ \ ASSERT(blocks_per_folio <= BITS_PER_LONG); \ - *dst = bitmap_read(subpage->bitmaps, \ + *dst = bitmap_read(bfs->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ } @@ -690,7 +685,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned int start_bit; unsigned int nbits; unsigned long flags; @@ -705,15 +700,15 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); nbits = len >> fs_info->sectorsize_bits; - subpage = folio_get_private(folio); - ASSERT(subpage); - spin_lock_irqsave(&subpage->lock, flags); - if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + bfs = folio_get_private(folio); + ASSERT(bfs); + spin_lock_irqsave(&bfs->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); } - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -726,7 +721,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned long flags; unsigned int start_bit; unsigned int nbits; @@ -736,19 +731,19 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); nbits = len >> fs_info->sectorsize_bits; - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); /* Target range should not yet be locked. 
*/ - if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); } - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->nr_locked); + bitmap_set(bfs->bitmaps, start_bit, nbits); + ret = atomic_add_return(nbits, &bfs->nr_locked); ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -776,7 +771,7 @@ bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct ext void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; @@ -788,18 +783,18 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(blocks_per_folio > 1); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); - dump_page(folio_page(folio, 0), "btrfs subpage dump"); + dump_page(folio_page(folio, 0), "btrfs folio state dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), @@ -815,14 +810,14 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 3042c5ea840a..ee0710eb13fd 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -32,9 +32,31 @@ struct folio; enum { btrfs_bitmap_nr_uptodate = 0, btrfs_bitmap_nr_dirty, + + /* + * This can be changed to atomic eventually. But this change will rely + * on the async delalloc range rework for locked bitmap. As async + * delalloc can unlock its range and mark blocks writeback at random + * timing. + */ btrfs_bitmap_nr_writeback, + + /* + * The ordered and checked flags are for COW fixup, already marked + * deprecated, and will be removed eventually. 
+ */ btrfs_bitmap_nr_ordered, btrfs_bitmap_nr_checked, + + /* + * The locked bit is for async delalloc range (compression), currently + * async extent is queued with the range locked, until the compression + * is done. + * So an async extent can unlock the range at any random timing. + * + * This will need a rework on the async extent lifespan (mark writeback + * and do compression) before deprecating this flag. + */ btrfs_bitmap_nr_locked, btrfs_bitmap_nr_max }; @@ -43,7 +65,7 @@ enum { * Structure to trace status of each sector inside a page, attached to * page::private for both data and metadata inodes. */ -struct btrfs_subpage { +struct btrfs_folio_state { /* Common members for both data and metadata pages */ spinlock_t lock; union { @@ -51,7 +73,7 @@ struct btrfs_subpage { * Structures only used by metadata * * @eb_refs should only be operated under private_lock, as it - * manages whether the subpage can be detached. + * manages whether the btrfs_folio_state can be detached. */ atomic_t eb_refs; @@ -65,12 +87,11 @@ struct btrfs_subpage { unsigned long bitmaps[]; }; -enum btrfs_subpage_type { +enum btrfs_folio_type { BTRFS_SUBPAGE_METADATA, BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE /* * Subpage support for metadata is more complex, as we can have dummy extent * buffers, where folios have no mapping to determine the owning inode. @@ -91,29 +112,19 @@ static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); return fs_info->sectorsize < folio_size(folio); } -#else -static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) -{ - return false; -} -static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio) -{ - if (folio->mapping && folio->mapping->host) - ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); - return false; -} -#endif -int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, - enum btrfs_subpage_type type); +int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info, + struct folio *folio, enum btrfs_folio_type type); +void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_folio_type type); /* Allocate additional data where page represents more than one sector */ -struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - size_t fsize, enum btrfs_subpage_type type); -void btrfs_free_subpage(struct btrfs_subpage *subpage); +struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info, + size_t fsize, enum btrfs_folio_type type); +static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs) +{ + kfree(bfs); +} void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a0c65adce1ab..68e35a3700ff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -261,10 +261,65 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { {} }; -/* No support for restricting writes to btrfs devices yet... 
*/ -static inline blk_mode_t btrfs_open_mode(struct fs_context *fc) +static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level) { - return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES; + const int len = strlen(type); + + return (strncmp(string, type, len) == 0) && + ((may_have_level && string[len] == ':') || string[len] == '\0'); +} + +static int btrfs_parse_compress(struct btrfs_fs_context *ctx, + const struct fs_parameter *param, int opt) +{ + const char *string = param->string; + + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears "force-compress" + * without the need to pass "compress-force=[no|none]" before + * specifying "compress". + */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zlib", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, + string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "lzo", false)) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ctx->compress_level = 0; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zstd", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, + string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "no", false) || + btrfs_match_compress_type(string, "none", false)) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + btrfs_err(NULL, "unrecognized compression value %s", string); + return -EINVAL; + } + return 0; } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) @@ -303,10 +358,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_device: { struct btrfs_device *device; - blk_mode_t mode = btrfs_open_mode(fc); mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(param->string, mode, false); + device = btrfs_scan_one_device(param->string, false); mutex_unlock(&uuid_mutex); if (IS_ERR(device)) return PTR_ERR(device); @@ -336,53 +390,8 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: - /* - * Provide the same semantics as older kernels that don't use fs - * context, specifying the "compress" option clears - * "force-compress" without the need to pass - * "compress-force=[no|none]" before specifying "compress". 
- */ - if (opt != Opt_compress_force && opt != Opt_compress_force_type) - btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); - - if (opt == Opt_compress || opt == Opt_compress_force) { - ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "zlib", 4) == 0) { - ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = - btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - param->string + 4); - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "lzo", 3) == 0) { - ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "zstd", 4) == 0) { - ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = - btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - param->string + 4); - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "no", 2) == 0) { - ctx->compress_level = 0; - ctx->compress_type = 0; - btrfs_clear_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); - } else { - btrfs_err(NULL, "unrecognized compression value %s", - param->string); + if (btrfs_parse_compress(ctx, param, opt)) return -EINVAL; - } break; case Opt_ssd: if (result.negated) { @@ -945,12 +954,12 @@ static int btrfs_fill_super(struct super_block *sb, { struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - int err; + int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; - sb->s_d_op = &btrfs_dentry_operations; + set_default_d_op(sb, &btrfs_dentry_operations); sb->s_export_op = &btrfs_export_ops; #ifdef CONFIG_FS_VERITY sb->s_vop = &btrfs_verityops; @@ -959,28 +968,28 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_time_gran = 1; sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; - err = super_setup_bdi(sb); - if (err) { + ret = super_setup_bdi(sb); + if (ret) { btrfs_err(fs_info, "super_setup_bdi failed"); - return err; + return ret; } - err = open_ctree(sb, fs_devices); - if (err) { - btrfs_err(fs_info, "open_ctree failed: %d", err); - return err; + ret = open_ctree(sb, fs_devices); + if (ret) { + btrfs_err(fs_info, "open_ctree failed: %d", ret); + return ret; } inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { - err = PTR_ERR(inode); - btrfs_handle_fs_error(fs_info, err, NULL); + ret = PTR_ERR(inode); + btrfs_handle_fs_error(fs_info, ret, NULL); goto fail_close; } sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { - err = -ENOMEM; + ret = -ENOMEM; goto fail_close; } @@ -989,7 +998,7 @@ static int btrfs_fill_super(struct super_block *sb, fail_close: close_ctree(fs_info); - return err; + return ret; } int btrfs_sync_fs(struct super_block *sb, int wait) @@ -1826,10 +1835,9 @@ static int btrfs_get_tree_super(struct fs_context *fc) struct btrfs_fs_info *fs_info = fc->s_fs_info; struct btrfs_fs_context *ctx = fc->fs_private; struct btrfs_fs_devices *fs_devices = NULL; - struct block_device *bdev; struct btrfs_device *device; struct super_block *sb; - 
blk_mode_t mode = btrfs_open_mode(fc); + blk_mode_t mode = sb_open_mode(fc->sb_flags); int ret; btrfs_ctx_to_info(fs_info, ctx); @@ -1839,47 +1847,60 @@ static int btrfs_get_tree_super(struct fs_context *fc) * With 'true' passed to btrfs_scan_one_device() (mount time) we expect * either a valid device or an error. */ - device = btrfs_scan_one_device(fc->source, mode, true); + device = btrfs_scan_one_device(fc->source, true); ASSERT(device != NULL); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); return PTR_ERR(device); } - fs_devices = device->fs_devices; + /* + * We cannot hold uuid_mutex calling sget_fc(), it will lead to a + * locking order reversal with s_umount. + * + * So here we increase the holding number of fs_devices, this will ensure + * the fs_devices itself won't be freed. + */ + btrfs_fs_devices_inc_holding(fs_devices); fs_info->fs_devices = fs_devices; - - ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type); mutex_unlock(&uuid_mutex); - if (ret) - return ret; - - if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { - ret = -EACCES; - goto error; - } - bdev = fs_devices->latest_dev->bdev; - /* - * From now on the error handling is not straightforward. - * - * If successful, this will transfer the fs_info into the super block, - * and fc->s_fs_info will be NULL. However if there's an existing - * super, we'll still have fc->s_fs_info populated. If we error - * completely out it'll be cleaned up when we drop the fs_context, - * otherwise it's tied to the lifetime of the super_block. - */ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - goto error; + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + /* + * Since the fs_devices is not opened, it can be freed at any + * time after unlocking uuid_mutex. We need to avoid double + * free through put_fs_context()->btrfs_free_fs_info(). + * So here we reset fs_info->fs_devices to NULL, and let the + * regular fs_devices reclaim path to handle it. + * + * This applies to all later branches where no fs_devices is + * opened. + */ + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + return PTR_ERR(sb); } set_device_specific_options(fs_info); if (sb->s_root) { - btrfs_close_devices(fs_devices); + /* + * Not the first mount of the fs thus got an existing super block. + * Will reuse the returned super block, fs_info and fs_devices. + * + * fc->s_fs_info is not touched and will be later freed by + * put_fs_context() through btrfs_free_fs_context(). + */ + ASSERT(fc->s_fs_info == fs_info); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); /* * At this stage we may have RO flag mismatch between * fc->sb_flags and sb->s_flags. Caller should detect such @@ -1887,9 +1908,32 @@ static int btrfs_get_tree_super(struct fs_context *fc) * needed. */ } else { + struct block_device *bdev; + + /* + * The first mount of the fs thus a new superblock, fc->s_fs_info + * must be NULL, and the ownership of our fs_info and fs_devices is + * transferred to the super block. 
+ */ + ASSERT(fc->s_fs_info == NULL); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + ret = btrfs_open_devices(fs_devices, mode, sb); + if (ret < 0) + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + if (ret < 0) { + deactivate_locked_super(sb); + return ret; + } + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + deactivate_locked_super(sb); + return -EACCES; + } + bdev = fs_devices->latest_dev->bdev; snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); - btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; ret = btrfs_fill_super(sb, fs_devices); if (ret) { deactivate_locked_super(sb); @@ -1901,10 +1945,6 @@ static int btrfs_get_tree_super(struct fs_context *fc) fc->root = dget(sb->s_root); return 0; - -error: - btrfs_close_devices(fs_devices); - return ret; } /* @@ -1980,17 +2020,13 @@ error: * btrfs or not, setting the whole super block RO. To make per-subvolume mounting * work with different options work we need to keep backward compatibility. */ -static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt) +static int btrfs_reconfigure_for_mount(struct fs_context *fc) { int ret = 0; - if (fc->sb_flags & SB_RDONLY) - return ret; - - down_write(&mnt->mnt_sb->s_umount); - if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY)) + if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY)) ret = btrfs_reconfigure(fc); - up_write(&mnt->mnt_sb->s_umount); + return ret; } @@ -2035,25 +2071,18 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) */ dup_fc->s_fs_info = fs_info; - /* - * We'll do the security settings in our btrfs_get_tree_super() mount - * loop, they were duplicated into dup_fc, we can drop the originals - * here. - */ - security_free_mnt_opts(&fc->security); - fc->security = NULL; + ret = btrfs_get_tree_super(dup_fc); + if (ret) + goto error; - mnt = fc_mount(dup_fc); - if (IS_ERR(mnt)) { - put_fs_context(dup_fc); - return PTR_ERR(mnt); - } - ret = btrfs_reconfigure_for_mount(dup_fc, mnt); + ret = btrfs_reconfigure_for_mount(dup_fc); + up_write(&dup_fc->root->d_sb->s_umount); + if (ret) + goto error; + mnt = vfs_create_mount(dup_fc); put_fs_context(dup_fc); - if (ret) { - mntput(mnt); - return ret; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); /* * This free's ->subvol_name, because if it isn't set we have to @@ -2067,25 +2096,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->root = dentry; return 0; +error: + put_fs_context(dup_fc); + return ret; } static int btrfs_get_tree(struct fs_context *fc) { - /* - * Since we use mount_subtree to mount the default/specified subvol, we - * have to do mounts in two steps. - * - * First pass through we call btrfs_get_tree_subvol(), this is just a - * wrapper around fc_mount() to call back into here again, and this time - * we'll call btrfs_get_tree_super(). This will do the open_ctree() and - * everything to open the devices and file system. Then we return back - * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and - * from there we can do our mount_subvol() call, which will lookup - * whichever subvol we're mounting and setup this fc with the - * appropriate dentry for the subvol. 
- */ - if (fc->s_fs_info) - return btrfs_get_tree_super(fc); + ASSERT(fc->s_fs_info == NULL); + return btrfs_get_tree_subvol(fc); } @@ -2217,7 +2236,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, * Scanning outside of mount can return NULL which would turn * into 0 error code. */ - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + device = btrfs_scan_one_device(vol->name, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2235,7 +2254,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, * Scanning outside of mount can return NULL which would turn * into 0 error code. */ - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); if (IS_ERR(device)) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5d93d9dd2c12..9d398f7a36ad 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -160,8 +160,7 @@ static int can_modify_feature(struct btrfs_feature_attr *fa) clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; break; default: - pr_warn("btrfs: sysfs: unknown feature set %d\n", - fa->feature_set); + btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set); return 0; } @@ -1138,13 +1137,21 @@ static ssize_t btrfs_commit_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); + u64 now = ktime_get_ns(); + u64 start_time = fs_info->commit_stats.critical_section_start_time; + u64 pending = 0; + + if (start_time) + pending = now - start_time; return sysfs_emit(buf, "commits %llu\n" + "cur_commit_ms %llu\n" "last_commit_ms %llu\n" "max_commit_ms %llu\n" "total_commit_ms %llu\n", fs_info->commit_stats.commit_count, + div_u64(pending, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); @@ -1202,7 +1209,7 @@ static ssize_t quota_override_store(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long knob; - int err; + int ret; if (!fs_info) return -EPERM; @@ -1210,9 +1217,9 @@ static ssize_t quota_override_store(struct kobject *kobj, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = kstrtoul(buf, 10, &knob); - if (err) - return err; + ret = kstrtoul(buf, 10, &knob); + if (ret) + return ret; if (knob > 1) return -EINVAL; @@ -2239,7 +2246,7 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); } @@ -2282,15 +2289,15 @@ static struct kset *btrfs_kset; */ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { - int error; + int ret; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, - "%pU", fs_devs->fsid); - if (error) { + ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, + "%pU", fs_devs->fsid); + if (ret) { kobject_put(&fs_devs->fsid_kobj); - return error; + return ret; } fs_devs->devices_kobj = kobject_create_and_add("devices", @@ -2316,71 +2323,70 @@ int btrfs_sysfs_add_fsid(struct 
btrfs_fs_devices *fs_devs) int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { - int error; + int ret; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - error = btrfs_sysfs_add_fs_devices(fs_devs); - if (error) - return error; + ret = btrfs_sysfs_add_fs_devices(fs_devs); + if (ret) + return ret; - error = sysfs_create_files(fsid_kobj, btrfs_attrs); - if (error) { + ret = sysfs_create_files(fsid_kobj, btrfs_attrs); + if (ret) { btrfs_sysfs_remove_fs_devices(fs_devs); - return error; + return ret; } - error = sysfs_create_group(fsid_kobj, - &btrfs_feature_attr_group); - if (error) + ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret) goto failure; #ifdef CONFIG_BTRFS_DEBUG fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); if (!fs_info->debug_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); - if (error) + ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + if (ret) goto failure; #endif /* Discard directory */ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); if (!fs_info->discard_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); - if (error) + ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs); + if (ret) goto failure; - error = addrm_unknown_feature_attrs(fs_info, true); - if (error) + ret = addrm_unknown_feature_attrs(fs_info, true); + if (ret) goto failure; - error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); - if (error) + ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); + if (ret) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); - if (error) + ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (ret) goto failure; return 0; failure: btrfs_sysfs_remove_mounted(fs_info); - return error; + return ret; } static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 00da54f0164c..b19328d077d3 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -23,8 +23,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, { int ret; struct folio_batch fbatch; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; int i; int count = 0; int loops = 0; @@ -75,7 +75,8 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) dest[0] = 0; PRINT_ONE_FLAG(state, dest, cur, DIRTY); PRINT_ONE_FLAG(state, dest, cur, LOCKED); - PRINT_ONE_FLAG(state, dest, cur, NEW); + PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1); + PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2); PRINT_ONE_FLAG(state, dest, cur, DELALLOC); PRINT_ONE_FLAG(state, dest, cur, DEFRAG); PRINT_ONE_FLAG(state, dest, cur, BOUNDARY); @@ -113,7 +114,6 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; - unsigned long index = 0; /* In this test we need at least 2 file extents at its maximum size */ 
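As an illustrative aside to the extent-io test changes below, which switch the page-index variables from unsigned long to pgoff_t and scope the loop counter to the for statement: here is a minimal userspace sketch of the same byte-offset-to-page-index arithmetic. It assumes a 4 KiB page size and uses 128 MiB as a stand-in for BTRFS_MAX_EXTENT_SIZE; both values are assumptions for illustration only and not taken from the patch.

#include <stdio.h>

typedef unsigned long pgoff_t;          /* page cache index type, mirroring the kernel's */
#define PAGE_SHIFT 12                   /* assumed 4 KiB page size for illustration */
#define MAX_EXTENT_SIZE (128UL << 20)   /* stand-in for BTRFS_MAX_EXTENT_SIZE */

int main(void)
{
	unsigned long total_dirty = 2 * MAX_EXTENT_SIZE;
	unsigned long pages = 0;

	/* Loop counter declared in the for statement and typed as pgoff_t. */
	for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++)
		pages++;	/* the real test calls find_or_create_page() here */

	printf("%lu pages cover %lu bytes\n", pages, total_dirty);
	return 0;
}

Using pgoff_t makes the intent explicit (a page cache index rather than an arbitrary counter), and declaring the index inside the for statement keeps it from leaking into the rest of the test function.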
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 total_dirty = 2 * max_bytes; @@ -156,7 +156,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * everything to make sure our pages don't get evicted and screw up our * test. */ - for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { + for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_err("failed to allocate test page"); @@ -326,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) out_bits: if (ret) dump_extent_io_tree(tmp); - btrfs_clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); + btrfs_clear_extent_bit(tmp, 0, total_dirty - 1, (unsigned)-1, NULL); out: if (locked_page) put_page(locked_page); @@ -343,11 +343,11 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) unsigned long i; for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { - int bit, bit1; + bool bit_set, bit1_set; - bit = !!test_bit(i, bitmap); - bit1 = !!extent_buffer_test_bit(eb, 0, i); - if (bit1 != bit) { + bit_set = test_bit(i, bitmap); + bit1_set = extent_buffer_test_bit(eb, 0, i); + if (bit1_set != bit_set) { u8 has; u8 expect; @@ -360,9 +360,9 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) return -EINVAL; } - bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, - i % BITS_PER_BYTE); - if (bit1 != bit) { + bit1_set = extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1_set != bit_set) { u8 has; u8 expect; @@ -662,7 +662,7 @@ static int test_find_first_clear_extent_bit(void) out: if (ret) dump_extent_io_tree(&tree); - btrfs_clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_clear_extent_bit(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index b61972046feb..c8822edd32e2 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, unsigned int i; int ret; - info = search_free_space_info(trans, cache, path, 0); + info = btrfs_search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); ret = PTR_ERR(info); @@ -57,7 +57,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, goto invalid; offset = key.objectid; while (offset < key.objectid + key.offset) { - bit = free_space_test_bit(cache, path, offset); + bit = btrfs_free_space_test_bit(cache, path, offset); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { @@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, u32 flags; int ret; - info = search_free_space_info(trans, cache, path, 0); + info = btrfs_search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); btrfs_release_path(path); @@ -131,13 +131,13 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, /* Flip it to the other format and check that for good measure. 
*/ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - ret = convert_free_space_to_extents(trans, cache, path); + ret = btrfs_convert_free_space_to_extents(trans, cache, path); if (ret) { test_err("could not convert to extents"); return ret; } } else { - ret = convert_free_space_to_bitmaps(trans, cache, path); + ret = btrfs_convert_free_space_to_bitmaps(trans, cache, path); if (ret) { test_err("could not convert to bitmaps"); return ret; @@ -170,9 +170,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans, const struct free_space_extent extents[] = {}; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, - cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; @@ -193,8 +192,8 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, alignment); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -216,7 +215,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, cache->start + cache->length - alignment, alignment); if (ret) { @@ -240,9 +239,9 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start + alignment, + alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -263,23 +262,22 @@ static int test_merge_left(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -300,24 +298,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -338,29 +335,29 @@ static int 
test_merge_both(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -383,29 +380,29 @@ static int test_merge_none(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 4 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 4 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -483,14 +480,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, goto out; } - ret = add_block_group_free_space(&trans, cache); + ret = btrfs_add_block_group_free_space(&trans, cache); if (ret) { test_err("could not add block group free space"); goto out; } if (bitmaps) { - ret = convert_free_space_to_bitmaps(&trans, cache, path); + ret = btrfs_convert_free_space_to_bitmaps(&trans, cache, path); if (ret) { test_err("could not convert block group to bitmaps"); goto out; @@ -501,7 +498,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, if (ret) goto out; - ret = remove_block_group_free_space(&trans, cache); + ret = btrfs_remove_block_group_free_space(&trans, cache); if (ret) { test_err("could not remove block group free space"); goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a29d2c02c2c8..a4c2b7748b95 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -950,10 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ - ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE 
>> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1017,10 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ - ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1051,8 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* Empty */ - ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1066,8 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = 0; out: if (ret) - btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b96195d6480f..c5c0d9cf1a80 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1211,15 +1211,15 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { bool errors = false; - int err; + int ret; - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) errors = true; - if (errors && !err) - err = -EIO; - return err; + if (errors && !ret) + ret = -EIO; + return ret; } int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) @@ -1227,22 +1227,22 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) struct btrfs_fs_info *fs_info = log_root->fs_info; struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; bool errors = false; - int err; + int ret; ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); - if ((mark & EXTENT_DIRTY) && + ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); + if ((mark & EXTENT_DIRTY_LOG1) && test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) errors = true; - if ((mark & EXTENT_NEW) && + if ((mark & EXTENT_DIRTY_LOG2) && test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) errors = true; - if (errors && !err) - err = -EIO; - return err; + if (errors && !ret) + ret = -EIO; + return ret; } /* @@ -1735,8 +1735,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, ret); - goto fail; + if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + btrfs_abort_transaction(trans, ret); + goto fail; + } } /* @@ -2163,13 +2165,19 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) list_add(&trans->pending_snapshot->list, 
&cur_trans->pending_snapshots); } -static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +static void update_commit_stats(struct btrfs_fs_info *fs_info) { + ktime_t now = ktime_get_ns(); + ktime_t interval = now - fs_info->commit_stats.critical_section_start_time; + + ASSERT(fs_info->commit_stats.critical_section_start_time); + fs_info->commit_stats.commit_count++; fs_info->commit_stats.last_commit_dur = interval; fs_info->commit_stats.max_commit_dur = max_t(u64, fs_info->commit_stats.max_commit_dur, interval); fs_info->commit_stats.total_commit_dur += interval; + fs_info->commit_stats.critical_section_start_time = 0; } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) @@ -2178,8 +2186,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; - ktime_t start_time; - ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); @@ -2312,8 +2318,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit */ - start_time = ktime_get_ns(); - + fs_info->commit_stats.critical_section_start_time = ktime_get_ns(); extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); @@ -2545,6 +2550,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + update_commit_stats(fs_info); /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2581,8 +2587,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); - interval = ktime_get_ns() - start_time; - btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2590,8 +2594,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); - update_commit_stats(fs_info, interval); - return ret; unlock_reloc: diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 8f4703b488b7..0f556f4de3f9 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -191,7 +191,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, * Only subvolume trees along with their reloc trees need this check. * Things like log tree doesn't follow this ino requirement. */ - if (!is_fstree(btrfs_header_owner(leaf))) + if (!btrfs_is_fstree(btrfs_header_owner(leaf))) return true; if (key->objectid == prev_key->objectid) @@ -475,7 +475,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, * to be COWed to be relocated. 
*/ if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID && - !is_fstree(key->offset))) { + !btrfs_is_fstree(key->offset))) { generic_err(leaf, slot, "invalid reloc tree for root %lld, root id is not a subvolume tree", key->offset); @@ -493,7 +493,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, } /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ - if (unlikely(!is_fstree(key->objectid) && !is_root_item)) { + if (unlikely(!btrfs_is_fstree(key->objectid) && !is_root_item)) { dir_item_err(leaf, slot, "invalid location key objectid, have %llu expect [%llu, %llu]", key->objectid, BTRFS_FIRST_FREE_OBJECTID, @@ -1311,7 +1311,7 @@ static bool is_valid_dref_root(u64 rootid) * - tree root * For v1 space cache */ - return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || + return btrfs_is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || rootid == BTRFS_ROOT_TREE_OBJECTID; } @@ -2167,7 +2167,7 @@ ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) { - const bool is_subvol = is_fstree(root_owner); + const bool is_subvol = btrfs_is_fstree(root_owner); const u64 eb_owner = btrfs_header_owner(eb); /* @@ -2209,7 +2209,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) * For subvolume trees, owners can mismatch, but they should all belong * to subvolume trees. */ - if (unlikely(is_subvol != is_fstree(eb_owner))) { + if (unlikely(is_subvol != btrfs_is_fstree(eb_owner))) { btrfs_crit(eb->fs_info, "corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", btrfs_header_level(eb) == 0 ? "leaf" : "node", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 858b609e292c..9f05d454b9df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -112,7 +112,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all); + u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); /* @@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r unsigned int nofs_flag; struct btrfs_inode *inode; + /* Only meant to be called for subvolume roots and not for log roots. */ + ASSERT(btrfs_is_fstree(btrfs_root_id(root))); + /* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply @@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, return 0; } -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct btrfs_inode *inode; - - inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return NULL; - return inode; -} - /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. 
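As an illustrative aside to the tree-log.c changes that follow: they drop the read_one_inode() wrapper, which collapsed every lookup failure into NULL (then reported as -EIO or -ENOENT by the callers), and instead call btrfs_iget_logging() directly so the real error is propagated through the usual IS_ERR()/PTR_ERR() pattern. Below is a small self-contained sketch of that pattern; the ERR_PTR helpers are simplified stand-ins written to the kernel's documented convention, and iget_logging() is a hypothetical lookup used only for the example.

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

#define MAX_ERRNO 4095	/* same numeric convention as the kernel's err.h */

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline bool IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct inode { unsigned long long objectid; };

/* Hypothetical lookup: returns a valid pointer or an ERR_PTR()-encoded errno. */
static struct inode *iget_logging(unsigned long long objectid)
{
	static struct inode cached = { 256 };

	if (objectid != cached.objectid)
		return ERR_PTR(-ENOENT);	/* propagate the real error, not NULL */
	return &cached;
}

int main(void)
{
	struct inode *inode = iget_logging(42);

	if (IS_ERR(inode))
		printf("lookup failed: %ld\n", PTR_ERR(inode));	/* -ENOENT here, not a blanket -EIO */
	else
		printf("found inode %llu\n", inode->objectid);
	return 0;
}

Compared with returning NULL and substituting -EIO, this keeps the distinction between -ENOENT (an ancestor not present in the subvolume tree, which some replay paths in the patch explicitly tolerate) and genuine I/O errors.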
@@ -674,9 +662,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, return -EUCLEAN; } - inode = read_one_inode(root, key->objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); /* * first check to see if we already have this extent in the @@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -1052,6 +1041,126 @@ out: return ret; } +static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *log_root, + struct btrfs_key *search_key, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + u64 parent_objectid) +{ + struct extent_buffer *leaf = path->nodes[0]; + unsigned long ptr; + unsigned long ptr_end; + + /* + * Check all the names in this back reference to see if they are in the + * log. If so, we allow them to stay otherwise they must be unlinked as + * a conflict. + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + while (ptr < ptr_end) { + struct fscrypt_str victim_name; + struct btrfs_inode_ref *victim_ref; + int ret; + + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; + + ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + if (ret) { + kfree(victim_name.name); + if (ret < 0) + return ret; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; + continue; + } + + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name); + kfree(victim_name.name); + if (ret) + return ret; + return -EAGAIN; + } + + return 0; +} + +static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_root *log_root, + struct btrfs_key *search_key, + struct btrfs_inode *inode, + u64 inode_objectid, + u64 parent_objectid) +{ + struct extent_buffer *leaf = path->nodes[0]; + const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]); + const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + u32 cur_offset = 0; + + while (cur_offset < item_size) { + struct btrfs_inode_extref *extref; + struct btrfs_inode *victim_parent; + struct fscrypt_str victim_name; + int ret; + + extref = (struct btrfs_inode_extref *)(base + cur_offset); + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, + &victim_name); + if (ret) + return ret; + + search_key->objectid = inode_objectid; + search_key->type = BTRFS_INODE_EXTREF_KEY; + search_key->offset = btrfs_extref_hash(parent_objectid, + victim_name.name, + victim_name.len); + ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + if (ret) { + kfree(victim_name.name); + if (ret < 0) + return ret; +next: + cur_offset += victim_name.len + sizeof(*extref); + continue; + } + + victim_parent = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(victim_parent)) { + 
kfree(victim_name.name); + return PTR_ERR(victim_parent); + } + + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, victim_parent, inode, + &victim_name); + iput(&victim_parent->vfs_inode); + kfree(victim_name.name); + if (ret) + return ret; + return -EAGAIN; + } + + return 0; +} + static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1062,7 +1171,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, u64 ref_index, struct fscrypt_str *name) { int ret; - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; @@ -1073,121 +1181,37 @@ again: search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret == 0) { - struct btrfs_inode_ref *victim_ref; - unsigned long ptr; - unsigned long ptr_end; - - leaf = path->nodes[0]; - - /* are we trying to overwrite a back ref for the root directory - * if so, just jump out, we're done + if (ret < 0) { + return ret; + } else if (ret == 0) { + /* + * Are we trying to overwrite a back ref for the root directory? + * If so, we're done. */ if (search_key.objectid == search_key.offset) return 1; - /* check all the names in this back reference to see - * if they are in the log. if so, we allow them to stay - * otherwise they must be unlinked as a conflict - */ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); - while (ptr < ptr_end) { - struct fscrypt_str victim_name; - - victim_ref = (struct btrfs_inode_ref *)ptr; - ret = read_alloc_one_name(leaf, (victim_ref + 1), - btrfs_inode_ref_name_len(leaf, victim_ref), - &victim_name); - if (ret) - return ret; - - ret = backref_in_log(log_root, &search_key, - parent_objectid, &victim_name); - if (ret < 0) { - kfree(victim_name.name); - return ret; - } else if (!ret) { - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); - - ret = unlink_inode_for_log_replay(trans, dir, inode, - &victim_name); - kfree(victim_name.name); - if (ret) - return ret; - goto again; - } - kfree(victim_name.name); - - ptr = (unsigned long)(victim_ref + 1) + victim_name.len; - } + ret = unlink_refs_not_in_log(trans, path, log_root, &search_key, + dir, inode, parent_objectid); + if (ret == -EAGAIN) + goto again; + else if (ret) + return ret; } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, - inode_objectid, parent_objectid, 0, - 0); + extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - u32 item_size; - u32 cur_offset = 0; - unsigned long base; - struct btrfs_inode *victim_parent; - - leaf = path->nodes[0]; - - item_size = btrfs_item_size(leaf, path->slots[0]); - base = btrfs_item_ptr_offset(leaf, path->slots[0]); - - while (cur_offset < item_size) { - struct fscrypt_str victim_name; - - extref = (struct btrfs_inode_extref *)(base + cur_offset); - - if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) - goto next; - - ret = read_alloc_one_name(leaf, &extref->name, - btrfs_inode_extref_name_len(leaf, extref), - &victim_name); - if (ret) - return ret; - - search_key.objectid = inode_objectid; - search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(parent_objectid, - 
victim_name.name, - victim_name.len); - ret = backref_in_log(log_root, &search_key, - parent_objectid, &victim_name); - if (ret < 0) { - kfree(victim_name.name); - return ret; - } else if (!ret) { - ret = -ENOENT; - victim_parent = read_one_inode(root, - parent_objectid); - if (victim_parent) { - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); - - ret = unlink_inode_for_log_replay(trans, - victim_parent, - inode, &victim_name); - iput(&victim_parent->vfs_inode); - } - kfree(victim_name.name); - if (ret) - return ret; - goto again; - } - kfree(victim_name.name); -next: - cur_offset += victim_name.len + sizeof(*extref); - } + ret = unlink_extrefs_not_in_log(trans, path, root, log_root, + &search_key, inode, + inode_objectid, parent_objectid); + if (ret == -EAGAIN) + goto again; + else if (ret) + return ret; } btrfs_release_path(path); @@ -1315,9 +1339,9 @@ again: struct btrfs_inode *dir; btrfs_release_path(path); - dir = read_one_inode(root, parent_id); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_id, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); kfree(name.name); goto out; } @@ -1361,7 +1385,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, unsigned long ref_end; struct fscrypt_str name = { 0 }; int ret; - int log_ref_ver = 0; + const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY); u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; @@ -1370,11 +1394,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size(eb, slot); - if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (is_extref_item) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); - log_ref_ver = 1; r = (struct btrfs_inode_extref *)ref_ptr; parent_objectid = btrfs_inode_extref_parent(eb, r); } else { @@ -1389,37 +1412,61 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, parent_objectid); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + dir = NULL; goto out; } - inode = read_one_inode(root, inode_objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(inode_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } while (ref_ptr < ref_end) { - if (log_ref_ver) { + if (is_extref_item) { ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); + if (ret) + goto out; /* * parent object can change from one array * item to another. */ - if (!dir) - dir = read_one_inode(root, parent_objectid); if (!dir) { - ret = -ENOENT; - goto out; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; + /* + * A new parent dir may have not been + * logged and not exist in the subvolume + * tree, see the comment above before + * the loop when getting the first + * parent dir. + */ + if (ret == -ENOENT) { + /* + * The next extref may refer to + * another parent dir that + * exists, so continue. 
+ */ + ret = 0; + goto next; + } + goto out; + } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); + if (ret) + goto out; } - if (ret) - goto out; ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), ref_index, &name); @@ -1453,10 +1500,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* Else, ret == 1, we already have a perfect match, we're done. */ +next: ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; kfree(name.name); name.name = NULL; - if (log_ref_ver) { + if (is_extref_item && dir) { iput(&dir->vfs_inode); dir = NULL; } @@ -1633,8 +1681,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (inode->vfs_inode.i_nlink == 0) { if (S_ISDIR(inode->vfs_inode.i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - ino, 1); + ret = replay_dir_deletes(trans, root, NULL, path, ino, true); if (ret) goto out; } @@ -1682,9 +1729,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; btrfs_release_path(path); - inode = read_one_inode(root, key.offset); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.offset, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } @@ -1720,9 +1767,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *inode; struct inode *vfs_inode; - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; @@ -1761,14 +1808,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_inode *dir; int ret; - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; + inode = btrfs_iget_logging(location->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); - dir = read_one_inode(root, dirid); - if (!dir) { + dir = btrfs_iget_logging(dirid, root); + if (IS_ERR(dir)) { iput(&inode->vfs_inode); - return -EIO; + return PTR_ERR(dir); } ret = btrfs_add_link(trans, dir, inode, name, 1, index); @@ -1845,9 +1892,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool update_size = true; bool name_added = false; - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; + dir = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(dir)) + return PTR_ERR(dir); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) @@ -2147,9 +2194,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -2285,7 +2333,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all) + u64 dirid, bool del_all) { u64 range_start; u64 range_end; @@ -2301,14 +2349,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (!log_path) return -ENOMEM; - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log + dir = 
btrfs_iget_logging(dirid, root); + /* + * It isn't an error if the inode isn't there, that can happen because + * we replay the deletes before we copy in the inode item from the log. */ - if (!dir) { + if (IS_ERR(dir)) { btrfs_free_path(log_path); - return 0; + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + return ret; } range_start = 0; @@ -2444,8 +2495,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid, 0); + ret = replay_dir_deletes(wc->trans, root, log, path, + key.objectid, false); if (ret) break; } @@ -2467,9 +2518,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = read_one_inode(root, key.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -2520,9 +2571,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - if (ret && ret != -ENOENT) + if (ret) break; - ret = 0; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); @@ -2721,7 +2771,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, level = btrfs_header_level(log->node); orig_level = level; path->nodes[level] = log->node; - atomic_inc(&log->node->refs); + refcount_inc(&log->node->refs); path->slots[level] = 0; while (1) { @@ -2962,9 +3012,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (log_transid % 2 == 0) - mark = EXTENT_DIRTY; + mark = EXTENT_DIRTY_LOG1; else - mark = EXTENT_NEW; + mark = EXTENT_DIRTY_LOG2; /* we start IO on all the marked extents here, but we don't actually * wait for them until later. @@ -3095,7 +3145,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(fs_info, &log_root_tree->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); blk_finish_plug(&plug); /* * As described above, -EAGAIN indicates a hole in the extents. We @@ -3115,7 +3165,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_tree_log_extents(log, mark); if (!ret) ret = btrfs_wait_tree_log_extents(log_root_tree, - EXTENT_NEW | EXTENT_DIRTY); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); if (ret) { btrfs_set_log_full_commit(trans); mutex_unlock(&log_root_tree->log_mutex); @@ -3241,9 +3291,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans, */ btrfs_write_marked_extents(log->fs_info, &log->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); btrfs_wait_tree_log_extents(log, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); if (trans) btrfs_abort_transaction(trans, ret); @@ -3433,7 +3483,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * inode item because on log replay we update the field to reflect * all existing entries in the directory (see overwrite_item()). 
*/ - return btrfs_delete_one_dir_name(trans, log, path, di); + return btrfs_del_item(trans, log, path); } /* @@ -3473,26 +3523,27 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, return; } - ret = join_running_log_trans(root); - if (ret) - return; - - mutex_lock(&dir->log_mutex); - path = btrfs_alloc_path(); if (!path) { - ret = -ENOMEM; - goto out_unlock; + btrfs_set_log_full_commit(trans); + return; } + ret = join_running_log_trans(root); + ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); + if (WARN_ON(ret)) + goto out; + + mutex_lock(&dir->log_mutex); + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), name, index); - btrfs_free_path(path); -out_unlock: mutex_unlock(&dir->log_mutex); if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); +out: + btrfs_free_path(path); } /* see comments for btrfs_del_dir_entries_in_log */ @@ -3502,7 +3553,6 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; - u64 index; int ret; ret = inode_logged(trans, inode, NULL); @@ -3514,13 +3564,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, } ret = join_running_log_trans(root); - if (ret) + ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); + if (WARN_ON(ret)) return; log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), - dirid, &index); + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); @@ -3685,7 +3735,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) * Add extra ref to scratch eb so that it is not freed when callers * release the path, so we can reuse it later if needed. 
*/ - atomic_inc(&ctx->scratch_eb->refs); + refcount_inc(&ctx->scratch_eb->refs); return 0; } @@ -4173,44 +4223,37 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode, int log_inode_only, u64 logged_isize) { - struct btrfs_map_token token; u64 flags; - btrfs_init_map_token(&token, leaf); - if (log_inode_only) { /* set the generation to zero so the recover code * can tell the difference between an logging * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_token_inode_generation(&token, item, 0); - btrfs_set_token_inode_size(&token, item, logged_isize); + btrfs_set_inode_generation(leaf, item, 0); + btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_token_inode_generation(&token, item, - BTRFS_I(inode)->generation); - btrfs_set_token_inode_size(&token, item, inode->i_size); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_size(leaf, item, inode->i_size); } - btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); - btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); - btrfs_set_token_inode_mode(&token, item, inode->i_mode); - btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - btrfs_set_token_timespec_sec(&token, &item->atime, - inode_get_atime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->atime, - inode_get_atime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->mtime, - inode_get_mtime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode_get_mtime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4221,13 +4264,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * inode item in subvolume tree as needed (see overwrite_item()). 
*/ - btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); - btrfs_set_token_inode_transid(&token, item, trans->transid); - btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); - btrfs_set_token_inode_flags(&token, item, flags); - btrfs_set_token_inode_block_group(&token, item, 0); + btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_block_group(leaf, item, 0); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -7193,8 +7236,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_path *path; struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_root *log; struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { .process_func = process_one_buffer, @@ -7228,6 +7269,9 @@ again: key.offset = (u64)-1; while (1) { + struct btrfs_root *log; + struct btrfs_key found_key; + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { @@ -7256,6 +7300,12 @@ again: true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + wc.replay_dest = NULL; + if (ret != -ENOENT) { + btrfs_put_root(log); + btrfs_abort_transaction(trans, ret); + goto error; + } /* * We didn't find the subvol, likely because it was @@ -7268,36 +7318,36 @@ again: * block from being modified, and we'll just bail for * each subsequent pass. */ - if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(trans, log->node); - btrfs_put_root(log); - - if (!ret) - goto next; - btrfs_abort_transaction(trans, ret); - goto error; + ret = btrfs_pin_extent_for_log_replay(trans, log->node); + if (ret) { + btrfs_put_root(log); + btrfs_abort_transaction(trans, ret); + goto error; + } + goto next; } wc.replay_dest->log_root = log; ret = btrfs_record_root_in_trans(trans, wc.replay_dest); - if (ret) - /* The loop needs to continue due to the root refs */ + if (ret) { btrfs_abort_transaction(trans, ret); - else - ret = walk_log_tree(trans, log, &wc); + goto next; + } - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { - ret = fixup_inode_link_counts(trans, wc.replay_dest, - path); - if (ret) - btrfs_abort_transaction(trans, ret); + ret = walk_log_tree(trans, log, &wc); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto next; } - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + if (wc.stage == LOG_WALK_REPLAY_ALL) { struct btrfs_root *root = wc.replay_dest; - btrfs_release_path(path); - + ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto next; + } /* * We have just replayed everything, and the highest * objectid of fs roots probably has changed in case @@ -7307,17 +7357,20 @@ again: * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); - if (ret) + if (ret) { btrfs_abort_transaction(trans, ret); + goto next; + } + } +next: + if (wc.replay_dest) { + wc.replay_dest->log_root = NULL; + btrfs_put_root(wc.replay_dest); } - - wc.replay_dest->log_root = NULL; - btrfs_put_root(wc.replay_dest); btrfs_put_root(log); if (ret) goto error; -next: if (found_key.offset == 0) break; key.offset = found_key.offset - 1; @@ -7448,6 +7501,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * full log sync. 
* Also we don't need to worry with renames, since btrfs_rename() marks the log * for full commit when renaming a subvolume. + * + * Must be called before creating the subvolume entry in its parent directory. */ void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, struct btrfs_inode *dir) @@ -7484,6 +7539,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, bool log_pinned = false; int ret; + btrfs_init_log_ctx(&ctx, inode); + ctx.logging_new_name = true; + /* * this will force the logging code to walk the dentry chain * up for the file @@ -7515,6 +7573,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, ret = 0; /* + * Now that we know we need to update the log, allocate the scratch eb + * for the context before joining a log transaction below, as this can + * take time and therefore we could delay log commits from other tasks. + */ + btrfs_init_log_ctx_scratch_eb(&ctx); + + /* * If we are doing a rename (old_dir is not NULL) from a directory that * was previously logged, make sure that on log replay we get the old * dir entry deleted. This is needed because we will also log the new @@ -7532,6 +7597,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, &old_dentry->d_name, 0, &fname); if (ret) goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + fscrypt_free_filename(&fname); + goto out; + } + /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7545,19 +7618,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * mark the log for a full commit. */ if (WARN_ON_ONCE(ret < 0)) { + btrfs_free_path(path); fscrypt_free_filename(&fname); goto out; } log_pinned = true; - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - fscrypt_free_filename(&fname); - goto out; - } - /* * Other concurrent task might be logging the old directory, * as it can be triggered when logging other inode that had or @@ -7589,9 +7656,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, goto out; } - btrfs_init_log_ctx(&ctx, inode); - ctx.logging_new_name = true; - btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7600,7 +7664,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); - free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* @@ -7613,5 +7676,6 @@ out: btrfs_set_log_full_commit(trans); if (log_pinned) btrfs_end_log_trans(root); + free_extent_buffer(ctx.scratch_eb); } diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 1ac2678fc4ca..9e8cb3b7c064 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -27,18 +27,29 @@ struct tree_mod_elem { /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ u64 generation; - /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */ - struct btrfs_disk_key key; - u64 blockptr; - - /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ - struct { - int dst_slot; - int nr_items; - } move; - - /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. 
*/ - struct tree_mod_root old_root; + union { + /* + * This is used for the following op types: + * + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING + * BTRFS_MOD_LOG_KEY_REMOVE + * BTRFS_MOD_LOG_KEY_REPLACE + */ + struct { + struct btrfs_disk_key key; + u64 blockptr; + } slot_change; + + /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ + struct { + int dst_slot; + int nr_items; + } move; + + /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ + struct tree_mod_root old_root; + }; }; /* @@ -164,6 +175,30 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, return 0; } +static inline bool skip_eb_logging(const struct extent_buffer *eb) +{ + const u64 owner = btrfs_header_owner(eb); + + if (btrfs_header_level(eb) == 0) + return true; + + /* + * Tree mod logging exists so that there's a consistent view of the + * extents and backrefs of inodes even if while a task is iterating over + * them other tasks are modifying subvolume trees and the extent tree + * (including running delayed refs). So we only need to log extent + * buffers from the extent tree and subvolume trees. + */ + + if (owner == BTRFS_EXTENT_TREE_OBJECTID) + return false; + + if (btrfs_is_fstree(owner)) + return false; + + return true; +} + /* * Determines if logging can be omitted. Returns true if it can. Otherwise, it * returns false with the tree_mod_log_lock acquired. The caller must hold @@ -174,7 +209,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; - if (eb && btrfs_header_level(eb) == 0) + if (eb && skip_eb_logging(eb)) return true; write_lock(&fs_info->tree_mod_log_lock); @@ -192,7 +227,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; - if (eb && btrfs_header_level(eb) == 0) + if (eb && skip_eb_logging(eb)) return false; return true; @@ -204,15 +239,17 @@ static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, { struct tree_mod_elem *tm; + /* Can't be one of these types, due to union in struct tree_mod_elem. 
*/ + ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS); + ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; tm->logical = eb->start; - if (op != BTRFS_MOD_LOG_KEY_ADD) { - btrfs_node_key(eb, &tm->key, slot); - tm->blockptr = btrfs_node_blockptr(eb, slot); - } + btrfs_node_key(eb, &tm->slot_change.key, slot); + tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot); tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); @@ -830,8 +867,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, fallthrough; case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: case BTRFS_MOD_LOG_KEY_REMOVE: - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); n++; @@ -840,8 +877,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, break; case BTRFS_MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); break; diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index fc59b57257d6..7e16a253fb35 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -129,21 +129,25 @@ void ulist_free(struct ulist *ulist) kfree(ulist); } +static int ulist_node_val_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *val = key; + const struct ulist_node *unode = rb_entry(node, struct ulist_node, rb_node); + + if (unode->val < *val) + return 1; + else if (unode->val > *val) + return -1; + + return 0; +} + static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { - struct rb_node *n = ulist->root.rb_node; - struct ulist_node *u = NULL; - - while (n) { - u = rb_entry(n, struct ulist_node, rb_node); - if (u->val < val) - n = n->rb_right; - else if (u->val > val) - n = n->rb_left; - else - return u; - } - return NULL; + struct rb_node *node; + + node = rb_find(&val, &ulist->root, ulist_node_val_key_cmp); + return rb_entry_safe(node, struct ulist_node, rb_node); } static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) @@ -155,25 +159,20 @@ static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) ulist->nnodes--; } +static int ulist_node_val_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct ulist_node *unode = rb_entry(new, struct ulist_node, rb_node); + + return ulist_node_val_key_cmp(&unode->val, existing); +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { - struct rb_node **p = &ulist->root.rb_node; - struct rb_node *parent = NULL; - struct ulist_node *cur = NULL; - - while (*p) { - parent = *p; - cur = rb_entry(parent, struct ulist_node, rb_node); - - if (cur->val < ins->val) - p = &(*p)->rb_right; - else if (cur->val > ins->val) - p = &(*p)->rb_left; - else - return -EEXIST; - } - rb_link_node(&ins->rb_node, parent, p); - rb_insert_color(&ins->rb_node, &ulist->root); + struct rb_node *node; + + node = rb_find_add(&ins->rb_node, &ulist->root, ulist_node_val_cmp); + if (node) + return -EEXIST; return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f475b4b7c457..fa7a929a0461 100644 --- 
a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -18,7 +18,6 @@ #include "transaction.h" #include "volumes.h" #include "raid56.h" -#include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" @@ -214,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) u64 flags = bg_flags; u32 size_bp = size_buf; - if (!flags) { - strcpy(bp, "NONE"); + if (!flags) return; - } #define DESCRIBE_FLAG(flag, desc) \ do { \ @@ -403,7 +400,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); - rcu_string_free(device->name); + /* + * No need to call kfree_rcu() nor do RCU lock/unlock, nothing is + * reading the device name. + */ + kfree(rcu_dereference_raw(device->name)); btrfs_extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); @@ -414,6 +415,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device *device; WARN_ON(fs_devices->opened); + WARN_ON(fs_devices->holding); while (!list_empty(&fs_devices->devices)) { device = list_first_entry(&fs_devices->devices, struct btrfs_device, dev_list); @@ -473,7 +475,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, struct block_device *bdev; int ret; - *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops); if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); @@ -488,7 +490,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (holder) { ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); if (ret) { - fput(*bdev_file); + bdev_fput(*bdev_file); goto error; } } @@ -496,7 +498,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, *disk_super = btrfs_read_disk_super(bdev, 0, false); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - fput(*bdev_file); + bdev_fput(*bdev_file); goto error; } @@ -541,7 +543,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device continue; if (devt && devt != device->devt) continue; - if (fs_devices->opened) { + if (fs_devices->opened || fs_devices->holding) { if (devt) ret = -EBUSY; break; @@ -657,7 +659,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!device->name) return -EINVAL; - ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1, &bdev_file, &disk_super); if (ret) return ret; @@ -674,8 +676,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { - pr_err( - "BTRFS: Invalid seeding and uuid-changed device detected\n"); + btrfs_err(NULL, + "invalid seeding and uuid-changed device detected"); goto error_free_page; } @@ -701,7 +703,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (device->devt != device->bdev->bd_dev) { btrfs_warn(NULL, "device %s maj:min changed from %d:%d to %d:%d", - device->name->str, MAJOR(device->devt), + rcu_dereference_raw(device->name), MAJOR(device->devt), MINOR(device->devt), MAJOR(device->bdev->bd_dev), MINOR(device->bdev->bd_dev)); @@ -720,7 +722,7 @@ static int btrfs_open_one_device(struct 
btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - fput(bdev_file); + bdev_fput(bdev_file); return -EINVAL; } @@ -749,7 +751,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) goto out; rcu_read_lock(); - ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX); rcu_read_unlock(); if (ret < 0) goto out; @@ -782,11 +784,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = NULL; - struct rcu_string *name; + const char *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; - int error; + int ret; bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); @@ -798,11 +800,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EAGAIN); } - error = lookup_bdev(path, &path_devt); - if (error) { + ret = lookup_bdev(path, &path_devt); + if (ret) { btrfs_err(NULL, "failed to lookup block device for path %s: %d", - path, error); - return ERR_PTR(error); + path, ret); + return ERR_PTR(ret); } fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); @@ -819,7 +821,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; - pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU", path, MAJOR(path_devt), MINOR(path_devt), fs_devices->fsid); } @@ -890,6 +892,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, current->comm, task_pid_nr(current)); } else if (!device->name || !is_same_device(device, path)) { + const char *old_name; + /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that @@ -943,27 +947,31 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (device->bdev) { if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_warn_in_rcu(NULL, + btrfs_warn(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, current->comm, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - btrfs_info_in_rcu(NULL, + btrfs_info(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } - name = rcu_string_strdup(path, GFP_NOFS); + name = kstrdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-ENOMEM); } - rcu_string_free(device->name); + rcu_read_lock(); + old_name = rcu_dereference(device->name); + rcu_read_unlock(); rcu_assign_pointer(device->name, name); + kfree_rcu_mightsleep(old_name); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); @@ -1012,7 +1020,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * uuid mutex so nothing we touch in here is going to disappear. 
*/ if (orig_dev->name) - dev_path = orig_dev->name->str; + dev_path = rcu_dereference_raw(orig_dev->name); device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid, dev_path); @@ -1070,7 +1078,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; if (device->bdev_file) { - fput(device->bdev_file); + bdev_fput(device->bdev_file); device->bdev = NULL; device->bdev_file = NULL; fs_devices->open_devices--; @@ -1117,7 +1125,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - fput(device->bdev_file); + bdev_fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1197,7 +1205,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) { + if (!fs_devices->opened && !fs_devices->holding) { list_splice_init(&fs_devices->seed_list, &list); /* @@ -1414,7 +1422,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev && (device->bdev->bd_dev == devt) && - strcmp(device->name->str, path) != 0) { + strcmp(rcu_dereference_raw(device->name), path) != 0) { mutex_unlock(&fs_devices->device_list_mutex); /* Do not skip registration. */ @@ -1440,7 +1448,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, * the device or return an error. Multi-device and seeding devices are registered * in both cases. */ -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, +struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev) { struct btrfs_super_block *disk_super; @@ -1461,7 +1469,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); @@ -1473,7 +1481,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, devt = file_bdev(bdev_file)->bd_dev; if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { - pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)", path, MAJOR(devt), MINOR(devt)); btrfs_free_stale_devices(devt, NULL); @@ -1490,7 +1498,7 @@ free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - fput(bdev_file); + bdev_fput(bdev_file); return device; } @@ -2164,7 +2172,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device->name->str); + update_dev_time(rcu_dereference_raw(device->name)); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, @@ -2204,7 +2212,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } if (btrfs_pinned_by_swapfile(fs_info, device)) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", btrfs_dev_name(device), device->devid); return -ETXTBSY; @@ -2294,7 +2302,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. 
* * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and fput() on the block device will pull in the + * write lock, and bdev_fput() on the block device will pull in the * ->open_mutex on the block device and it's dependencies. Instead * just flush the device and let the caller do the final bdev_release. */ @@ -2473,7 +2481,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - fput(bdev_file); + bdev_fput(bdev_file); return 0; } @@ -2705,7 +2713,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return -EROFS; bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); + fs_info->sb, &fs_holder_ops); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); @@ -2921,7 +2929,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - fput(bdev_file); + bdev_fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -3404,7 +3412,8 @@ out: return ret; } -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, + bool verbose) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; @@ -3434,7 +3443,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); - ret = btrfs_relocate_block_group(fs_info, chunk_offset); + ret = btrfs_relocate_block_group(fs_info, chunk_offset, true); btrfs_scrub_continue(fs_info); if (ret) { /* @@ -3544,7 +3553,8 @@ again: btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(fs_info, found_key.offset); + ret = btrfs_relocate_chunk(fs_info, found_key.offset, + true); if (ret == -ENOSPC) failed++; else @@ -4209,7 +4219,7 @@ again: } } - ret = btrfs_relocate_chunk(fs_info, found_key.offset); + ret = btrfs_relocate_chunk(fs_info, found_key.offset, true); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { enospc_errors++; @@ -4977,7 +4987,7 @@ again: goto done; } - ret = btrfs_relocate_chunk(fs_info, chunk_offset); + ret = btrfs_relocate_chunk(fs_info, chunk_offset, true); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { failed++; @@ -5009,8 +5019,8 @@ again: mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ - btrfs_clear_extent_bits(&device->alloc_state, new_size, (u64)-1, - CHUNK_STATE_MASK); + btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK, NULL); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) @@ -5437,9 +5447,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - btrfs_clear_extent_bits(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT); + btrfs_clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -6923,9 +6933,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, generate_random_uuid(dev->uuid); if (path) { - struct rcu_string *name; + const char *name; - name = rcu_string_strdup(path, GFP_KERNEL); + name = 
kstrdup(path, GFP_KERNEL); if (!name) { btrfs_free_device(dev); return ERR_PTR(-ENOMEM); @@ -7174,7 +7184,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb); if (ret) { free_fs_devices(fs_devices); return ERR_PTR(ret); @@ -7706,7 +7716,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "error %d while searching for dev_stats item for device %s", ret, btrfs_dev_name(device)); goto out; @@ -7717,7 +7727,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "delete too small dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; @@ -7731,7 +7741,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "insert dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; @@ -7794,7 +7804,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) if (!dev->dev_stats_valid) return; - btrfs_err_rl_in_rcu(dev->fs_info, + btrfs_err_rl(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), @@ -7814,7 +7824,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - btrfs_info_in_rcu(dev->fs_info, + btrfs_info(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), @@ -7938,7 +7948,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* - * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * Very old mkfs.btrfs (before v4.15) will not respect the reserved * space. Although kernel can handle it without problem, better to warn * the users. */ @@ -8190,7 +8200,7 @@ static int relocating_repair_kthread(void *data) btrfs_info(fs_info, "zoned: relocating block group %llu to repair IO failure", target); - ret = btrfs_relocate_chunk(fs_info, target); + ret = btrfs_relocate_chunk(fs_info, target, true); out: if (cache) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 137cc232f58e..a56e873a3029 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,7 +21,6 @@ #include <uapi/linux/btrfs.h> #include <uapi/linux/btrfs_tree.h> #include "messages.h" -#include "rcu-string.h" #include "extent-io-tree.h" struct block_device; @@ -114,7 +113,8 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string __rcu *name; + /* Device path or NULL if missing. */ + const char __rcu *name; u64 generation; @@ -422,6 +422,16 @@ struct btrfs_fs_devices { /* Count fs-devices opened. */ int opened; + /* + * Counter of the processes that are holding this fs_devices but not + * yet opened. + * This is for mounting handling, as we can only open the fs_devices + * after a super block is created. 
But we cannot take uuid_mutex + * during sget_fc(), thus we have to hold the fs_devices (meaning it + * cannot be released) until a super block is returned. + */ + int holding; + /* Set when we find or add a device that doesn't have the nonrot flag set. */ bool rotating; /* Devices support TRIM/discard commands. */ @@ -667,7 +677,7 @@ enum btrfs_map_op { BTRFS_MAP_GET_READ_MIRRORS, }; -static inline enum btrfs_map_op btrfs_op(struct bio *bio) +static inline enum btrfs_map_op btrfs_op(const struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ -719,8 +729,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, - bool mount_arg_dev); +struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -754,7 +763,8 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, + bool verbose); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); @@ -846,7 +856,7 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device) if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) return "<missing disk>"; else - return rcu_str_deref(device->name); + return rcu_dereference(device->name); } static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol) @@ -854,6 +864,20 @@ static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocati WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol); } +static inline void btrfs_fs_devices_inc_holding(struct btrfs_fs_devices *fs_devices) +{ + lockdep_assert_held(&uuid_mutex); + ASSERT(fs_devices->holding >= 0); + fs_devices->holding++; +} + +static inline void btrfs_fs_devices_dec_holding(struct btrfs_fs_devices *fs_devices) +{ + lockdep_assert_held(&uuid_mutex); + ASSERT(fs_devices->holding > 0); + fs_devices->holding--; +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3e0edbcf73e1..79fb1614bd0c 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -510,14 +510,15 @@ static int btrfs_initxattrs(struct inode *inode, */ nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_KERNEL); + const size_t name_len = XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1; + + name = kmalloc(name_len, GFP_KERNEL); if (!name) { ret = -ENOMEM; break; } - strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + scnprintf(name, name_len, "%s%s", 
XATTR_SECURITY_PREFIX, xattr->name); if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 9430b34d3cbb..245e813ecd78 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -9,7 +9,6 @@ #include "ctree.h" #include "volumes.h" #include "zoned.h" -#include "rcu-string.h" #include "disk-io.h" #include "block-group.h" #include "dev-replace.h" @@ -17,6 +16,7 @@ #include "fs.h" #include "accessors.h" #include "bio.h" +#include "transaction.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -263,9 +263,9 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to read zone %llu on %s (devid %llu)", - pos, rcu_str_deref(device->name), + pos, rcu_dereference(device->name), device->devid); return ret; } @@ -395,16 +395,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) /* We reject devices with a zone size larger than 8GB */ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: zone size %llu larger than supported maximum %llu", - rcu_str_deref(device->name), + rcu_dereference(device->name), zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: zone size %llu smaller than supported minimum %u", - rcu_str_deref(device->name), + rcu_dereference(device->name), zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); ret = -EINVAL; goto out; @@ -418,9 +418,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", - rcu_str_deref(device->name), max_active_zones, + rcu_dereference(device->name), max_active_zones, BTRFS_MIN_ACTIVE_ZONES); ret = -EINVAL; goto out; @@ -460,9 +460,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_cache = vcalloc(zone_info->nr_zones, sizeof(struct blk_zone)); if (!zone_info->zone_cache) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to allocate zone cache for %s", - rcu_str_deref(device->name)); + rcu_dereference(device->name)); ret = -ENOMEM; goto out; } @@ -497,9 +497,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } if (nreported != zone_info->nr_zones) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", - rcu_str_deref(device->name), nreported, + rcu_dereference(device->name), nreported, zone_info->nr_zones); ret = -EIO; goto out; @@ -507,9 +507,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (max_active_zones) { if (nactive > max_active_zones) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", - nactive, rcu_str_deref(device->name), + nactive, rcu_dereference(device->name), max_active_zones); ret = -EIO; goto out; @@ -538,7 +538,7 @@ int 
btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; @@ -556,7 +556,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); if (ret != -ENOENT && ret) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; @@ -575,9 +575,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) emulated = "emulated "; } - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "%s block device %s, %u %szones of %llu bytes", - model, rcu_str_deref(device->name), zone_info->nr_zones, + model, rcu_dereference(device->name), zone_info->nr_zones, emulated, zone_info->zone_size); return 0; @@ -1182,10 +1182,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) continue; /* Free regions should be empty */ - btrfs_warn_in_rcu( + btrfs_warn( device->fs_info, "zoned: resetting device %s (devid %llu) zone %llu for allocation", - rcu_str_deref(device->name), device->devid, pos >> shift); + rcu_dereference(device->name), device->devid, pos >> shift); WARN_ON_ONCE(1); ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, @@ -1345,9 +1345,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, } if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", - zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), + zone.start << SECTOR_SHIFT, rcu_dereference(device->name), device->devid); up_read(&dev_replace->rwsem); return -EIO; @@ -1358,10 +1358,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", (info->physical >> device->zone_info->zone_size_shift), - rcu_str_deref(device->name), device->devid); + rcu_dereference(device->name), device->devid); info->alloc_offset = WP_MISSING_DEV; break; case BLK_ZONE_COND_EMPTY: @@ -2485,7 +2485,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, /* For the work */ btrfs_get_block_group(bg); - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); queue_work(system_unbound_wq, &bg->zone_finish_work); @@ -2501,6 +2501,66 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) spin_unlock(&fs_info->relocation_bg_lock); } +void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_space_info *space_info = data_sinfo->sub_group[0]; + struct btrfs_trans_handle *trans; + struct btrfs_block_group *bg; + struct list_head *bg_list; + u64 alloc_flags; + bool initial = false; + bool did_chunk_alloc = false; + int index; + int ret; + + if (!btrfs_is_zoned(fs_info)) + return; + + if (fs_info->data_reloc_bg) + return; + + if (sb_rdonly(fs_info->sb)) + return; + + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + alloc_flags = 
btrfs_get_alloc_profile(fs_info, space_info->flags); + index = btrfs_bg_flags_to_raid_index(alloc_flags); + + bg_list = &data_sinfo->block_groups[index]; +again: + list_for_each_entry(bg, bg_list, list) { + if (bg->used > 0) + continue; + + if (!initial) { + initial = true; + continue; + } + + fs_info->data_reloc_bg = bg->start; + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags); + btrfs_zone_activate(bg); + + return; + } + + if (did_chunk_alloc) + return; + + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return; + + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + if (ret == 1) { + did_chunk_alloc = true; + bg_list = &space_info->block_groups[index]; + goto again; + } +} + void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2523,8 +2583,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + u64 total = btrfs_super_total_bytes(fs_info->super_copy); u64 used = 0; - u64 total = 0; u64 factor; ASSERT(btrfs_is_zoned(fs_info)); @@ -2537,7 +2597,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - total += device->disk_total_bytes; used += device->bytes_used; } mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 9672bf4c3335..6e11533b8e14 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -88,6 +88,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, @@ -241,6 +242,8 @@ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } +static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { } + static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 4a796a049b5a..ff0292615e1f 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -200,8 +200,7 @@ void zstd_init_workspace_manager(void) ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { - pr_warn( - "BTRFS: cannot preallocate zstd compression workspace\n"); + btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); |
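
The ulist.c hunks above convert open-coded red-black tree walks to the generic rb_find()/rb_find_add() helpers. The following is a minimal stand-alone sketch of that pattern, not part of the patch; the demo_* names are illustrative, while rb_find(), rb_find_add() and rb_entry_safe() are the real helpers from <linux/rbtree.h> used by the conversion.

#include <linux/rbtree.h>
#include <linux/types.h>
#include <linux/errno.h>

struct demo_node {
	struct rb_node rb_node;
	u64 val;
};

/* Key comparator, same shape as ulist_node_val_key_cmp(): <0, 0 or >0. */
static int demo_key_cmp(const void *key, const struct rb_node *node)
{
	const u64 *val = key;
	const struct demo_node *dn = rb_entry(node, struct demo_node, rb_node);

	if (dn->val < *val)
		return 1;
	if (dn->val > *val)
		return -1;
	return 0;
}

/* Node comparator for insertion, built on top of the key comparator. */
static int demo_node_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct demo_node *dn = rb_entry(new, struct demo_node, rb_node);

	return demo_key_cmp(&dn->val, existing);
}

static struct demo_node *demo_search(struct rb_root *root, u64 val)
{
	struct rb_node *node = rb_find(&val, root, demo_key_cmp);

	/* rb_entry_safe() maps a NULL result to a NULL entry pointer. */
	return rb_entry_safe(node, struct demo_node, rb_node);
}

static int demo_insert(struct rb_root *root, struct demo_node *ins)
{
	/* rb_find_add() returns the colliding node, or NULL after linking @ins. */
	if (rb_find_add(&ins->rb_node, root, demo_node_cmp))
		return -EEXIST;
	return 0;
}

The two comparators deliberately share one key comparison, which is the same layering the ulist conversion uses (ulist_node_val_cmp() calling ulist_node_val_key_cmp()).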
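
The volumes.c hunks drop rcu-string.h and store the device path as a plain RCU-protected string (const char __rcu *name), read with rcu_dereference() and replaced via rcu_assign_pointer() plus kfree_rcu_mightsleep(). Below is a minimal sketch of that pattern under assumed names (demo_device, demo_print_name, demo_rename); it is not the btrfs code, and it assumes the caller serializes updates, as device_list_add() does with device_list_mutex.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

struct demo_device {
	const char __rcu *name;
};

/* Readers hold the RCU read lock around the dereference, as in is_same_device(). */
static void demo_print_name(struct demo_device *dev, char *buf, size_t len)
{
	rcu_read_lock();
	strscpy(buf, rcu_dereference(dev->name), len);
	rcu_read_unlock();
}

/* Writers publish a new copy and free the old one after a grace period. */
static int demo_rename(struct demo_device *dev, const char *path)
{
	const char *old_name;
	const char *name = kstrdup(path, GFP_NOFS);

	if (!name)
		return -ENOMEM;

	/* Safe without rcu_read_lock() only because updates are serialized. */
	old_name = rcu_dereference_raw(dev->name);
	rcu_assign_pointer(dev->name, name);
	kfree_rcu_mightsleep(old_name);
	return 0;
}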
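
The xattr.c hunk replaces two back-to-back strcpy() calls with a single bounded scnprintf() when building the "security." prefixed attribute name. A small sketch of that construction follows; build_xattr_name() and its parameters are illustrative, not helpers from the patch.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>

static char *build_xattr_name(const char *prefix, const char *suffix)
{
	const size_t name_len = strlen(prefix) + strlen(suffix) + 1;
	char *name = kmalloc(name_len, GFP_KERNEL);

	if (!name)
		return NULL;

	/* scnprintf() always NUL-terminates and never writes past name_len. */
	scnprintf(name, name_len, "%s%s", prefix, suffix);
	return name;
}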