author    Linus Torvalds <torvalds@linux-foundation.org>  2024-01-10 09:27:40 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2024-01-10 09:27:40 -0800
commit    affc5af36bbb62073b6aaa4f4459b38937ff5331
tree      2267430b2cc94ce7c8e87ac362bcc28e89975db1  /fs/btrfs/inode.c
parent    12958e9c4c8e93ef694c10960c78453edf21526e
parent    e94dfb7a2935cb91faca88bf7136177d1ce0dda8
Merge tag 'for-6.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "There are no exciting changes for users, it's been mostly API
  conversions and some fixes or refactoring.

  The mount API conversion is a base for future improvements that would
  come with the VFS. Metadata processing has been converted to folios,
  not yet enabling large folios, but it's one patch away once everything
  gets tested enough.

  Core changes:

   - convert extent buffers to folios:
       - direct API conversion where possible
       - performance can drop by a few percent on metadata-heavy
         workloads; the folio sizes are not constant and the
         calculations add up in the item helpers
       - both regular and subpage modes
       - data cannot be converted yet, we need to port that to iomap
         and there are some other generic changes required

   - convert mount to the new API, should not be user visible:
       - options deprecated a long time ago have been removed:
         inode_cache, recovery
       - the new logic that splits mount into two phases slightly
         changes the timing of device scanning for multi-device
         filesystems
       - LSM options will now work (like for selinux)

   - convert the delayed nodes radix tree to an xarray, preserving the
     preload-like logic that still allows allocation with GFP_NOFS

   - more validation of the sysfs value of scrub_speed_max

   - refactor the chunk map structure, reduce size and improve
     performance

   - extent map refactoring, smaller data structures, improved
     performance

   - reduce the size of struct extent_io_tree, embedded in several
     structures

   - temporary pages used for compression are cached and attached to a
     shrinker, this may slightly improve performance

   - in zoned mode, remove redirty extent buffer tracking; zeros are
     written in case an out-of-order write is detected and the proper
     data is written to the actual write pointer

   - cleanups, refactoring, error message improvements, updated tests"

* tag 'for-6.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (89 commits)
  btrfs: pass btrfs_io_geometry into btrfs_max_io_len
  btrfs: pass struct btrfs_io_geometry to set_io_stripe
  btrfs: open code set_io_stripe for RAID56
  btrfs: change block mapping to switch/case in btrfs_map_block
  btrfs: factor out block mapping for single profiles
  btrfs: factor out block mapping for RAID5/6
  btrfs: reduce scope of data_stripes in btrfs_map_block
  btrfs: factor out block mapping for RAID10
  btrfs: factor out block mapping for DUP profiles
  btrfs: factor out RAID1 block mapping
  btrfs: factor out block-mapping for RAID0
  btrfs: re-introduce struct btrfs_io_geometry
  btrfs: factor out helper for single device IO check
  btrfs: migrate btrfs_repair_io_failure() to folio interfaces
  btrfs: migrate eb_bitmap_offset() to folio interfaces
  btrfs: migrate various end io functions to folios
  btrfs: migrate subpage code to folio interfaces
  btrfs: migrate get_eb_page_index() and get_eb_offset_in_page() to folios
  btrfs: don't double put our subpage reference in alloc_extent_buffer
  btrfs: cleanup metadata page pointer usage
  ...
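As context for the extent map hunks below (for example in create_io_em() and btrfs_encoded_read()), the refactoring replaces the atomic set_bit()/test_bit() accessors on em->flags with plain bitwise operations plus small compression helpers. A minimal sketch of the new call-site pattern; em_is_plain_prealloc() is a hypothetical helper, while the flag and accessor names appear in the diff:

static bool em_is_plain_prealloc(const struct extent_map *em)
{
	/* Plain bitwise test replaces test_bit(EXTENT_FLAG_PREALLOC, &em->flags). */
	if (!(em->flags & EXTENT_FLAG_PREALLOC))
		return false;
	/* Helper replaces test_bit(EXTENT_FLAG_COMPRESSED, &em->flags). */
	return !extent_map_is_compressed(em);
}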
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--  fs/btrfs/inode.c  153
1 file changed, 98 insertions(+), 55 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fea464b2a54e..809b11472a80 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,6 +114,15 @@ struct data_reloc_warn {
int mirror_num;
};
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we look up
+ * and update the disk_i_size, but lockdep will complain because with our
+ * io_tree we hold the tree lock and take the inode lock when setting
+ * delalloc. These two orderings are unrelated, so give the file_extent_tree
+ * its own class so we don't get the two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
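The file_extent_tree_class key added above is consumed by lockdep_set_class() in btrfs_alloc_inode() further down in this diff. As a general illustration of the pattern (example_tree_class and example_tree_lock_init() are hypothetical, not btrfs code):

static struct lock_class_key example_tree_class;

static void example_tree_lock_init(spinlock_t *lock)
{
	spin_lock_init(lock);
	/* Put this lock in its own class so lockdep tracks its ordering
	 * separately from other locks of the same underlying type. */
	lockdep_set_class(lock, &example_tree_class);
}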
@@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
- offset, bytes);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+ page_folio(page), offset, bytes);
put_page(page);
}
@@ -1037,7 +1046,7 @@ free_pages:
if (pages) {
for (i = 0; i < nr_pages; i++) {
WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ btrfs_free_compr_page(pages[i]);
}
kfree(pages);
}
@@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
- put_page(async_extent->pages[i]);
+ btrfs_free_compr_page(async_extent->pages[i]);
}
kfree(async_extent->pages);
async_extent->nr_pages = 0;
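btrfs_free_compr_page() replaces put_page() in these two hunks because, per the merge message, temporary compression pages are now cached and attached to a shrinker. A rough sketch of the idea; the list-based cache and all names here are illustrative assumptions, not the actual implementation:

static LIST_HEAD(example_compr_cache);
static DEFINE_SPINLOCK(example_compr_lock);

static void example_free_compr_page(struct page *page)
{
	/* Park the page for reuse instead of dropping the reference;
	 * a shrinker (not shown) would drain the list under pressure. */
	spin_lock(&example_compr_lock);
	list_add(&page->lru, &example_compr_cache);
	spin_unlock(&example_compr_lock);
}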
@@ -2793,7 +2802,7 @@ out_page:
PAGE_SIZE, !ret);
clear_page_dirty_for_io(page);
}
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
@@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
+ unpin_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -3796,7 +3805,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
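On the delayed_nodes xarray mentioned in the comment above: the merge message notes that the conversion preserves a preload-like pattern so insertion can still allocate with GFP_NOFS. A minimal sketch of that pattern with a hypothetical xarray (example_delayed_nodes and example_insert_node() are not the btrfs code):

static DEFINE_XARRAY(example_delayed_nodes);

static int example_insert_node(unsigned long ino, void *node)
{
	/* Reserve the slot up front so the allocation happens here,
	 * with GFP_NOFS, and the store below cannot fail under locks. */
	int ret = xa_reserve(&example_delayed_nodes, ino, GFP_NOFS);

	if (ret)
		return ret;
	xa_store(&example_delayed_nodes, ino, node, GFP_NOFS);
	return 0;
}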
@@ -4725,7 +4734,7 @@ again:
/*
* We unlock the page after the io is completed and then re-lock it
* above. release_folio() could have come in between that and cleared
- * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
ret = set_page_extent_mapped(page);
@@ -4767,9 +4776,10 @@ again:
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
}
- btrfs_page_clear_checked(fs_info, page, block_start,
- block_end + 1 - block_start);
- btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4889,7 +4899,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
err = maybe_insert_hole(inode, cur_offset, hole_size);
@@ -4917,7 +4927,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = btrfs_get_fs_generation(fs_info);
err = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -6217,6 +6226,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * We don't have any capability xattrs set here yet, shortcut any
+ * queries for the xattrs here. If we add them later via the inode
+ * security init path or any other path this flag will be cleared.
+ */
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ /*
* Subvolumes don't inherit flags from their parent directory.
* Originally this was probably by accident, but we probably can't
* change it now without compatibility issues.
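The BTRFS_INODE_NO_CAP_XATTR runtime flag set in the hunk above presumably lets capability xattr queries bail out early; a hypothetical sketch of such a fast path (example_get_cap_xattr() is illustrative, not the actual lookup code):

static int example_get_cap_xattr(struct inode *inode)
{
	/* Fast path: the flag guarantees no capability xattr exists. */
	if (test_bit(BTRFS_INODE_NO_CAP_XATTR,
		     &BTRFS_I(inode)->runtime_flags))
		return -ENODATA;
	return 0;	/* otherwise fall through to the real lookup (not shown) */
}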
@@ -7258,13 +7274,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
- if (type == BTRFS_ORDERED_PREALLOC) {
- set_bit(EXTENT_FLAG_FILLING, &em->flags);
- } else if (type == BTRFS_ORDERED_COMPRESSED) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->flags |= EXTENT_FLAG_PINNED;
+ if (type == BTRFS_ORDERED_PREALLOC)
+ em->flags |= EXTENT_FLAG_FILLING;
+ else if (type == BTRFS_ORDERED_COMPRESSED)
+ extent_map_set_compression(em, compress_type);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7304,10 +7318,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* just use the extent.
*
*/
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
@@ -7542,7 +7556,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ if (extent_map_is_compressed(em) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
/*
@@ -7638,7 +7652,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* that, since we have locked only the parts we are performing I/O in.
*/
if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
@@ -7851,13 +7865,14 @@ static void btrfs_readahead(struct readahead_control *rac)
static void wait_subpage_spinlock(struct page *page)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7995,7 +8010,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8004,7 +8019,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*/
goto next;
}
- btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+ btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8074,7 +8089,7 @@ next:
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
- btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
@@ -8098,6 +8113,7 @@ next:
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -8114,6 +8130,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
u64 page_end;
u64 end;
+ ASSERT(folio_order(folio) == 0);
+
reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
@@ -8217,9 +8235,9 @@ again:
if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
- btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8462,10 +8480,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
+ struct extent_io_tree *file_extent_tree = NULL;
+
+ /* Self tests may pass a NULL fs_info. */
+ if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!file_extent_tree)
+ return NULL;
+ }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei)
+ if (!ei) {
+ kfree(file_extent_tree);
return NULL;
+ }
ei->root = NULL;
ei->generation = 0;
@@ -8501,10 +8529,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
+
+ /* This io tree sets the valid inode. */
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
+
+ ei->file_extent_tree = file_extent_tree;
+ if (file_extent_tree) {
+ extent_io_tree_init(fs_info, ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+ }
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
@@ -8521,12 +8557,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -9632,7 +9670,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9785,7 +9823,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- btrfs_page_set_writeback(fs_info, page, start, len);
+ /* This is for data, which doesn't yet support larger folios. */
+ ASSERT(folio_order(page_folio(page)) == 0);
+ btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
put_page(page);
index++;
}
@@ -9994,7 +10034,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages);
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10113,12 +10153,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ (em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ } else if (extent_map_is_compressed(em)) {
disk_bytenr = em->block_start;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -10133,7 +10173,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- em->compress_type);
+ extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
@@ -10564,6 +10604,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
+ struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
@@ -10680,7 +10721,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ if (extent_map_is_compressed(em)) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
@@ -10703,13 +10744,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
goto out;
}
- if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
@@ -10717,23 +10758,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
if (device == NULL) {
- device = em->map_lookup->stripes[0].dev;
+ device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
- } else if (device != em->map_lookup->stripes[0].dev) {
+ } else if (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
- physical_block_start = (em->map_lookup->stripes[0].physical +
- (logical_block_start - em->start));
- len = min(len, em->len - (logical_block_start - em->start));
- free_extent_map(em);
- em = NULL;
+ physical_block_start = (map->stripes[0].physical +
+ (logical_block_start - map->start));
+ len = min(len, map->chunk_len - (logical_block_start - map->start));
+ btrfs_free_chunk_map(map);
+ map = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
@@ -10786,6 +10827,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
+ if (!IS_ERR_OR_NULL(map))
+ btrfs_free_chunk_map(map);
unlock_extent(io_tree, 0, isize - 1, &cached_state);
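For reference, the chunk map hunks above reduce to this lookup/free pairing, with map->type and map->stripes[] replacing the old em->map_lookup indirection (example_check_profile() is a distilled sketch, not the kernel function):

static int example_check_profile(struct btrfs_fs_info *fs_info,
				 u64 logical, u64 len)
{
	struct btrfs_chunk_map *map;
	int ret = 0;

	map = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(map))
		return PTR_ERR(map);
	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
		ret = -EINVAL;	/* swapfile needs a single data profile */
	btrfs_free_chunk_map(map);
	return ret;
}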