Diffstat (limited to 'fs/btrfs/defrag.c')
-rw-r--r-- | fs/btrfs/defrag.c | 229
1 file changed, 102 insertions(+), 127 deletions(-)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5b0b64571418..968dae953948 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -6,7 +6,6 @@
 #include <linux/sched.h>
 #include "ctree.h"
 #include "disk-io.h"
-#include "print-tree.h"
 #include "transaction.h"
 #include "locking.h"
 #include "accessors.h"
@@ -46,8 +45,8 @@ struct inode_defrag {
 	u32 extent_thresh;
 };
 
-static int __compare_inode_defrag(struct inode_defrag *defrag1,
-				  struct inode_defrag *defrag2)
+static int compare_inode_defrag(const struct inode_defrag *defrag1,
+				const struct inode_defrag *defrag2)
 {
 	if (defrag1->root > defrag2->root)
 		return 1;
@@ -62,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
 }
 
 /*
- * Pop a record for an inode into the defrag tree. The lock must be held
+ * Insert a record for an inode into the defrag tree. The lock must be held
  * already.
  *
  * If you're inserting a record for an older transid than an existing record,
  * the transid already in the tree is lowered.
- *
- * If an existing record is found the defrag item you pass in is freed.
  */
-static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
-				    struct inode_defrag *defrag)
+static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
+				     struct inode_defrag *defrag)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct inode_defrag *entry;
@@ -84,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
 		parent = *p;
 		entry = rb_entry(parent, struct inode_defrag, rb_node);
 
-		ret = __compare_inode_defrag(defrag, entry);
+		ret = compare_inode_defrag(defrag, entry);
 		if (ret < 0)
 			p = &parent->rb_left;
 		else if (ret > 0)
@@ -108,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
 	return 0;
 }
 
-static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
 {
 	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
 		return 0;
@@ -120,35 +117,29 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
 }
 
 /*
- * Insert a defrag record for this inode if auto defrag is enabled.
+ * Insert a defrag record for this inode if auto defrag is enabled. No errors
+ * returned as they're not considered fatal.
  */
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
-			   struct btrfs_inode *inode, u32 extent_thresh)
+void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct inode_defrag *defrag;
-	u64 transid;
 	int ret;
 
-	if (!__need_auto_defrag(fs_info))
-		return 0;
+	if (!need_auto_defrag(fs_info))
+		return;
 
 	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
-		return 0;
-
-	if (trans)
-		transid = trans->transid;
-	else
-		transid = inode->root->last_trans;
+		return;
 
 	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 	if (!defrag)
-		return -ENOMEM;
+		return;
 
 	defrag->ino = btrfs_ino(inode);
-	defrag->transid = transid;
-	defrag->root = root->root_key.objectid;
+	defrag->transid = btrfs_get_root_last_trans(root);
+	defrag->root = btrfs_root_id(root);
 	defrag->extent_thresh = extent_thresh;
 
 	spin_lock(&fs_info->defrag_inodes_lock);
@@ -158,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 		 * and then re-read this inode, this new inode doesn't have
 		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
 		 */
-		ret = __btrfs_add_inode_defrag(inode, defrag);
+		ret = btrfs_insert_inode_defrag(inode, defrag);
 		if (ret)
 			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 	} else {
 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 	}
 	spin_unlock(&fs_info->defrag_inodes_lock);
-	return 0;
 }
 
 /*
@@ -190,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
 		parent = p;
 		entry = rb_entry(parent, struct inode_defrag, rb_node);
 
-		ret = __compare_inode_defrag(&tmp, entry);
+		ret = compare_inode_defrag(&tmp, entry);
 		if (ret < 0)
 			p = parent->rb_left;
 		else if (ret > 0)
@@ -199,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
 			goto out;
 	}
 
-	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
 		parent = rb_next(parent);
 		if (parent)
 			entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -215,27 +205,24 @@ out:
 
 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 {
-	struct inode_defrag *defrag;
-	struct rb_node *node;
+	struct inode_defrag *defrag, *next;
 
 	spin_lock(&fs_info->defrag_inodes_lock);
-	node = rb_first(&fs_info->defrag_inodes);
-	while (node) {
-		rb_erase(node, &fs_info->defrag_inodes);
-		defrag = rb_entry(node, struct inode_defrag, rb_node);
+
+	rbtree_postorder_for_each_entry_safe(defrag, next,
+					     &fs_info->defrag_inodes, rb_node)
 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 
-		cond_resched_lock(&fs_info->defrag_inodes_lock);
+	fs_info->defrag_inodes = RB_ROOT;
 
-		node = rb_first(&fs_info->defrag_inodes);
-	}
 	spin_unlock(&fs_info->defrag_inodes_lock);
 }
 
 #define BTRFS_DEFRAG_BATCH	1024
 
-static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
-				    struct inode_defrag *defrag)
+static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+				  struct inode_defrag *defrag,
+				  struct file_ra_state *ra)
 {
 	struct btrfs_root *inode_root;
 	struct inode *inode;
@@ -246,7 +233,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 again:
 	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
 		goto cleanup;
-	if (!__need_auto_defrag(fs_info))
+	if (!need_auto_defrag(fs_info))
 		goto cleanup;
 
 	/* Get the inode */
@@ -256,7 +243,7 @@ again:
 		goto cleanup;
 	}
 
-	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+	inode = btrfs_iget(defrag->ino, inode_root);
 	btrfs_put_root(inode_root);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
@@ -274,9 +261,10 @@ again:
 		range.len = (u64)-1;
 		range.start = cur;
 		range.extent_thresh = defrag->extent_thresh;
+		file_ra_state_init(ra, inode->i_mapping);
 
 		sb_start_write(fs_info->sb);
-		ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+		ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
 					BTRFS_DEFRAG_BATCH);
 		sb_end_write(fs_info->sb);
 		iput(inode);
@@ -303,11 +291,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 
 	atomic_inc(&fs_info->defrag_running);
 	while (1) {
+		struct file_ra_state ra = { 0 };
+
 		/* Pause the auto defragger. */
 		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
 			break;
 
-		if (!__need_auto_defrag(fs_info))
+		if (!need_auto_defrag(fs_info))
 			break;
 
 		/* find an inode to defrag */
@@ -325,7 +315,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 		first_ino = defrag->ino + 1;
 		root_objectid = defrag->root;
 
-		__btrfs_run_defrag_inode(fs_info, defrag);
+		btrfs_run_defrag_inode(fs_info, defrag, &ra);
 	}
 	atomic_dec(&fs_info->defrag_running);
 
@@ -521,7 +511,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	 * keep_locks set and lowest_level is 1, regardless of the value of
 	 * path->slots[1].
 	 */
-	BUG_ON(path->locks[1] == 0);
+	ASSERT(path->locks[1] != 0);
 	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
@@ -708,8 +698,10 @@ iterate:
		 */
		if (key.offset > start) {
			em->start = start;
-			em->orig_start = start;
-			em->block_start = EXTENT_MAP_HOLE;
+			em->disk_bytenr = EXTENT_MAP_HOLE;
+			em->disk_num_bytes = 0;
+			em->ram_bytes = 0;
+			em->offset = 0;
			em->len = key.offset - start;
			break;
		}
@@ -771,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
	 * We can get a merged extent, in that case, we need to re-search
	 * tree to get the original em for defrag.
	 *
-	 * If @newer_than is 0 or em::generation < newer_than, we can trust
-	 * this em, as either we don't care about the generation, or the
-	 * merged extent map will be rejected anyway.
+	 * This is because even if we have adjacent extents that are contiguous
+	 * and compatible (same type and flags), we still want to defrag them
+	 * so that we use less metadata (extent items in the extent tree and
+	 * file extent items in the inode's subvolume tree).
	 */
-	if (em && (em->flags & EXTENT_FLAG_MERGED) &&
-	    newer_than && em->generation >= newer_than) {
+	if (em && (em->flags & EXTENT_FLAG_MERGED)) {
		free_extent_map(em);
		em = NULL;
	}
@@ -810,7 +802,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *next;
	bool ret = false;
 
@@ -826,7 +818,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
-	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+	if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (next->flags & EXTENT_FLAG_PREALLOC)
		goto out;
@@ -861,20 +853,21 @@ out:
  * NOTE: Caller should also wait for page writeback after the cluster is
  * prepared, here we don't do writeback wait for each page.
  */
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
 {
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
-	struct page *page;
+	struct folio *folio;
	int ret;
 
 again:
-	page = find_or_create_page(mapping, index, mask);
-	if (!page)
-		return ERR_PTR(-ENOMEM);
+	folio = __filemap_get_folio(mapping, index,
+				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+	if (IS_ERR(folio))
+		return folio;
 
	/*
	 * Since we can defragment files opened read-only, we can encounter
@@ -884,16 +877,16 @@ again:
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
-	if (PageCompound(page)) {
-		unlock_page(page);
-		put_page(page);
+	if (folio_test_large(folio)) {
+		folio_unlock(folio);
+		folio_put(folio);
		return ERR_PTR(-ETXTBSY);
	}
 
-	ret = set_page_extent_mapped(page);
+	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
-		unlock_page(page);
-		put_page(page);
+		folio_unlock(folio);
+		folio_put(folio);
		return ERR_PTR(ret);
	}
 
@@ -908,17 +901,17 @@ again:
		if (!ordered)
			break;
 
-		unlock_page(page);
+		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
-		lock_page(page);
+		folio_lock(folio);
		/*
-		 * We unlocked the page above, so we need check if it was
+		 * We unlocked the folio above, so we need check if it was
		 * released or not.
		 */
-		if (page->mapping != mapping || !PagePrivate(page)) {
-			unlock_page(page);
-			put_page(page);
+		if (folio->mapping != mapping || !folio->private) {
+			folio_unlock(folio);
+			folio_put(folio);
			goto again;
		}
	}
@@ -927,21 +920,21 @@ again:
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
-	if (!PageUptodate(page)) {
-		btrfs_read_folio(NULL, page_folio(page));
-		lock_page(page);
-		if (page->mapping != mapping || !PagePrivate(page)) {
-			unlock_page(page);
-			put_page(page);
+	if (!folio_test_uptodate(folio)) {
+		btrfs_read_folio(NULL, folio);
+		folio_lock(folio);
+		if (folio->mapping != mapping || !folio->private) {
+			folio_unlock(folio);
+			folio_put(folio);
			goto again;
		}
-		if (!PageUptodate(page)) {
-			unlock_page(page);
-			put_page(page);
+		if (!folio_test_uptodate(folio)) {
+			folio_unlock(folio);
+			folio_put(folio);
			return ERR_PTR(-EIO);
		}
	}
-	return page;
+	return folio;
 }
 
 struct defrag_target_range {
@@ -992,12 +985,12 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
		 * This is for users who want to convert inline extents to
		 * regular ones through max_inline= mount option.
		 */
-		if (em->block_start == EXTENT_MAP_INLINE &&
+		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;
 
		/* Skip holes and preallocated extents. */
-		if (em->block_start == EXTENT_MAP_HOLE ||
+		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (em->flags & EXTENT_FLAG_PREALLOC))
			goto next;
 
@@ -1062,7 +1055,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
		 * So if an inline extent passed all above checks, just add it
		 * for defrag, and be converted to regular extents.
		 */
-		if (em->block_start == EXTENT_MAP_INLINE)
+		if (em->disk_bytenr == EXTENT_MAP_INLINE)
			goto add;
 
		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
@@ -1162,7 +1155,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
  */
 static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
-				    struct page **pages, int nr_pages,
+				    struct folio **folios, int nr_pages,
				    struct extent_state **cached_state)
 {
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1171,7 +1164,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
-	unsigned long first_index = page_index(pages[0]);
+	unsigned long first_index = folios[0]->index;
	int ret = 0;
	int i;
 
@@ -1188,8 +1181,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
 
	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
-		ClearPageChecked(pages[i]);
-		btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len);
+		folio_clear_checked(folios[i]);
+		btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);
@@ -1205,7 +1198,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
-	struct page **pages;
+	struct folio **folios;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
@@ -1216,21 +1209,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
 
-	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
-	if (!pages)
+	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
+	if (!folios)
		return -ENOMEM;
 
	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
-		pages[i] = defrag_prepare_one_page(inode, start_index + i);
-		if (IS_ERR(pages[i])) {
-			ret = PTR_ERR(pages[i]);
-			pages[i] = NULL;
-			goto free_pages;
+		folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+		if (IS_ERR(folios[i])) {
+			ret = PTR_ERR(folios[i]);
+			nr_pages = i;
+			goto free_folios;
		}
	}
	for (i = 0; i < nr_pages; i++)
-		wait_on_page_writeback(pages[i]);
+		folio_wait_writeback(folios[i]);
 
	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
@@ -1250,7 +1243,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
		goto unlock_extent;
 
	list_for_each_entry(entry, &target_list, list) {
-		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
@@ -1264,14 +1257,12 @@ unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
-free_pages:
+free_folios:
	for (i = 0; i < nr_pages; i++) {
-		if (pages[i]) {
-			unlock_page(pages[i]);
-			put_page(pages[i]);
-		}
+		folio_unlock(folios[i]);
+		folio_put(folios[i]);
	}
-	kfree(pages);
+	kfree(folios);
	return ret;
 }
 
@@ -1317,8 +1308,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
		if (entry->start + range_len <= *last_scanned_ret)
			continue;
 
-		if (ra)
-			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
@@ -1350,7 +1340,7 @@ out:
  * Entry point to file defragmentation.
  *
  * @inode:         inode to be defragged
- * @ra:            readahead state (can be NULL)
+ * @ra:            readahead state
  * @range:         defrag options including range and flags
  * @newer_than:    minimum transid to defrag
  * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
@@ -1366,18 +1356,19 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
-	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;
 
+	ASSERT(ra);
+
	if (isize == 0)
		return 0;
 
@@ -1407,18 +1398,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
 
	/*
-	 * If we were not given a ra, allocate a readahead context. As
-	 * readahead is just an optimization, defrag will work without it so
-	 * we don't error out.
-	 */
-	if (!ra) {
-		ra_allocated = true;
-		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
-		if (ra)
-			file_ra_state_init(ra, inode->i_mapping);
-	}
-
-	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
@@ -1472,8 +1451,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		cond_resched();
	}
 
-	if (ra_allocated)
-		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in next run.
@@ -1512,9 +1489,7 @@ void __cold btrfs_auto_defrag_exit(void)
 int __init btrfs_auto_defrag_init(void)
 {
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
-					sizeof(struct inode_defrag), 0,
-					SLAB_MEM_SPREAD,
-					NULL);
+					sizeof(struct inode_defrag), 0, 0, NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;