diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-20 12:54:27 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-20 12:54:27 -0800 |
commit | 885ce48739189fac6645ff42d736ee0de0b5917d (patch) | |
tree | d69300909ff5b4a0291a7311cd25cf90801a650a /fs/btrfs/send.c | |
parent | 274978f173276c5720a3cd8d0b6047d2c0d3a684 (diff) | |
parent | 964a54e5e1a0d70cd80bd5a0885a1938463625b1 (diff) |
Merge tag 'for-6.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"The usual mix of performance improvements and new features.
The core change is reworking how checksums are processed, with
followup cleanups and simplifications. There are two minor changes in
block layer and iomap code.
Features:
- block group allocation class heuristics:
- pack files by size (up to 128k, up to 8M, more) to avoid
fragmentation in block groups, assuming that file size and life
time is correlated, in particular this may help during balance
- with tracepoints and extensible in the future
Performance:
- send: cache directory utimes and only emit the command when
necessary
- speedup up to 10x
- smaller final stream produced (no redundant utimes commands
issued)
- compatibility not affected
- fiemap: skip backref checks for shared leaves
- speedup 3x on sample filesystem with all leaves shared (e.g. on
snapshots)
- micro optimized b-tree key lookup, speedup in metadata operations
(sample benchmark: fs_mark +10% of files/sec)
Core changes:
- change where checksumming is done in the io path:
- checksum and read repair does verification at lower layer
- cascaded cleanups and simplifications
- raid56 refactoring and cleanups
Fixes:
- sysfs: make sure that a run-time change of a feature is correctly
tracked by the feature files
- scrub: better reporting of tree block errors
Other:
- locally enable -Wmaybe-uninitialized after fixing all warnings
- misc cleanups, spelling fixes
Other code:
- block: export bio_split_rw
- iomap: remove IOMAP_F_ZONE_APPEND"
* tag 'for-6.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (109 commits)
btrfs: make kobj_type structures constant
btrfs: remove the bdev argument to btrfs_rmap_block
btrfs: don't rely on unchanging ->bi_bdev for zone append remaps
btrfs: never return true for reads in btrfs_use_zone_append
btrfs: pass a btrfs_bio to btrfs_use_append
btrfs: set bbio->file_offset in alloc_new_bio
btrfs: use file_offset to limit bios size in calc_bio_boundaries
btrfs: do unsigned integer division in the extent buffer binary search loop
btrfs: eliminate extra call when doing binary search on extent buffer
btrfs: raid56: handle endio in scrub_rbio
btrfs: raid56: handle endio in recover_rbio
btrfs: raid56: handle endio in rmw_rbio
btrfs: raid56: submit the read bios from scrub_assemble_read_bios
btrfs: raid56: fold rmw_read_wait_recover into rmw_read_bios
btrfs: raid56: fold recover_assemble_read_bios into recover_rbio
btrfs: raid56: add a bio_list_put helper
btrfs: raid56: wait for I/O completion in submit_read_bios
btrfs: raid56: simplify code flow in rmw_rbio
btrfs: raid56: simplify error handling and code flow in raid56_parity_write
btrfs: replace btrfs_wait_tree_block_writeback by wait_on_extent_buffer_writeback
...
Diffstat (limited to 'fs/btrfs/send.c')
-rw-r--r-- | fs/btrfs/send.c | 684 |
1 files changed, 358 insertions, 326 deletions
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d50182b6deec..e5c963bb873d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -32,6 +32,7 @@ #include "file-item.h" #include "ioctl.h" #include "verity.h" +#include "lru_cache.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -80,23 +81,23 @@ struct clone_root { bool found_ref; }; -#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 -#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +#define SEND_MAX_NAME_CACHE_SIZE 256 /* - * Limit the root_ids array of struct backref_cache_entry to 12 elements. - * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * Limit the root_ids array of struct backref_cache_entry to 17 elements. + * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which + * can be satisfied from the kmalloc-192 slab, without wasting any space. * The most common case is to have a single root for cloning, which corresponds - * to the send root. Having the user specify more than 11 clone roots is not + * to the send root. Having the user specify more than 16 clone roots is not * common, and in such rare cases we simply don't use caching if the number of - * cloning roots that lead down to a leaf is more than 12. + * cloning roots that lead down to a leaf is more than 17. */ -#define SEND_MAX_BACKREF_CACHE_ROOTS 12 +#define SEND_MAX_BACKREF_CACHE_ROOTS 17 /* * Max number of entries in the cache. - * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding - * maple tree's internal nodes, is 16K. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding + * maple tree's internal nodes, is 24K. */ #define SEND_MAX_BACKREF_CACHE_SIZE 128 @@ -107,15 +108,31 @@ struct clone_root { * x86_64). */ struct backref_cache_entry { - /* List to link to the cache's lru list. */ - struct list_head list; - /* The key for this entry in the cache. */ - u64 key; + struct btrfs_lru_cache_entry entry; u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* Number of valid elements in the root_ids array. */ int num_roots; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct backref_cache_entry, entry) == 0); + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -174,9 +191,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; - struct list_head name_cache_list; - int name_cache_size; + struct btrfs_lru_cache name_cache; /* * The inode we are currently processing. It's not NULL only when we @@ -285,13 +300,11 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; - struct { - u64 last_reloc_trans; - struct list_head lru_list; - struct maple_tree entries; - /* Number of entries stored in the cache. */ - int size; - } backref_cache; + struct btrfs_lru_cache backref_cache; + u64 backref_cache_last_reloc_trans; + + struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; }; struct pending_dir_move { @@ -321,21 +334,15 @@ struct orphan_dir_info { u64 ino; u64 gen; u64 last_dir_index_offset; + u64 dir_high_seq_ino; }; struct name_cache_entry { - struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree. If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * The key in the entry is an inode number, and the generation matches + * the inode's generation. */ - struct list_head radix_list; - u64 ino; - u64 gen; + struct btrfs_lru_cache_entry entry; u64 parent_ino; u64 parent_gen; int ret; @@ -344,6 +351,9 @@ struct name_cache_entry { char name[]; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct name_cache_entry, entry) == 0); + #define ADVANCE 1 #define ADVANCE_ONLY_NEXT -1 @@ -956,14 +966,12 @@ out: static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } @@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, return 0; } -static void empty_backref_cache(struct send_ctx *sctx) -{ - struct backref_cache_entry *entry; - struct backref_cache_entry *tmp; - - list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) - kfree(entry); - - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mtree_destroy(&sctx->backref_cache.entries); - sctx->backref_cache.size = 0; -} - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { @@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (sctx->backref_cache.size == 0) + if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) return false; /* @@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { - empty_backref_cache(sctx); + if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { + btrfs_lru_cache_clear(&sctx->backref_cache); return false; } - entry = mtree_load(&sctx->backref_cache.entries, key); - if (!entry) + raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); + if (!raw_entry) return false; + entry = container_of(raw_entry, struct backref_cache_entry, entry); *root_ids_ret = entry->root_ids; *root_count_ret = entry->num_roots; - list_move_tail(&entry->list, &sctx->backref_cache.lru_list); return true; } @@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(root_ids, &uiter)) != NULL) { @@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. + * + * Also use GFP_NOFS because we're called while holding a transaction + * handle or while holding fs_info->commit_root_sem. */ - - if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { - struct backref_cache_entry *lru_entry; - struct backref_cache_entry *mt_entry; - - lru_entry = list_first_entry(&sctx->backref_cache.lru_list, - struct backref_cache_entry, list); - mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); - ASSERT(mt_entry == lru_entry); - list_del(&mt_entry->list); - kfree(mt_entry); - sctx->backref_cache.size--; - } - - ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, - new_entry, GFP_NOFS); + ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, + GFP_NOFS); ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */ @@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, return; } - list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); - /* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (sctx->backref_cache.size == 0) - sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; - - sctx->backref_cache.size++; + if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, @@ -1886,7 +1868,8 @@ enum inode_state { inode_state_did_delete, }; -static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; int left_ret; @@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; + if (send_gen) + *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); if (!sctx->parent_root) { right_ret = -ENOENT; @@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; + if (parent_gen) + *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); } if (!left_ret && !right_ret) { @@ -1953,14 +1940,15 @@ out: return ret; } -static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; if (ino == BTRFS_FIRST_FREE_OBJECTID) return 1; - ret = get_cur_inode_state(sctx, ino, gen); + ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) goto out; @@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen, u64 *who_mode) { - int ret = 0; - u64 gen; + int ret; + u64 parent_root_dir_gen; u64 other_inode = 0; struct btrfs_inode_info info; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) - goto out; + return 0; /* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. + * + * @parent_root_dir_gen was set to 0 if the inode does not exist in the + * parent root. */ - if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->parent_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && + parent_root_dir_gen != dir_gen) + return 0; ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; /* * Check if the overwritten ref was already processed. If yes, the ref @@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) - goto out; + return ret; - ret = 1; *who_ino = other_inode; *who_gen = info.gen; *who_mode = info.mode; - } else { - ret = 0; + return 1; } -out: - return ret; + return 0; } /* @@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, u64 ino, u64 ino_gen, const char *name, int name_len) { - int ret = 0; - u64 gen; + int ret; u64 ow_inode; + u64 ow_gen = 0; + u64 send_root_dir_gen; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); if (ret <= 0) - goto out; + return ret; - if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->send_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + /* + * @send_root_dir_gen was set to 0 if the inode does not exist in the + * send root. + */ + if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) + return 0; /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { + if (ret == -ENOENT) { /* was never and will never be overwritten */ - ret = 0; - goto out; + return 0; + } else if (ret < 0) { + return ret; } - ret = get_inode_gen(sctx->send_root, ow_inode, &gen); - if (ret < 0) - goto out; + if (ow_inode == ino) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; - if (ow_inode == ino && gen == ino_gen) { - ret = 0; - goto out; + /* It's the same inode, so no overwrite happened. */ + if (ow_gen == ino_gen) + return 0; } /* @@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, * inode 'ino' to be orphanized, therefore check if ow_inode matches * the current inode being processed. */ - if ((ow_inode < sctx->send_progress) || - (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && - gen == sctx->cur_inode_gen)) - ret = 1; - else - ret = 0; + if (ow_inode < sctx->send_progress) + return 1; -out: - return ret; + if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { + if (ow_gen == 0) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + } + if (ow_gen == sctx->cur_inode_gen) + return 1; + } + + return 0; } /* @@ -2285,113 +2264,16 @@ out: return ret; } -/* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, - * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::radix_list. - * In case of error, nce is kfreed. - */ -static int name_cache_insert(struct send_ctx *sctx, - struct name_cache_entry *nce) +static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) { - int ret = 0; - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); - if (!nce_head) { - kfree(nce); - return -ENOMEM; - } - INIT_LIST_HEAD(nce_head); - - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); - if (ret < 0) { - kfree(nce_head); - kfree(nce); - return ret; - } - } - list_add_tail(&nce->radix_list, nce_head); - list_add_tail(&nce->list, &sctx->name_cache_list); - sctx->name_cache_size++; - - return ret; -} + struct btrfs_lru_cache_entry *entry; -static void name_cache_delete(struct send_ctx *sctx, - struct name_cache_entry *nce) -{ - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - btrfs_err(sctx->send_root->fs_info, - "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", - nce->ino, sctx->name_cache_size); - } - - list_del(&nce->radix_list); - list_del(&nce->list); - sctx->name_cache_size--; - - /* - * We may not get to the final release of nce_head if the lookup fails - */ - if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); - kfree(nce_head); - } -} - -static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, - u64 ino, u64 gen) -{ - struct list_head *nce_head; - struct name_cache_entry *cur; - - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); - if (!nce_head) + entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); + if (!entry) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { - if (cur->ino == ino && cur->gen == gen) - return cur; - } - return NULL; -} - -/* - * Remove some entries from the beginning of name_cache_list. - */ -static void name_cache_clean_unused(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) - return; - - while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } -} - -static void name_cache_free(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - while (!list_empty(&sctx->name_cache_list)) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } + return container_of(entry, struct name_cache_entry, entry); } /* @@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct name_cache_entry *nce = NULL; + struct name_cache_entry *nce; /* * First check if we already did a call to this function with the same @@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { - name_cache_delete(sctx, nce); - kfree(nce); + btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); nce = NULL; } else { - /* - * Removes the entry from the list and adds it back to - * the end. This marks the entry as recently used so - * that name_cache_clean_unused does not remove it. - */ - list_move_tail(&nce->list, &sctx->name_cache_list); - *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * This should only happen for the parent dir that we determine in * record_new_ref_if_needed(). */ - ret = is_inode_existent(sctx, ino, gen); + ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) goto out; @@ -2497,8 +2371,8 @@ out_cache: goto out; } - nce->ino = ino; - nce->gen = gen; + nce->entry.key = ino; + nce->entry.gen = gen; nce->parent_ino = *parent_ino; nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); @@ -2510,10 +2384,11 @@ out_cache: else nce->need_later_update = 1; - nce_ret = name_cache_insert(sctx, nce); - if (nce_ret < 0) + nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); + if (nce_ret < 0) { + kfree(nce); ret = nce_ret; - name_cache_clean_unused(sctx); + } out: return ret; @@ -2884,6 +2759,63 @@ out: } /* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of prunning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + +/* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. @@ -2971,6 +2903,23 @@ out: return ret; } +static void cache_dir_created(struct send_ctx *sctx, u64 dir) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + /* Caching is optional, ignore any failures. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->key = dir; + entry->gen = 0; + ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); + if (ret < 0) + kfree(entry); +} + /* * We need some special handling for inodes that get processed before the parent * directory got created. See process_recorded_refs for details. @@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) struct btrfs_key di_key; struct btrfs_dir_item *di; + if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) + return 1; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; + cache_dir_created(sctx, dir); break; } } @@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) return 0; } - return send_create_inode(sctx, sctx->cur_ino); + ret = send_create_inode(sctx, sctx->cur_ino); + + if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) + cache_dir_created(sctx, sctx->cur_ino); + + return ret; } struct recorded_ref { @@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, odi->ino = dir_ino; odi->gen = dir_gen; odi->last_dir_index_offset = 0; + odi->dir_high_seq_ino = 0; rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); @@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) { int ret = 0; int iter_ret = 0; @@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key loc; struct btrfs_dir_item *di; struct orphan_dir_info *odi = NULL; + u64 dir_high_seq_ino = 0; + u64 last_dir_index_offset = 0; /* * Don't try to rmdir the top/root subvolume dir. @@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (dir == BTRFS_FIRST_FREE_OBJECTID) return 0; + odi = get_orphan_dir_info(sctx, dir, dir_gen); + if (odi && sctx->cur_ino < odi->dir_high_seq_ino) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; + if (!odi) { + /* + * Find the inode number associated with the last dir index + * entry. This is very likely the inode with the highest number + * of all inodes that have an entry in the directory. We can + * then use it to avoid future calls to can_rmdir(), when + * processing inodes with a lower number, from having to search + * the parent root b+tree for dir index keys. + */ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + /* Can't happen, the root is never empty. */ + ASSERT(path->slots[0] > 0); + if (WARN_ON(path->slots[0] == 0)) { + ret = -EUCLEAN; + goto out; + } + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { + /* No index keys, dir can be removed. */ + ret = 1; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = loc.objectid; + if (sctx->cur_ino < dir_high_seq_ino) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + } + key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = 0; - - odi = get_orphan_dir_info(sctx, dir, dir_gen); - if (odi) - key.offset = odi->last_dir_index_offset; + key.offset = (odi ? odi->last_dir_index_offset : 0); btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; @@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); + last_dir_index_offset = found_key.offset; + dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; dm->rmdir_gen = dir_gen; ret = 0; goto out; } - if (loc.objectid > send_progress) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; + if (loc.objectid > sctx->cur_ino) { ret = 0; goto out; } @@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, out: btrfs_free_path(path); - return ret; + + if (ret) + return ret; + + if (!odi) { + odi = add_orphan_dir_info(sctx, dir, dir_gen); + if (IS_ERR(odi)) + return PTR_ERR(odi); + + odi->gen = dir_gen; + } + + odi->last_dir_index_offset = last_dir_index_offset; + odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); + + return 0; } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) @@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } gen = odi->gen; - ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); + ret = can_rmdir(sctx, rmdir_ino, gen); if (ret < 0) goto out; if (!ret) @@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } finish: - ret = send_utimes(sctx, pm->ino, pm->gen); + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -3619,7 +3628,7 @@ finish: if (ret < 0) goto out; - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } @@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * "testdir_2". */ list_for_each_entry(cur, &sctx->new_refs, list) { - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) @@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * the source path when performing its rename * operation. */ - if (is_waiting_for_move(sctx, ow_inode)) { - wdm = get_waiting_dir_move(sctx, - ow_inode); - ASSERT(wdm); + wdm = get_waiting_dir_move(sctx, ow_inode); + if (wdm) wdm->orphanized = true; - } /* * Make sure we clear our orphanized inode's @@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * and get instead the orphan name. */ nce = name_cache_search(sctx, ow_inode, ow_gen); - if (nce) { - name_cache_delete(sctx, nce); - kfree(nce); - } + if (nce) + btrfs_lru_cache_remove(&sctx->name_cache, + &nce->entry); /* * ow_inode might currently be an ancestor of @@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. */ - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) { @@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; + cache_dir_created(sctx, cur->dir); } } @@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) { @@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { - /* TODO delayed utimes */ - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete && cur->dir != last_dir_ino_rm) { - ret = can_rmdir(sctx, cur->dir, cur->dir_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret) { @@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. */ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in @@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) * it's moved/renamed, therefore we don't need to do it here. */ sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with an higher number inside it. We want to issue the utimes + * command only after adding all dentries to it. + */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) goto out; } out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + return ret; } @@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); - INIT_LIST_HEAD(&sctx->name_cache_list); - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mt_init(&sctx->backref_cache.entries); + btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->dir_created_cache, + SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). + */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->flags = arg->flags; @@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->pending_dir_moves = RB_ROOT; - sctx->waiting_dir_moves = RB_ROOT; - sctx->orphan_dirs = RB_ROOT; - sctx->rbtree_new_refs = RB_ROOT; - sctx->rbtree_deleted_refs = RB_ROOT; - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, GFP_KERNEL); @@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (ret < 0) goto out; + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) @@ -8358,11 +8389,12 @@ out: kvfree(sctx->send_buf); kvfree(sctx->verity_descriptor); - name_cache_free(sctx); - close_current_inode(sctx); - empty_backref_cache(sctx); + btrfs_lru_cache_clear(&sctx->name_cache); + btrfs_lru_cache_clear(&sctx->backref_cache); + btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); kfree(sctx); } |