summaryrefslogtreecommitdiff
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2023-04-05 18:51:29 +0100
committerDavid Sterba <dsterba@suse.com>2023-04-17 19:52:19 +0200
commitfa4b8cb1738097586f49ec66cd3656cef47ac143 (patch)
tree167fb444fffebcfba80a15c789595a6c7d383b1e /fs/btrfs/tree-log.c
parent8eb3dd17eadd21a340cb658c7252d4ad5788baa4 (diff)
btrfs: avoid iterating over all indexes when logging directory
When logging a directory, after copying all directory index items from the subvolume tree to the log tree, we iterate over the subvolume tree to find all dir index items that are located in leaves COWed (or created) in the current transaction. If we keep logging a directory several times during the same transaction, we end up iterating over the same dir index items everytime we log the directory, wasting time and adding extra lock contention on the subvolume tree. So just keep track of the last logged dir index offset in order to start the search for that index (+1) the next time the directory is logged, as dir index values (key offsets) come from a monotonically increasing counter. The following test measures the difference before and after this change: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT # Time values in milliseconds. declare -a fsync_times # Total number of files added to the test directory. num_files=1000000 # Fsync directory after every N files are added. fsync_period=100 mkdir $MNT/testdir fsync_total_time=0 for ((i = 1; i <= $num_files; i++)); do echo -n > $MNT/testdir/file_$i if [ $((i % fsync_period)) -eq 0 ]; then start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) fsync_total_time=$((fsync_total_time + (end - start))) fsync_times[i]=$(( (end - start) / 1000000 )) echo -n -e "Progress $i / $num_files\r" fi done echo -e "\nHistogram of directory fsync duration in ms:\n" printf '%s\n' "${fsync_times[@]}" | \ perl -MStatistics::Histogram -e '@d = <>; print get_histogram(\@d);' fsync_total_time=$((fsync_total_time / 1000000)) echo -e "\nTotal time spent in fsync: $fsync_total_time ms\n" echo umount $MNT The test was run on a non-debug kernel (Debian's default kernel config) against a 15G null block device. Result before this change: Histogram of directory fsync duration in ms: Count: 10000 Range: 3.000 - 362.000; Mean: 34.556; Median: 31.000; Stddev: 25.751 Percentiles: 90th: 71.000; 95th: 77.000; 99th: 81.000 3.000 - 5.278: 1423 ################################# 5.278 - 8.854: 1173 ########################### 8.854 - 14.467: 591 ############## 14.467 - 23.277: 1025 ####################### 23.277 - 37.105: 1422 ################################# 37.105 - 58.809: 2036 ############################################### 58.809 - 92.876: 2316 ##################################################### 92.876 - 146.346: 6 | 146.346 - 230.271: 6 | 230.271 - 362.000: 2 | Total time spent in fsync: 350527 ms Result after this change: Histogram of directory fsync duration in ms: Count: 10000 Range: 3.000 - 1088.000; Mean: 8.704; Median: 8.000; Stddev: 12.576 Percentiles: 90th: 12.000; 95th: 14.000; 99th: 17.000 3.000 - 6.007: 3222 ################################# 6.007 - 11.276: 5197 ##################################################### 11.276 - 20.506: 1551 ################ 20.506 - 36.674: 24 | 36.674 - 201.552: 1 | 201.552 - 353.841: 4 | 353.841 - 1088.000: 1 | Total time spent in fsync: 92114 ms Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c32
1 files changed, 30 insertions, 2 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f6c3f14fbdb6..d746ebf841fd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3618,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
ret = BTRFS_LOG_FORCE_COMMIT;
else
inode->last_dir_index_offset = last_index;
+
+ if (btrfs_get_first_dir_index_to_log(inode) == 0)
+ btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
out:
kfree(ins_data);
@@ -5376,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
u64 ino = btrfs_ino(start_inode);
+ struct btrfs_inode *curr_inode = start_inode;
int ret = 0;
/*
@@ -5390,18 +5394,23 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ /* Pairs with btrfs_add_delayed_iput below. */
+ ihold(&curr_inode->vfs_inode);
+
while (true) {
+ struct inode *vfs_inode;
struct extent_buffer *leaf;
struct btrfs_key min_key;
+ u64 next_index;
bool continue_curr_inode = true;
int nritems;
int i;
min_key.objectid = ino;
min_key.type = BTRFS_DIR_INDEX_KEY;
- min_key.offset = 0;
+ min_key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
+ next_index = min_key.offset;
again:
- btrfs_release_path(path);
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
if (ret < 0) {
break;
@@ -5426,6 +5435,8 @@ again:
break;
}
+ next_index = min_key.offset + 1;
+
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
type = btrfs_dir_ftype(leaf, di);
if (btrfs_dir_transid(leaf, di) < trans->transid)
@@ -5466,12 +5477,16 @@ again:
break;
}
+ btrfs_release_path(path);
+
if (continue_curr_inode && min_key.offset < (u64)-1) {
min_key.offset++;
goto again;
}
next:
+ btrfs_set_first_dir_index_to_log(curr_inode, next_index);
+
if (list_empty(&dir_list))
break;
@@ -5479,9 +5494,22 @@ next:
ino = dir_elem->ino;
list_del(&dir_elem->list);
kfree(dir_elem);
+
+ btrfs_add_delayed_iput(curr_inode);
+ curr_inode = NULL;
+
+ vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+ if (IS_ERR(vfs_inode)) {
+ ret = PTR_ERR(vfs_inode);
+ break;
+ }
+ curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
+ if (curr_inode)
+ btrfs_add_delayed_iput(curr_inode);
+
if (ret) {
struct btrfs_dir_list *next;