From 5958253cf65de42493f17f36877a901486a90365 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Jun 2020 09:04:44 +0800 Subject: btrfs: qgroup: catch reserved space leaks at unmount time Before this patch, qgroup completely relies on per-inode extent io tree to detect reserved data space leak. However previous bug has already shown how release page before btrfs_finish_ordered_io() could lead to leak, and since it's QGROUP_RESERVED bit cleared without triggering qgroup rsv, it can't be detected by per-inode extent io tree. So this patch adds another (and hopefully the final) safety net to catch qgroup data reserved space leak. At least the new safety net catches all the leaks during development, so it should be pretty useful in the real world. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5bd4089ad0e1..74eb98479109 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -505,6 +505,49 @@ out: return ret < 0 ? ret : 0; } +static u64 btrfs_qgroup_subvolid(u64 qgroupid) +{ + return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); +} + +/* + * Called in close_ctree() when quota is still enabled. This verifies we don't + * leak some reserved space. + * + * Return false if no reserved space is left. + * Return true if some reserved space is leaked. + */ +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) +{ + struct rb_node *node; + bool ret = false; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return ret; + /* + * Since we're unmounting, there is no race and no need to grab qgroup + * lock. And here we don't go post-order to provide a more user + * friendly sorted result. + */ + for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { + struct btrfs_qgroup *qgroup; + int i; + + qgroup = rb_entry(node, struct btrfs_qgroup, node); + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { + if (qgroup->rsv.values[i]) { + ret = true; + btrfs_warn(fs_info, + "qgroup %llu/%llu has unreleased space, type %d rsv %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + i, qgroup->rsv.values[i]); + } + } + } + return ret; +} + /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), * first two are in single-threaded paths.And for the third one, we have set -- cgit From df2cfd131fd33dbef1ce33be8b332b1f3d645f35 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:09 +0300 Subject: btrfs: make qgroup_free_reserved_data take btrfs_inode It only uses btrfs_inode so can just as easily take it as an argument. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 74eb98479109..d0031afe993a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3491,10 +3491,10 @@ cleanup: } /* Free ranges specified by @reserved, normally in error path */ -static int qgroup_free_reserved_data(struct inode *inode, +static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; @@ -3530,8 +3530,8 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, - free_start, free_start + free_len - 1, + ret = clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -3560,7 +3560,8 @@ static int __btrfs_qgroup_release_data(struct inode *inode, /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(BTRFS_I(inode), reserved, + start, len); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); -- cgit From 8769af96cf6355ae96c808265a05171226b4b6e7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:10 +0300 Subject: btrfs: make __btrfs_qgroup_release_data take btrfs_inode It uses vfs_inode only for a tracepoint so convert its interface to take btrfs_inode directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d0031afe993a..de2812ee73c1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3545,7 +3545,7 @@ out: return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, +static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, int free) { @@ -3553,28 +3553,26 @@ static int __btrfs_qgroup_release_data(struct inode *inode, int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, - &BTRFS_I(inode)->root->fs_info->flags)) + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) return 0; /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(BTRFS_I(inode), reserved, - start, len); + return qgroup_free_reserved_data(inode, reserved, start, len); extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, + EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; if (free) trace_op = QGROUP_FREE; - trace_btrfs_qgroup_release_data(inode, start, len, + trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, changeset.bytes_changed, trace_op); if (free) - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->root_key.objectid, + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: @@ -3597,7 +3595,7 @@ out: int btrfs_qgroup_free_data(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); + return __btrfs_qgroup_release_data(BTRFS_I(inode), reserved, start, len, 1); } /* @@ -3617,7 +3615,7 @@ int btrfs_qgroup_free_data(struct inode *inode, */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); + return __btrfs_qgroup_release_data(BTRFS_I(inode), NULL, start, len, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, -- cgit From 8b8a979f1fc69054f99abab80daeae89aba3f19b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:11 +0300 Subject: btrfs: make btrfs_qgroup_free_data take btrfs_inode It passes btrfs_inode to its callee so change the interface. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index de2812ee73c1..7795c7616fe8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3592,10 +3592,10 @@ out: * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, +int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(BTRFS_I(inode), reserved, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* -- cgit From 72b7d15bf1e1ee7b71da81e6c5d9afcd2e86c426 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:18 +0300 Subject: btrfs: make btrfs_qgroup_release_data take btrfs_inode It just forwards its argument to __btrfs_qgroup_release_data. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7795c7616fe8..01a2b03648be 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3613,9 +3613,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(BTRFS_I(inode), NULL, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, -- cgit From 7661a3e033ab782366e0e1f4b6aad0df3555fcbd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:37 +0300 Subject: btrfs: make btrfs_qgroup_reserve_data take btrfs_inode There's only a single use of vfs_inode in a tracepoint so let's take btrfs_inode directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 01a2b03648be..657c09be159a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3435,11 +3435,11 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * same @reserved, caller must ensure when error happens it's OK * to free *ALL* reserved space. */ -int btrfs_qgroup_reserve_data(struct inode *inode, +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset *reserved; @@ -3462,12 +3462,12 @@ int btrfs_qgroup_reserve_data(struct inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + ret = set_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; - trace_btrfs_qgroup_reserve_data(inode, start, len, + trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; @@ -3481,7 +3481,7 @@ cleanup: /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) - clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + clear_extent_bit(&inode->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); /* Also free data bytes of already reserved one */ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, -- cgit From cfdd45921571eb24073e0737fa0bd44b4218f914 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:46 +0300 Subject: btrfs: make btrfs_qgroup_check_reserved_leak take btrfs_inode vfs_inode is used only for the inode number everything else requires btrfs_inode. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ use btrfs_ino ] Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 657c09be159a..787128d7e196 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3784,7 +3784,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) * Check qgroup reserved space leaking, normally at destroy inode * time */ -void btrfs_qgroup_check_reserved_leak(struct inode *inode) +void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) { struct extent_changeset changeset; struct ulist_node *unode; @@ -3792,19 +3792,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) int ret; extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { ULIST_ITER_INIT(&iter); while ((unode = ulist_next(&changeset.range_changed, &iter))) { - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", - inode->i_ino, unode->val, unode->aux); + btrfs_warn(inode->root->fs_info, + "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", + btrfs_ino(inode), unode->val, unode->aux); } - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->root_key.objectid, + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } -- cgit From 06f67c47076e5c3ee65276171596479dcc3a3941 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 28 Jun 2020 13:07:14 +0800 Subject: btrfs: use __u16 for the return value of btrfs_qgroup_level() The qgroup level is limited to u16, so no need to use u64 for it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 787128d7e196..229e461dbfc3 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -538,7 +538,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) if (qgroup->rsv.values[i]) { ret = true; btrfs_warn(fs_info, - "qgroup %llu/%llu has unreleased space, type %d rsv %llu", + "qgroup %hu/%llu has unreleased space, type %d rsv %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), i, qgroup->rsv.values[i]); -- cgit From 49e5fb46211de0744170928466e68166eabbd8b3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 28 Jun 2020 13:07:15 +0800 Subject: btrfs: qgroup: export qgroups in sysfs This patch will add the following sysfs interface: /sys/fs/btrfs//qgroups//referenced /sys/fs/btrfs//qgroups//exclusive /sys/fs/btrfs//qgroups//max_referenced /sys/fs/btrfs//qgroups//max_exclusive /sys/fs/btrfs//qgroups//limit_flags Which is also available in output of "btrfs qgroup show". /sys/fs/btrfs//qgroups//rsv_data /sys/fs/btrfs//qgroups//rsv_meta_pertrans /sys/fs/btrfs//qgroups//rsv_meta_prealloc The last 3 rsv related members are not visible to users, but can be very useful to debug qgroup limit related bugs. Also, to avoid '/' used in , the separator between qgroup level and qgroup id is changed to '_'. The interface is not hidden behind 'debug' as we want this interface to be included into production build and to provide another way to read the qgroup information besides the ioctls. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 229e461dbfc3..ee0ad33b659c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -22,6 +22,7 @@ #include "extent_io.h" #include "qgroup.h" #include "block-group.h" +#include "sysfs.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -220,10 +221,12 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, @@ -252,7 +255,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(qgroup); + __del_qgroup_rb(fs_info, qgroup); return 0; } @@ -351,6 +354,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; /* default this to quota off, in case no status key is found */ fs_info->qgroup_flags = 0; @@ -412,6 +418,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + switch (found_key.type) { case BTRFS_QGROUP_INFO_KEY: { struct btrfs_qgroup_info_item *ptr; @@ -500,16 +510,12 @@ out: ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + btrfs_sysfs_del_qgroups(fs_info); } return ret < 0 ? ret : 0; } -static u64 btrfs_qgroup_subvolid(u64 qgroupid) -{ - return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); -} - /* * Called in close_ctree() when quota is still enabled. This verifies we don't * leak some reserved space. @@ -562,7 +568,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(qgroup); + __del_qgroup_rb(fs_info, qgroup); } /* * We call btrfs_free_qgroup_config() when unmounting @@ -571,6 +577,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) */ ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; + btrfs_sysfs_del_qgroups(fs_info); } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, @@ -943,6 +950,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out; } + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -1030,6 +1040,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } } ret = btrfs_next_item(tree_root, path); if (ret < 0) { @@ -1054,6 +1069,11 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } ret = btrfs_commit_transaction(trans); trans = NULL; @@ -1089,6 +1109,7 @@ out: fs_info->qgroup_ulist = NULL; if (trans) btrfs_end_transaction(trans); + btrfs_sysfs_del_qgroups(fs_info); } mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -1441,8 +1462,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) qgroup = add_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); - if (IS_ERR(qgroup)) + if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); + goto out; + } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -2861,6 +2885,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, unlock: spin_unlock(&fs_info->qgroup_lock); + if (!ret) + ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); -- cgit From 263da812e87bac4098a4778efaa32c54275641db Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jul 2020 14:24:45 +0800 Subject: btrfs: qgroup: allow to unreserve range without releasing other ranges [PROBLEM] Before this patch, when btrfs_qgroup_reserve_data() fails, we free all reserved space of the changeset. For example: ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M); ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M); ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M); If the last btrfs_qgroup_reserve_data() failed, it will release the entire [0, 3M) range. This behavior is kind of OK for now, as when we hit -EDQUOT, we normally go error handling and need to release all reserved ranges anyway. But this also means the following call is not possible: ret = btrfs_qgroup_reserve_data(); if (ret == -EDQUOT) { /* Do something to free some qgroup space */ ret = btrfs_qgroup_reserve_data(); } As if the first btrfs_qgroup_reserve_data() fails, it will free all reserved qgroup space. [CAUSE] This is because we release all reserved ranges when btrfs_qgroup_reserve_data() fails. [FIX] This patch will implement a new function, qgroup_unreserve_range(), to iterate through the ulist nodes, to find any nodes in the failure range, and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and decrease the extent_changeset::bytes_changed, so that we can revert to previous state. This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT happens. Suggested-by: Josef Bacik Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ee0ad33b659c..054f36df60cf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3447,6 +3447,73 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) } } +#define rbtree_iterate_from_safe(node, next, start) \ + for (node = start; node && ({ next = rb_next(node); 1;}); node = next) + +static int qgroup_unreserve_range(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len) +{ + struct rb_node *node; + struct rb_node *next; + struct ulist_node *entry = NULL; + int ret = 0; + + node = reserved->range_changed.root.rb_node; + while (node) { + entry = rb_entry(node, struct ulist_node, rb_node); + if (entry->val < start) + node = node->rb_right; + else if (entry) + node = node->rb_left; + else + break; + } + + /* Empty changeset */ + if (!entry) + return 0; + + if (entry->val > start && rb_prev(&entry->rb_node)) + entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, + rb_node); + + rbtree_iterate_from_safe(node, next, &entry->rb_node) { + u64 entry_start; + u64 entry_end; + u64 entry_len; + int clear_ret; + + entry = rb_entry(node, struct ulist_node, rb_node); + entry_start = entry->val; + entry_end = entry->aux; + entry_len = entry_end - entry_start + 1; + + if (entry_start >= start + len) + break; + if (entry_start + entry_len <= start) + continue; + /* + * Now the entry is in [start, start + len), revert the + * EXTENT_QGROUP_RESERVED bit. + */ + clear_ret = clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); + if (!ret && clear_ret < 0) + ret = clear_ret; + + ulist_del(&reserved->range_changed, entry->val, entry->aux); + if (likely(reserved->bytes_changed >= entry_len)) { + reserved->bytes_changed -= entry_len; + } else { + WARN_ON(1); + reserved->bytes_changed = 0; + } + } + + return ret; +} + /* * Reserve qgroup space for range [start, start + len). * @@ -3457,18 +3524,14 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. - * if btrfs_qgroup_reserve_data() is called multiple times with - * same @reserved, caller must ensure when error happens it's OK - * to free *ALL* reserved space. */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { struct btrfs_root *root = inode->root; - struct ulist_node *unode; - struct ulist_iterator uiter; struct extent_changeset *reserved; + bool new_reserved = false; u64 orig_reserved; u64 to_reserve; int ret; @@ -3481,6 +3544,7 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, if (WARN_ON(!reserved_ret)) return -EINVAL; if (!*reserved_ret) { + new_reserved = true; *reserved_ret = extent_changeset_alloc(); if (!*reserved_ret) return -ENOMEM; @@ -3496,7 +3560,7 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) - goto cleanup; + goto out; ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; @@ -3504,15 +3568,13 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret; cleanup: - /* cleanup *ALL* already reserved ranges */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&reserved->range_changed, &uiter))) - clear_extent_bit(&inode->io_tree, unode->val, - unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); - /* Also free data bytes of already reserved one */ - btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, - orig_reserved, BTRFS_QGROUP_RSV_DATA); - extent_changeset_release(reserved); + qgroup_unreserve_range(inode, reserved, start, len); +out: + if (new_reserved) { + extent_changeset_release(reserved); + kfree(reserved); + *reserved_ret = NULL; + } return ret; } -- cgit From c53e9653605dbf708f5be02902de51831be4b009 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 18:50:48 +0800 Subject: btrfs: qgroup: try to flush qgroup space when we get -EDQUOT [PROBLEM] There are known problem related to how btrfs handles qgroup reserved space. One of the most obvious case is the the test case btrfs/153, which do fallocate, then write into the preallocated range. btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff) [CAUSE] Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not. Such behavior change is mostly for performance, and reverting it is not a good idea anyway. For preallcoated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space. This leads to the -EDQUOT in buffered write routine. And we can't follow the same solution, unlike data/meta space check, qgroup reserved space is shared between data/metadata. The EDQUOT can happen at the metadata reservation, so doing NODATACOW check after qgroup reservation failure is not a solution. [FIX] To solve the problem, we don't return -EDQUOT directly, but every time we got a -EDQUOT, we try to flush qgroup space: - Flush all inodes of the root NODATACOW writes will free the qgroup reserved at run_dealloc_range(). However we don't have the infrastructure to only flush NODATACOW inodes, here we flush all inodes anyway. - Wait for ordered extents This would convert the preallocated metadata space into per-trans metadata, which can be freed in later transaction commit. - Commit transaction This will free all per-trans metadata space. Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing. Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 054f36df60cf..23579dc39fa6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3515,17 +3515,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, } /* - * Reserve qgroup space for range [start, start + len). + * Try to free some space for qgroup. * - * This function will either reserve space from related qgroups or doing - * nothing if the range is already reserved. + * For qgroup, there are only 3 ways to free qgroup space: + * - Flush nodatacow write + * Any nodatacow write will free its reserved data space at run_delalloc_range(). + * In theory, we should only flush nodatacow inodes, but it's not yet + * possible, so we need to flush the whole root. * - * Return 0 for successful reserve - * Return <0 for error (including -EQUOT) + * - Wait for ordered extents + * When ordered extents are finished, their reserved metadata is finally + * converted to per_trans status, which can be freed by later commit + * transaction. * - * NOTE: this function may sleep for memory allocation. + * - Commit transaction + * This would free the meta_per_trans space. + * In theory this shouldn't provide much space, but any more qgroup space + * is needed. */ -int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. + */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + } + + ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +} + +static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { @@ -3578,6 +3619,34 @@ out: return ret; } +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or do nothing + * if the range is already reserved. + * + * Return 0 for successful reservation + * Return <0 for error (including -EQUOT) + * + * NOTE: This function may sleep for memory allocation, dirty page flushing and + * commit transaction. So caller should not hold any dirty page locked. + */ +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret; + + ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +} + /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) @@ -3746,7 +3815,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; } -int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3773,6 +3842,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; } +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + int ret; + + ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return qgroup_reserve_meta(root, num_bytes, type, enforce); +} + void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; -- cgit From adca4d945c8dca28a85df45c5b117e6dac2e77f1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 18:50:49 +0800 Subject: btrfs: qgroup: remove ASYNC_COMMIT mechanism in favor of reserve retry-after-EDQUOT commit a514d63882c3 ("btrfs: qgroup: Commit transaction in advance to reduce early EDQUOT") tries to reduce the early EDQUOT problems by checking the qgroup free against threshold and tries to wake up commit kthread to free some space. The problem of that mechanism is, it can only free qgroup per-trans metadata space, can't do anything to data, nor prealloc qgroup space. Now since we have the ability to flush qgroup space, and implemented retry-after-EDQUOT behavior, such mechanism can be completely replaced. So this patch will cleanup such mechanism in favor of retry-after-EDQUOT. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) (limited to 'fs/btrfs/qgroup.c') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 23579dc39fa6..c0f350c3a0cf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "ctree.h" #include "transaction.h" @@ -2895,20 +2894,8 @@ out: return ret; } -/* - * Two limits to commit transaction in advance. - * - * For RATIO, it will be 1/RATIO of the remaining limit as threshold. - * For SIZE, it will be in byte unit as threshold. - */ -#define QGROUP_FREE_RATIO 32 -#define QGROUP_FREE_SIZE SZ_32M -static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup *qg, u64 num_bytes) +static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 free; - u64 threshold; - if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; @@ -2917,32 +2904,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; - /* - * Even if we passed the check, it's better to check if reservation - * for meta_pertrans is pushing us near limit. - * If there is too much pertrans reservation or it's near the limit, - * let's try commit transaction to free some, using transaction_kthread - */ - if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | - BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { - free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; - threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } else { - free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; - threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } - - /* - * Use transaction_kthread to commit transaction, so we no - * longer need to bother nested transaction nor lock context. - */ - if (free < threshold) - btrfs_commit_transaction_locksafe(fs_info); - } - return true; } @@ -2990,7 +2951,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) { + if (enforce && !qgroup_check_limits(qg, num_bytes)) { ret = -EDQUOT; goto out; } -- cgit