From 49e5fb46211de0744170928466e68166eabbd8b3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 28 Jun 2020 13:07:15 +0800 Subject: btrfs: qgroup: export qgroups in sysfs This patch will add the following sysfs interface: /sys/fs/btrfs//qgroups//referenced /sys/fs/btrfs//qgroups//exclusive /sys/fs/btrfs//qgroups//max_referenced /sys/fs/btrfs//qgroups//max_exclusive /sys/fs/btrfs//qgroups//limit_flags Which is also available in output of "btrfs qgroup show". /sys/fs/btrfs//qgroups//rsv_data /sys/fs/btrfs//qgroups//rsv_meta_pertrans /sys/fs/btrfs//qgroups//rsv_meta_prealloc The last 3 rsv related members are not visible to users, but can be very useful to debug qgroup limit related bugs. Also, to avoid '/' used in , the separator between qgroup level and qgroup id is changed to '_'. The interface is not hidden behind 'debug' as we want this interface to be included into production build and to provide another way to read the qgroup information besides the ioctls. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) (limited to 'fs/btrfs/sysfs.c') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a39bff64ff24..5885abe57c3e 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -19,6 +19,7 @@ #include "volumes.h" #include "space-info.h" #include "block-group.h" +#include "qgroup.h" struct btrfs_feature_attr { struct kobj_attribute kobj_attr; @@ -1455,6 +1456,153 @@ failure: return error; } +static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) +{ + return to_fs_info(kobj->parent->parent); +} + +#define QGROUP_ATTR(_member, _show_name) \ +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member) + +#define QGROUP_RSV_ATTR(_name, _type) \ +static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->rsv.values[_type], \ + &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name) + +QGROUP_ATTR(rfer, referenced); +QGROUP_ATTR(excl, exclusive); +QGROUP_ATTR(max_rfer, max_referenced); +QGROUP_ATTR(max_excl, max_exclusive); +QGROUP_ATTR(lim_flags, limit_flags); +QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); +QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); +QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); + +static struct attribute *qgroup_attrs[] = { + BTRFS_ATTR_PTR(qgroup, referenced), + BTRFS_ATTR_PTR(qgroup, exclusive), + BTRFS_ATTR_PTR(qgroup, max_referenced), + BTRFS_ATTR_PTR(qgroup, max_exclusive), + BTRFS_ATTR_PTR(qgroup, limit_flags), + BTRFS_ATTR_PTR(qgroup, rsv_data), + BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans), + BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc), + NULL +}; +ATTRIBUTE_GROUPS(qgroup); + +static void qgroup_release(struct kobject *kobj) +{ + struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj); + + memset(&qgroup->kobj, 0, sizeof(*kobj)); +} + +static struct kobj_type qgroup_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = qgroup_release, + .default_groups = qgroup_groups, +}; + +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + struct kobject *qgroups_kobj = fs_info->qgroups_kobj; + int ret; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + if (qgroup->kobj.state_initialized) + return 0; + if (!qgroups_kobj) + return -EINVAL; + + ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj, + "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid)); + if (ret < 0) + kobject_put(&qgroup->kobj); + + return ret; +} + +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kobject_del(fs_info->qgroups_kobj); + kobject_put(fs_info->qgroups_kobj); + fs_info->qgroups_kobj = NULL; +} + +/* Called when qgroups get initialized, thus there is no need for locking */ +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) +{ + struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + int ret = 0; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + + ASSERT(fsid_kobj); + if (fs_info->qgroups_kobj) + return 0; + + fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj); + if (!fs_info->qgroups_kobj) { + ret = -ENOMEM; + goto out; + } + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) { + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + } + +out: + if (ret < 0) + btrfs_sysfs_del_qgroups(fs_info); + return ret; +} + +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + if (qgroup->kobj.state_initialized) { + kobject_del(&qgroup->kobj); + kobject_put(&qgroup->kobj); + } +} /* * Change per-fs features in /sys/fs/btrfs/UUID/features to match current -- cgit From 3092c68fc58c7bd54a6dde8d42ef05beb41890f5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 3 Jul 2020 11:13:15 +0300 Subject: btrfs: sysfs: add bdi link to the fsid directory Since BTRFS uses a private bdi it makes sense to create a link to this bdi under /sys/fs/btrfs//bdi. This allows size of read ahead to be controlled. Without this patch it's not possible to uniquely identify which bdi pertains to which btrfs filesystem in the case of multiple btrfs filesystems. It's fine to simply call sysfs_remove_link without checking if the link indeed has been created. The call path sysfs_remove_link kernfs_remove_by_name kernfs_remove_by_name_ns will simply return -ENOENT in case it doesn't exist. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/sysfs.c') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5885abe57c3e..38c0b95e0e7f 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -937,8 +937,12 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { + struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; + btrfs_reset_fs_info_ptr(fs_info); + sysfs_remove_link(fsid_kobj, "bdi"); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); @@ -958,8 +962,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) } #endif addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(fsid_kobj, btrfs_attrs); btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); } @@ -1439,6 +1443,10 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) if (error) goto failure; + error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); + if (error) + goto failure; + fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { -- cgit From a47bd78d0c44621efb98b525d04d60dc4d1a79b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:17:50 -0400 Subject: btrfs: sysfs: use NOFS for device creation Dave hit this splat during testing btrfs/078: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc6-default+ #1191 Not tainted ------------------------------------------------------ kswapd0/75 is trying to acquire lock: ffffa040e9d04ff8 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] but task is already holding lock: ffffffff8b0c8040 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (fs_reclaim){+.+.}-{0:0}: __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 fs_reclaim_acquire.part.0+0x25/0x30 __kmalloc_track_caller+0x49/0x330 kstrdup+0x2e/0x60 __kernfs_new_node.constprop.0+0x44/0x250 kernfs_new_node+0x25/0x50 kernfs_create_link+0x34/0xa0 sysfs_do_create_link_sd+0x5e/0xd0 btrfs_sysfs_add_devices_dir+0x65/0x100 [btrfs] btrfs_init_new_device+0x44c/0x12b0 [btrfs] btrfs_ioctl+0xc3c/0x25c0 [btrfs] ksys_ioctl+0x68/0xa0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x50/0xe0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 __mutex_lock+0xa0/0xaf0 btrfs_chunk_alloc+0x137/0x3e0 [btrfs] find_free_extent+0xb44/0xfb0 [btrfs] btrfs_reserve_extent+0x9b/0x180 [btrfs] btrfs_alloc_tree_block+0xc1/0x350 [btrfs] alloc_tree_block_no_bg_flush+0x4a/0x60 [btrfs] __btrfs_cow_block+0x143/0x7a0 [btrfs] btrfs_cow_block+0x15f/0x310 [btrfs] push_leaf_right+0x150/0x240 [btrfs] split_leaf+0x3cd/0x6d0 [btrfs] btrfs_search_slot+0xd14/0xf70 [btrfs] btrfs_insert_empty_items+0x64/0xc0 [btrfs] __btrfs_commit_inode_delayed_items+0xb2/0x840 [btrfs] btrfs_async_run_delayed_root+0x10e/0x1d0 [btrfs] btrfs_work_helper+0x2f9/0x650 [btrfs] process_one_work+0x22c/0x600 worker_thread+0x50/0x3b0 kthread+0x137/0x150 ret_from_fork+0x1f/0x30 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: check_prev_add+0x98/0xa20 validate_chain+0xa8c/0x2a00 __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 __mutex_lock+0xa0/0xaf0 __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] btrfs_evict_inode+0x3bf/0x560 [btrfs] evict+0xd6/0x1c0 dispose_list+0x48/0x70 prune_icache_sb+0x54/0x80 super_cache_scan+0x121/0x1a0 do_shrink_slab+0x175/0x420 shrink_slab+0xb1/0x2e0 shrink_node+0x192/0x600 balance_pgdat+0x31f/0x750 kswapd+0x206/0x510 kthread+0x137/0x150 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&fs_info->chunk_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/75: #0: ffffffff8b0c8040 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff8b0b50b8 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x54/0x2e0 #2: ffffa040e057c0e8 (&type->s_umount_key#26){++++}-{3:3}, at: trylock_super+0x16/0x50 stack backtrace: CPU: 2 PID: 75 Comm: kswapd0 Not tainted 5.8.0-rc6-default+ #1191 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x16f/0x190 check_prev_add+0x98/0xa20 validate_chain+0xa8c/0x2a00 __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] __mutex_lock+0xa0/0xaf0 ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] ? __lock_acquire+0x56f/0xaa0 ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] ? lock_acquire+0xa3/0x440 ? btrfs_evict_inode+0x138/0x560 [btrfs] ? btrfs_evict_inode+0x2fe/0x560 [btrfs] ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] btrfs_evict_inode+0x3bf/0x560 [btrfs] evict+0xd6/0x1c0 dispose_list+0x48/0x70 prune_icache_sb+0x54/0x80 super_cache_scan+0x121/0x1a0 do_shrink_slab+0x175/0x420 shrink_slab+0xb1/0x2e0 shrink_node+0x192/0x600 balance_pgdat+0x31f/0x750 kswapd+0x206/0x510 ? _raw_spin_unlock_irqrestore+0x3e/0x50 ? finish_wait+0x90/0x90 ? balance_pgdat+0x750/0x750 kthread+0x137/0x150 ? kthread_stop+0x2a0/0x2a0 ret_from_fork+0x1f/0x30 This is because we're holding the chunk_mutex while adding this device and adding its sysfs entries. We actually hold different locks in different places when calling this function, the dev_replace semaphore for instance in dev replace, so instead of moving this call around simply wrap it's operations in NOFS. CC: stable@vger.kernel.org # 4.14+ Reported-by: David Sterba Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/sysfs.c') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 38c0b95e0e7f..104c80caaa74 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1278,7 +1278,9 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, { int error = 0; struct btrfs_device *dev; + unsigned int nofs_flag; + nofs_flag = memalloc_nofs_save(); list_for_each_entry(dev, &fs_devices->devices, dev_list) { if (one_device && one_device != dev) @@ -1306,6 +1308,7 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, break; } } + memalloc_nofs_restore(nofs_flag); return error; } -- cgit From 62ab2cc04ddc5dfab4e382ae167734fc111da5ed Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 3 Aug 2020 14:20:11 +0800 Subject: btrfs: sysfs: fix NULL pointer dereference at btrfs_sysfs_del_qgroups() [BUG] Unmounting a btrfs filesystem with quota disabled will cause the following NULL pointer dereference: BTRFS info (device dm-5): has skinny extents BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page CPU: 7 PID: 637 Comm: umount Not tainted 5.8.0-rc7-next-20200731-custom #76 RIP: 0010:kobject_del+0x6/0x20 Call Trace: btrfs_sysfs_del_qgroups+0xac/0xf0 [btrfs] btrfs_free_qgroup_config+0x63/0x70 [btrfs] close_ctree+0x1f5/0x323 [btrfs] btrfs_put_super+0x15/0x17 [btrfs] generic_shutdown_super+0x72/0x110 kill_anon_super+0x18/0x30 btrfs_kill_super+0x17/0x30 [btrfs] deactivate_locked_super+0x3b/0xa0 deactivate_super+0x40/0x50 cleanup_mnt+0x135/0x190 __cleanup_mnt+0x12/0x20 task_work_run+0x64/0xb0 exit_to_user_mode_prepare+0x18a/0x190 syscall_exit_to_user_mode+0x4f/0x270 do_syscall_64+0x45/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace 37b7adca5c1d5c5d ]--- [CAUSE] Commit 079ad2fb4bf9 ("kobject: Avoid premature parent object freeing in kobject_cleanup()") changed kobject_del() that it no longer accepts NULL pointer. Before that commit, kobject_del() and kobject_put() all accept NULL pointers and just ignore such NULL pointers. But that mentioned commit needs to access the parent node, killing the old NULL pointer behavior. Unfortunately btrfs is relying on that hidden feature thus we will trigger such NULL pointer dereference. [FIX] Instead of just saving several lines, do proper fs_info->qgroups_kobj check before calling kobject_del() and kobject_put(). Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/sysfs.c') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 104c80caaa74..c8df2edafd85 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1565,9 +1565,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) btrfs_sysfs_del_one_qgroup(fs_info, qgroup); - kobject_del(fs_info->qgroups_kobj); - kobject_put(fs_info->qgroups_kobj); - fs_info->qgroups_kobj = NULL; + if (fs_info->qgroups_kobj) { + kobject_del(fs_info->qgroups_kobj); + kobject_put(fs_info->qgroups_kobj); + fs_info->qgroups_kobj = NULL; + } } /* Called when qgroups get initialized, thus there is no need for locking */ -- cgit