From f311ade3a7adf31658ed882aaab9f9879fdccef7 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sat, 1 Feb 2020 20:38:38 +0000 Subject: btrfs: ref-verify: fix memory leaks In btrfs_ref_tree_mod(), 'ref' and 'ra' are allocated through kzalloc() and kmalloc(), respectively. In the following code, if an error occurs, the execution will be redirected to 'out' or 'out_unlock' and the function will be exited. However, on some of the paths, 'ref' and 'ra' are not deallocated, leading to memory leaks. For example, if 'action' is BTRFS_ADD_DELAYED_EXTENT, add_block_entry() will be invoked. If the return value indicates an error, the execution will be redirected to 'out'. But, 'ref' is not deallocated on this path, causing a memory leak. To fix the above issues, deallocate both 'ref' and 'ra' before exiting from the function when an error is encountered. CC: stable@vger.kernel.org # 4.15+ Signed-off-by: Wenwen Wang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index b57f3618e58e..454a1015d026 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -744,6 +744,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, */ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { + kfree(ref); kfree(ra); ret = PTR_ERR(be); goto out; @@ -757,6 +758,8 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "re-allocated a block that still has references to it!"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); goto out_unlock; } @@ -819,6 +822,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a existing root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } @@ -834,6 +838,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "attempting to add another ref for an existing ref on a tree block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } -- cgit From ac05ca913e9f3871126d61da275bfe8516ff01ca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 31 Jan 2020 14:06:07 +0000 Subject: Btrfs: fix race between using extent maps and merging them We have a few cases where we allow an extent map that is in an extent map tree to be merged with other extents in the tree. Such cases include the unpinning of an extent after the respective ordered extent completed or after logging an extent during a fast fsync. This can lead to subtle and dangerous problems because when doing the merge some other task might be using the same extent map and as consequence see an inconsistent state of the extent map - for example sees the new length but has seen the old start offset. With luck this triggers a BUG_ON(), and not some silent bug, such as the following one in __do_readpage(): $ cat -n fs/btrfs/extent_io.c 3061 static int __do_readpage(struct extent_io_tree *tree, 3062 struct page *page, (...) 3127 em = __get_extent_map(inode, page, pg_offset, cur, 3128 end - cur + 1, get_extent, em_cached); 3129 if (IS_ERR_OR_NULL(em)) { 3130 SetPageError(page); 3131 unlock_extent(tree, cur, end); 3132 break; 3133 } 3134 extent_offset = cur - em->start; 3135 BUG_ON(extent_map_end(em) <= cur); (...) Consider the following example scenario, where we end up hitting the BUG_ON() in __do_readpage(). We have an inode with a size of 8KiB and 2 extent maps: extent A: file offset 0, length 4KiB, disk_bytenr = X, persisted on disk by a previous transaction extent B: file offset 4KiB, length 4KiB, disk_bytenr = X + 4KiB, not yet persisted but writeback started for it already. The extent map is pinned since there's writeback and an ordered extent in progress, so it can not be merged with extent map A yet The following sequence of steps leads to the BUG_ON(): 1) The ordered extent for extent B completes, the respective page gets its writeback bit cleared and the extent map is unpinned, at that point it is not yet merged with extent map A because it's in the list of modified extents; 2) Due to memory pressure, or some other reason, the MM subsystem releases the page corresponding to extent B - btrfs_releasepage() is called and returns 1, meaning the page can be released as it's not dirty, not under writeback anymore and the extent range is not locked in the inode's iotree. However the extent map is not released, either because we are not in a context that allows memory allocations to block or because the inode's size is smaller than 16MiB - in this case our inode has a size of 8KiB; 3) Task B needs to read extent B and ends up __do_readpage() through the btrfs_readpage() callback. At __do_readpage() it gets a reference to extent map B; 4) Task A, doing a fast fsync, calls clear_em_loggin() against extent map B while holding the write lock on the inode's extent map tree - this results in try_merge_map() being called and since it's possible to merge extent map B with extent map A now (the extent map B was removed from the list of modified extents), the merging begins - it sets extent map B's start offset to 0 (was 4KiB), but before it increments the map's length to 8KiB (4kb + 4KiB), task A is at: BUG_ON(extent_map_end(em) <= cur); The call to extent_map_end() sees the extent map has a start of 0 and a length still at 4KiB, so it returns 4KiB and 'cur' is 4KiB, so the BUG_ON() is triggered. So it's dangerous to modify an extent map that is in the tree, because some other task might have got a reference to it before and still using it, and needs to see a consistent map while using it. Generally this is very rare since most paths that lookup and use extent maps also have the file range locked in the inode's iotree. The fsync path is pretty much the only exception where we don't do it to avoid serialization with concurrent reads. Fix this by not allowing an extent map do be merged if if it's being used by tasks other then the one attempting to merge the extent map (when the reference count of the extent map is greater than 2). Reported-by: ryusuke1925 Reported-by: Koki Mitani Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206211 CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6f417ff68980..bd6229fb2b6f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -237,6 +237,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) struct extent_map *merge = NULL; struct rb_node *rb; + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. + */ + if (refcount_read(&em->refs) > 2) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) -- cgit From e8294f2f6aa6208ed0923aa6d70cea3be178309a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 17:12:16 +0100 Subject: btrfs: print message when tree-log replay starts There's no logged information about tree-log replay although this is something that points to previous unclean unmount. Other filesystems report that as well. Suggested-by: Chris Murphy CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7fa9bb79ad08..89422aa8e9d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3164,6 +3164,7 @@ int __cold open_ctree(struct super_block *sb, /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; -- cgit From 10a3a3edc5b89a8cd095bc63495fb1e0f42047d9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 17:12:28 +0100 Subject: btrfs: log message when rw remount is attempted with unclean tree-log A remount to a read-write filesystem is not safe when there's tree-log to be replayed. Files that could be opened until now might be affected by the changes in the tree-log. A regular mount is needed to replay the log so the filesystem presents the consistent view with the pending changes included. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0616a5434793..67c63858812a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1834,6 +1834,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); ret = -EINVAL; goto restore; } -- cgit From 28553fa992cb28be6a65566681aac6cafabb4f2d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 7 Feb 2020 12:23:09 +0000 Subject: Btrfs: fix race between shrinking truncate and fiemap When there is a fiemap executing in parallel with a shrinking truncate we can end up in a situation where we have extent maps for which we no longer have corresponding file extent items. This is generally harmless and at the moment the only consequences are missing file extent items representing holes after we expand the file size again after the truncate operation removed the prealloc extent items, and stale information for future fiemap calls (reporting extents that no longer exist or may have been reallocated to other files for example). Consider the following example: 1) Our inode has a size of 128KiB, one 128KiB extent at file offset 0 and a 1MiB prealloc extent at file offset 128KiB; 2) Task A starts doing a shrinking truncate of our inode to reduce it to a size of 64KiB. Before it searches the subvolume tree for file extent items to delete, it drops all the extent maps in the range from 64KiB to (u64)-1 by calling btrfs_drop_extent_cache(); 3) Task B starts doing a fiemap against our inode. When looking up for the inode's extent maps in the range from 128KiB to (u64)-1, it doesn't find any in the inode's extent map tree, since they were removed by task A. Because it didn't find any in the extent map tree, it scans the inode's subvolume tree for file extent items, and it finds the 1MiB prealloc extent at file offset 128KiB, then it creates an extent map based on that file extent item and adds it to inode's extent map tree (this ends up being done by btrfs_get_extent() <- btrfs_get_extent_fiemap() <- get_extent_skip_holes()); 4) Task A then drops the prealloc extent at file offset 128KiB and shrinks the 128KiB extent file offset 0 to a length of 64KiB. The truncation operation finishes and we end up with an extent map representing a 1MiB prealloc extent at file offset 128KiB, despite we don't have any more that extent; After this the two types of problems we have are: 1) Future calls to fiemap always report that a 1MiB prealloc extent exists at file offset 128KiB. This is stale information, no longer correct; 2) If the size of the file is increased, by a truncate operation that increases the file size or by a write into a file offset > 64KiB for example, we end up not inserting file extent items to represent holes for any range between 128KiB and 128KiB + 1MiB, since the hole expansion function, btrfs_cont_expand() will skip hole insertion for any range for which an extent map exists that represents a prealloc extent. This causes fsck to complain about missing file extent items when not using the NO_HOLES feature. The second issue could be often triggered by test case generic/561 from fstests, which runs fsstress and duperemove in parallel, and duperemove does frequent fiemap calls. Essentially the problems happens because fiemap does not acquire the inode's lock while truncate does, and fiemap locks the file range in the inode's iotree while truncate does not. So fix the issue by making btrfs_truncate_inode_items() lock the file range from the new file size to (u64)-1, so that it serializes with fiemap. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b3ec93ff911..7d26b4bfb2c6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4085,6 +4085,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + struct extent_state *cached_state = NULL; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4101,6 +4103,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = READA_BACK; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + /* * We want to drop from the next block forward in case this new size is * not block aligned since we will be keeping the last block of the @@ -4367,6 +4372,9 @@ out: btrfs_ordered_update_i_size(inode, last_size, NULL); } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + btrfs_free_path(path); return ret; } -- cgit From a013d141eceee0f7747385e900da2858141aa0f3 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 12 Feb 2020 17:28:10 +0800 Subject: btrfs: sysfs, add UUID/devinfo kobject Create directory /sys/fs/btrfs/UUID/devinfo to hold devices directories by the id (unlike /devices). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 15 +++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7436422194da..6bac61c42c05 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -901,6 +901,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { + if (fs_devs->devinfo_kobj) { + kobject_del(fs_devs->devinfo_kobj); + kobject_put(fs_devs->devinfo_kobj); + fs_devs->devinfo_kobj = NULL; + } + if (fs_devs->devices_kobj) { kobject_del(fs_devs->devices_kobj); kobject_put(fs_devs->devices_kobj); @@ -1369,6 +1375,15 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) return -ENOMEM; } + fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", + &fs_devs->fsid_kobj); + if (!fs_devs->devinfo_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs devinfo kobject"); + btrfs_sysfs_remove_fsid(fs_devs); + return -ENOMEM; + } + return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 690d4f5a0653..309cda477589 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -258,6 +258,7 @@ struct btrfs_fs_devices { /* sysfs kobjects */ struct kobject fsid_kobj; struct kobject *devices_kobj; + struct kobject *devinfo_kobj; struct completion kobj_unregister; }; -- cgit From 1b9867eb6120db85f8dca8ff42789d9ec9ee16a5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 12 Feb 2020 17:28:11 +0800 Subject: btrfs: sysfs, move device id directories to UUID/devinfo Originally it was planned to create device id directories under UUID/devinfo, but it got under UUID/devices by mistake. We really want it under definfo so the bare device node names are not mixed with device ids and are easy to enumerate. Fixes: 668e48af7a94 ("btrfs: sysfs, add devid/dev_state kobject and device attributes") Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6bac61c42c05..3c10e78924d0 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1295,7 +1295,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, init_completion(&dev->kobj_unregister); error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devices_kobj, "%llu", + fs_devices->devinfo_kobj, "%llu", dev->devid); if (error) { kobject_put(&dev->devid_kobj); -- cgit