From fc626fe322f13ecdfac58cbfa6cfea797ed22623 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:57 -0700 Subject: fs/ext4: Change EXT4_MOUNT_DAX to EXT4_MOUNT_DAX_ALWAYS In prep for the new tri-state mount option which then introduces EXT4_MOUNT_DAX_NEVER. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-4-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..7b99c44d0a91 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1775,7 +1775,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -3982,7 +3982,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); goto failed_mount; @@ -4092,7 +4092,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); @@ -5412,7 +5412,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); err = -EINVAL; @@ -5433,10 +5433,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX; + sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- cgit From a8ab6d3885ef5e2300d683b79a9e1999403eefd9 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:58 -0700 Subject: fs/ext4: Update ext4_should_use_dax() S_DAX should only be enabled when the underlying block device supports dax. Cache the underlying support for DAX in the super block and modify ext4_should_use_dax() to check for device support prior to the over riding mount option. While we are at it change the function to ext4_should_enable_dax() as this better reflects the ask as well as matches xfs. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-5-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7b99c44d0a91..f7d76dcaedfe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4092,13 +4092,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (bdev_dax_supported(sb->s_bdev, blocksize)) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); goto failed_mount; } - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); goto failed_mount; -- cgit From 043546e46dc70c25ff7e2cf6d09cbb0424fc9978 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:59 -0700 Subject: fs/ext4: Only change S_DAX on inode load To prevent complications with in memory inodes we only set S_DAX on inode load. FS_XFLAG_DAX can be changed at any time and S_DAX will change after inode eviction and reload. Add init bool to ext4_set_inode_flags() to indicate if the inode is being newly initialized. Assert that S_DAX is not set on an inode which is just being loaded. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-6-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f7d76dcaedfe..80eb814c47eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1348,7 +1348,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); } return res; } @@ -1375,7 +1375,7 @@ retry: * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); -- cgit From 9cb20f94afcd2964944f9468e38da736ee855b19 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:00 -0700 Subject: fs/ext4: Make DAX mount option a tri-state We add 'always', 'never', and 'inode' (default). '-o dax' continues to operate the same which is equivalent to 'always'. This new functionality is limited to ext4 only. Specifically we introduce a 2nd DAX mount flag EXT4_MOUNT2_DAX_NEVER and set it and EXT4_MOUNT_DAX_ALWAYS appropriately for the mode. We also force EXT4_MOUNT2_DAX_NEVER if !CONFIG_FS_DAX. Finally, EXT4_MOUNT2_DAX_INODE is used solely to detect if the user specified that option for printing. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-7-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 10 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80eb814c47eb..5e056aa20ce9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1512,7 +1512,8 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, @@ -1579,6 +1580,9 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_dax, "dax"}, + {Opt_dax_always, "dax=always"}, + {Opt_dax_inode, "dax=inode"}, + {Opt_dax_never, "dax=never"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_warn_on_error, "warn_on_error"}, @@ -1726,6 +1730,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_STRING 0x0400 +#define MOPT_SKIP 0x0800 static const struct mount_opts { int token; @@ -1775,7 +1780,13 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, + {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -2084,13 +2095,32 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif - } else if (token == Opt_dax) { + } else if (token == Opt_dax || token == Opt_dax_always || + token == Opt_dax_inode || token == Opt_dax_never) { #ifdef CONFIG_FS_DAX - ext4_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; + switch (token) { + case Opt_dax: + case Opt_dax_always: + ext4_msg(sb, KERN_WARNING, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + break; + case Opt_dax_never: + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + break; + case Opt_dax_inode: + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + /* Strictly for printing options */ + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE; + break; + } #else ext4_msg(sb, KERN_INFO, "dax option not supported"); + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; return -1; #endif } else if (token == Opt_data_err_abort) { @@ -2254,7 +2284,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || - (m->flags & MOPT_CLEAR_ERR)) + (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) continue; if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ @@ -2314,6 +2344,17 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (DUMMY_ENCRYPTION_ENABLED(sbi)) SEQ_OPTS_PUTS("test_dummy_encryption"); + if (test_opt(sb, DAX_ALWAYS)) { + if (IS_EXT2_SB(sb)) + SEQ_OPTS_PUTS("dax"); + else + SEQ_OPTS_PUTS("dax=always"); + } else if (test_opt2(sb, DAX_NEVER)) { + SEQ_OPTS_PUTS("dax=never"); + } else if (test_opt2(sb, DAX_INODE)) { + SEQ_OPTS_PUTS("dax=inode"); + } + ext4_show_quota_options(seq, sb); return 0; } @@ -5436,10 +5477,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS || + (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_NEVER || + (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_INODE) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " - "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS; + "dax mount option with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt |= old_opts.s_mount_opt & EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~(EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); + sbi->s_mount_opt2 |= old_opts.s_mount_opt2 & + (EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- cgit From b383a73f2b832491a2f9e6e8ada26aad53b5763d Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:02 -0700 Subject: fs/ext4: Introduce DAX inode flag Add a flag ([EXT4|FS]_DAX_FL) to preserve FS_XFLAG_DAX in the ext4 inode. Set the flag to be user visible and changeable. Set the flag to be inherited. Allow applications to change the flag at any time except if it conflicts with the set of mutually exclusive flags (Currently VERITY, ENCRYPT, JOURNAL_DATA). Furthermore, restrict setting any of the exclusive flags if DAX is set. While conceptually possible, we do not allow setting EXT4_DAX_FL while at the same time clearing exclusion flags (or vice versa) for 2 reasons: 1) The DAX flag does not take effect immediately which introduces quite a bit of complexity 2) There is no clear use case for being this flexible Finally, on regular files, flag the inode to not be cached to facilitate changing S_DAX on the next creation of the inode. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-9-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e056aa20ce9..3658e3016999 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1323,6 +1323,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + res = ext4_convert_inline_data(inode); if (res) return res; -- cgit From 829b37b8cddb1db75c1b7905505b90e593b15db1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 10 Jun 2020 11:16:37 -0400 Subject: ext4: avoid race conditions when remounting with options that change dax Trying to change dax mount options when remounting could allow mount options to be enabled for a small amount of time, and then the mount option change would be reverted. In the case of "mount -o remount,dax", this can cause a race where files would temporarily treated as DAX --- and then not. Cc: stable@kernel.org Reported-by: syzbot+bca9799bf129256190da@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a22d67c5bc00..edf06c1bee9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2104,16 +2104,40 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, switch (token) { case Opt_dax: case Opt_dax_always: + if (is_remount && + (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { + fail_dax_change_remount: + ext4_msg(sb, KERN_ERR, "can't change " + "dax mount option while remounting"); + return -1; + } + if (is_remount && + (test_opt(sb, DATA_FLAGS) == + EXT4_MOUNT_JOURNAL_DATA)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -1; + } ext4_msg(sb, KERN_WARNING, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; break; case Opt_dax_never: + if (is_remount && + (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) + goto fail_dax_change_remount; sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; break; case Opt_dax_inode: + if (is_remount && + ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) + goto fail_dax_change_remount; sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; /* Strictly for printing options */ @@ -5454,12 +5478,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - if (test_opt(sb, DAX_ALWAYS)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dax"); - err = -EINVAL; - goto restore_opts; - } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " @@ -5475,18 +5493,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS || - (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_NEVER || - (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_INODE) { - ext4_msg(sb, KERN_WARNING, "warning: refusing change of " - "dax mount option with busy inodes while remounting"); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; - sbi->s_mount_opt |= old_opts.s_mount_opt & EXT4_MOUNT_DAX_ALWAYS; - sbi->s_mount_opt2 &= ~(EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); - sbi->s_mount_opt2 |= old_opts.s_mount_opt2 & - (EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); - } - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); -- cgit From 5adaccac46ea79008d7b75f47913f1a00f91d0ce Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 1 Jun 2020 15:34:04 +0800 Subject: ext4: stop overwrite the errcode in ext4_setup_super Now the errcode from ext4_commit_super will overwrite EROFS exists in ext4_setup_super. Actually, no need to call ext4_commit_super since we will return EROFS. Fix it by goto done directly. Fixes: c89128a00838 ("ext4: handle errors on ext4_commit_super") Signed-off-by: yangerkun Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200601073404.3712492-1-yangerkun@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index edf06c1bee9d..127b8efa8d54 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2412,6 +2412,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; + goto done; } if (read_only) goto done; -- cgit From 7b97d868b7ab2448859668de9222b8af43f76e78 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Tue, 9 Jun 2020 15:35:40 +0800 Subject: ext4, jbd2: ensure panic by fix a race between jbd2 abort and ext4 error handlers In the ext4 filesystem with errors=panic, if one process is recording errno in the superblock when invoking jbd2_journal_abort() due to some error cases, it could be raced by another __ext4_abort() which is setting the SB_RDONLY flag but missing panic because errno has not been recorded. jbd2_journal_commit_transaction() jbd2_journal_abort() journal->j_flags |= JBD2_ABORT; jbd2_journal_update_sb_errno() | ext4_journal_check_start() | __ext4_abort() | sb->s_flags |= SB_RDONLY; | if (!JBD2_REC_ERR) | return; journal->j_flags |= JBD2_REC_ERR; Finally, it will no longer trigger panic because the filesystem has already been set read-only. Fix this by introduce j_abort_mutex to make sure journal abort is completed before panic, and remove JBD2_REC_ERR flag. Fixes: 4327ba52afd03 ("ext4, jbd2: ensure entering into panic after recording an error in superblock") Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200609073540.3810702-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 127b8efa8d54..2660a7e47eef 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -522,9 +522,6 @@ static void ext4_handle_error(struct super_block *sb) smp_wmb(); sb->s_flags |= SB_RDONLY; } else if (test_opt(sb, ERRORS_PANIC)) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } @@ -725,23 +722,20 @@ void __ext4_abort(struct super_block *sb, const char *function, va_end(args); if (sb_rdonly(sb) == 0) { - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible * before ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } - if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) panic("EXT4-fs panic from previous error\n"); - } } void __ext4_msg(struct super_block *sb, -- cgit From 4f74d15fe40833d4fb4f5ead7437aa73840596a5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 2 Jul 2020 01:56:07 +0000 Subject: ext4: add inline encryption support Wire up ext4 to support inline encryption via the helper functions which fs/crypto/ now provides. This includes: - Adding a mount option 'inlinecrypt' which enables inline encryption on encrypted files where it can be used. - Setting the bio_crypt_ctx on bios that will be submitted to an inline-encrypted file. Note: submit_bh_wbc() in fs/buffer.c also needed to be patched for this part, since ext4 sometimes uses ll_rw_block() on file data. - Not adding logically discontiguous data to bios that will be submitted to an inline-encrypted file. - Not doing filesystem-layer crypto on inline-encrypted files. Co-developed-by: Satya Tangirala Signed-off-by: Satya Tangirala Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20200702015607.1215430-5-satyat@google.com Signed-off-by: Eric Biggers --- fs/ext4/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 330957ed1f05..0907f907c47d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1508,6 +1508,7 @@ enum { Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, + Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, @@ -1610,6 +1611,7 @@ static const match_table_t tokens = { {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_removed, "check=none"}, /* mount option from ext2/3 */ @@ -1946,6 +1948,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, case Opt_nolazytime: sb->s_flags &= ~SB_LAZYTIME; return 1; + case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + sb->s_flags |= SB_INLINECRYPT; +#else + ext4_msg(sb, KERN_ERR, "inline encryption not supported"); +#endif + return 1; } for (m = ext4_mount_opts; m->token != Opt_err; m++) @@ -2404,6 +2413,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, fscrypt_show_test_dummy_encryption(seq, sep, sb); + if (sb->s_flags & SB_INLINECRYPT) + SEQ_OPTS_PUTS("inlinecrypt"); + if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); -- cgit From 11215630aada28307ba555a43138db6ac54fa825 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Jul 2020 16:07:59 +0200 Subject: ext4: don't BUG on inconsistent journal feature A customer has reported a BUG_ON in ext4_clear_journal_err() hitting during an LTP testing. Either this has been caused by a test setup issue where the filesystem was being overwritten while LTP was mounting it or the journal replay has overwritten the superblock with invalid data. In either case it is preferable we don't take the machine down with a BUG_ON. So handle the situation of unexpectedly missing has_journal feature more gracefully. We issue warning and fail the mount in the cases where the race window is narrow and the failed check is most likely a programming error. In cases where fs corruption is more likely, we do full ext4_error() handling before failing mount / remount. Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200710140759.18031-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 68 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 21 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 330957ed1f05..9fdad843b30e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -66,10 +66,10 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static int ext4_commit_super(struct super_block *sb, int sync); -static void ext4_mark_recovery_complete(struct super_block *sb, +static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); -static void ext4_clear_journal_err(struct super_block *sb, - struct ext4_super_block *es); +static int ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -4770,7 +4770,9 @@ no_journal: EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); - ext4_mark_recovery_complete(sb, es); + err = ext4_mark_recovery_complete(sb, es); + if (err) + goto failed_mount8; } if (EXT4_SB(sb)->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -4813,10 +4815,8 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; -#ifdef CONFIG_QUOTA failed_mount8: ext4_unregister_sysfs(sb); -#endif failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -4956,7 +4956,8 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return NULL; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (!journal_inode) @@ -4986,7 +4987,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return NULL; bdev = ext4_blkdev_get(j_dev, sb); if (bdev == NULL) @@ -5078,7 +5080,8 @@ static int ext4_load_journal(struct super_block *sb, int err = 0; int really_read_only; - BUG_ON(!ext4_has_feature_journal(sb)); + if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) + return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -5148,7 +5151,12 @@ static int ext4_load_journal(struct super_block *sb, } EXT4_SB(sb)->s_journal = journal; - ext4_clear_journal_err(sb, es); + err = ext4_clear_journal_err(sb, es); + if (err) { + EXT4_SB(sb)->s_journal = NULL; + jbd2_journal_destroy(journal); + return err; + } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -5244,26 +5252,32 @@ static int ext4_commit_super(struct super_block *sb, int sync) * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ -static void ext4_mark_recovery_complete(struct super_block *sb, - struct ext4_super_block *es) +static int ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) { + int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { - BUG_ON(journal != NULL); - return; + if (journal != NULL) { + ext4_error(sb, "Journal got removed while the fs was " + "mounted!"); + return -EFSCORRUPTED; + } + return 0; } jbd2_journal_lock_updates(journal); - if (jbd2_journal_flush(journal) < 0) + err = jbd2_journal_flush(journal); + if (err < 0) goto out; if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { ext4_clear_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); } - out: jbd2_journal_unlock_updates(journal); + return err; } /* @@ -5271,14 +5285,17 @@ out: * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext4_clear_journal_err(struct super_block *sb, +static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; - BUG_ON(!ext4_has_feature_journal(sb)); + if (!ext4_has_feature_journal(sb)) { + ext4_error(sb, "Journal got removed while the fs was mounted!"); + return -EFSCORRUPTED; + } journal = EXT4_SB(sb)->s_journal; @@ -5303,6 +5320,7 @@ static void ext4_clear_journal_err(struct super_block *sb, jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } + return 0; } /* @@ -5573,8 +5591,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - if (sbi->s_journal) + if (sbi->s_journal) { + /* + * We let remount-ro finish even if marking fs + * as clean failed... + */ ext4_mark_recovery_complete(sb, es); + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); } else { @@ -5622,8 +5645,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * been changed by e2fsck since we originally mounted * the partition.) */ - if (sbi->s_journal) - ext4_clear_journal_err(sb, es); + if (sbi->s_journal) { + err = ext4_clear_journal_err(sb, es); + if (err) + goto restore_opts; + } sbi->s_mount_state = le16_to_cpu(es->s_state); err = ext4_setup_super(sb, es, 0); -- cgit From 273108fa5015eeffc4bacfa5ce272af3434b96e4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 17 Jul 2020 11:06:05 +0200 Subject: ext4: handle read only external journal device Ext4 uses blkdev_get_by_dev() to get the block_device for journal device which does check to see if the read-only block device was opened read-only. As a result ext4 will hapily proceed mounting the file system with external journal on read-only device. This is bad as we would not be able to use the journal leading to errors later on. Instead of simply failing to mount file system in this case, treat it in a similar way we treat internal journal on read-only device. Allow to mount with -o noload in read-only mode. This can be reproduced easily like this: mke2fs -F -O journal_dev $JOURNAL_DEV 100M mkfs.$FSTYPE -F -J device=$JOURNAL_DEV $FS_DEV blockdev --setro $JOURNAL_DEV mount $FS_DEV $MNT touch $MNT/file umount $MNT leading to error like this [ 1307.318713] ------------[ cut here ]------------ [ 1307.323362] generic_make_request: Trying to write to read-only block-device dm-2 (partno 0) [ 1307.331741] WARNING: CPU: 36 PID: 3224 at block/blk-core.c:855 generic_make_request_checks+0x2c3/0x580 [ 1307.341041] Modules linked in: ext4 mbcache jbd2 rfkill intel_rapl_msr intel_rapl_common isst_if_commd [ 1307.419445] CPU: 36 PID: 3224 Comm: jbd2/dm-2 Tainted: G W I 5.8.0-rc5 #2 [ 1307.427359] Hardware name: Dell Inc. PowerEdge R740/01KPX8, BIOS 2.3.10 08/15/2019 [ 1307.434932] RIP: 0010:generic_make_request_checks+0x2c3/0x580 [ 1307.440676] Code: 94 03 00 00 48 89 df 48 8d 74 24 08 c6 05 cf 2b 18 01 01 e8 7f a4 ff ff 48 c7 c7 50e [ 1307.459420] RSP: 0018:ffffc0d70eb5fb48 EFLAGS: 00010286 [ 1307.464646] RAX: 0000000000000000 RBX: ffff9b33b2978300 RCX: 0000000000000000 [ 1307.471780] RDX: ffff9b33e12a81e0 RSI: ffff9b33e1298000 RDI: ffff9b33e1298000 [ 1307.478913] RBP: ffff9b7b9679e0c0 R08: 0000000000000837 R09: 0000000000000024 [ 1307.486044] R10: 0000000000000000 R11: ffffc0d70eb5f9f0 R12: 0000000000000400 [ 1307.493177] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000 [ 1307.500308] FS: 0000000000000000(0000) GS:ffff9b33e1280000(0000) knlGS:0000000000000000 [ 1307.508396] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1307.514142] CR2: 000055eaf4109000 CR3: 0000003dee40a006 CR4: 00000000007606e0 [ 1307.521273] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1307.528407] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1307.535538] PKRU: 55555554 [ 1307.538250] Call Trace: [ 1307.540708] generic_make_request+0x30/0x340 [ 1307.544985] submit_bio+0x43/0x190 [ 1307.548393] ? bio_add_page+0x62/0x90 [ 1307.552068] submit_bh_wbc+0x16a/0x190 [ 1307.555833] jbd2_write_superblock+0xec/0x200 [jbd2] [ 1307.560803] jbd2_journal_update_sb_log_tail+0x65/0xc0 [jbd2] [ 1307.566557] jbd2_journal_commit_transaction+0x2ae/0x1860 [jbd2] [ 1307.572566] ? check_preempt_curr+0x7a/0x90 [ 1307.576756] ? update_curr+0xe1/0x1d0 [ 1307.580421] ? account_entity_dequeue+0x7b/0xb0 [ 1307.584955] ? newidle_balance+0x231/0x3d0 [ 1307.589056] ? __switch_to_asm+0x42/0x70 [ 1307.592986] ? __switch_to_asm+0x36/0x70 [ 1307.596918] ? lock_timer_base+0x67/0x80 [ 1307.600851] kjournald2+0xbd/0x270 [jbd2] [ 1307.604873] ? finish_wait+0x80/0x80 [ 1307.608460] ? commit_timeout+0x10/0x10 [jbd2] [ 1307.612915] kthread+0x114/0x130 [ 1307.616152] ? kthread_park+0x80/0x80 [ 1307.619816] ret_from_fork+0x22/0x30 [ 1307.623400] ---[ end trace 27490236265b1630 ]--- Signed-off-by: Lukas Czerner Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20200717090605.2612-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9fdad843b30e..dda967efcbc2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5079,6 +5079,7 @@ static int ext4_load_journal(struct super_block *sb, dev_t journal_dev; int err = 0; int really_read_only; + int journal_dev_ro; if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return -EFSCORRUPTED; @@ -5091,7 +5092,31 @@ static int ext4_load_journal(struct super_block *sb, } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); - really_read_only = bdev_read_only(sb->s_bdev); + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, + "filesystem has both journal inode and journal device!"); + return -EINVAL; + } + + if (journal_inum) { + journal = ext4_get_journal(sb, journal_inum); + if (!journal) + return -EINVAL; + } else { + journal = ext4_get_dev_journal(sb, journal_dev); + if (!journal) + return -EINVAL; + } + + journal_dev_ro = bdev_read_only(journal->j_dev); + really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; + + if (journal_dev_ro && !sb_rdonly(sb)) { + ext4_msg(sb, KERN_ERR, + "journal device read-only, try mounting with '-o ro'"); + err = -EROFS; + goto err_out; + } /* * Are we loading a blank journal or performing recovery after a @@ -5106,27 +5131,14 @@ static int ext4_load_journal(struct super_block *sb, ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); - return -EROFS; + err = -EROFS; + goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } - if (journal_inum && journal_dev) { - ext4_msg(sb, KERN_ERR, "filesystem has both journal " - "and inode journals!"); - return -EINVAL; - } - - if (journal_inum) { - if (!(journal = ext4_get_journal(sb, journal_inum))) - return -EINVAL; - } else { - if (!(journal = ext4_get_dev_journal(sb, journal_dev))) - return -EINVAL; - } - if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); @@ -5146,8 +5158,7 @@ static int ext4_load_journal(struct super_block *sb, if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); - jbd2_journal_destroy(journal); - return err; + goto err_out; } EXT4_SB(sb)->s_journal = journal; @@ -5167,6 +5178,10 @@ static int ext4_load_journal(struct super_block *sb, } return 0; + +err_out: + jbd2_journal_destroy(journal); + return err; } static int ext4_commit_super(struct super_block *sb, int sync) -- cgit From bc71726c725767205757821df364acff87f92ac5 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 20 Jun 2020 10:54:23 +0800 Subject: ext4: abort the filesystem if failed to async write metadata buffer There is a risk of filesystem inconsistency if we failed to async write back metadata buffer in the background. Because of current buffer's end io procedure is handled by end_buffer_async_write() in the block layer, and it only clear the buffer's uptodate flag and mark the write_io_error flag, so ext4 cannot detect such failure immediately. In most cases of getting metadata buffer (e.g. ext4_read_inode_bitmap()), although the buffer's data is actually uptodate, it may still read data from disk because the buffer's uptodate flag has been cleared. Finally, it may lead to on-disk filesystem inconsistency if reading old data from the disk successfully and write them out again. This patch detect bdev mapping->wb_err when getting journal's write access and mark the filesystem error if bdev's mapping->wb_err was increased, this could prevent further writing and potential inconsistency. Signed-off-by: zhangyi (F) Suggested-by: Jan Kara Link: https://lore.kernel.org/r/20200620025427.1756360-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dda967efcbc2..c77b10257b36 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4765,6 +4765,15 @@ no_journal: } #endif /* CONFIG_QUOTA */ + /* + * Save the original bdev mapping's wb_err value which could be + * used to detect the metadata async write error. + */ + spin_lock_init(&sbi->s_bdev_wb_lock); + if (!sb_rdonly(sb)) + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); + sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -5654,6 +5663,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + /* + * Update the original bdev mapping's wb_err value + * which could be used to detect the metadata async + * write error. + */ + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); + /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have -- cgit From 529a781ee07aaa58be8164d75ba5998eb7dd216c Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 20 Jun 2020 10:54:27 +0800 Subject: jbd2: remove unused parameter in jbd2_journal_try_to_free_buffers() Parameter gfp_mask in jbd2_journal_try_to_free_buffers() is no longer used after commit <536fc240e7147> ("jbd2: clean up jbd2_journal_try_to_free_buffers()"), so just remove it. Signed-off-by: zhangyi (F) Link: https://lore.kernel.org/r/20200620025427.1756360-6-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c77b10257b36..8c00f0a09f4d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1288,8 +1288,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, if (!page_has_buffers(page)) return 0; if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_DIRECT_RECLAIM); + return jbd2_journal_try_to_free_buffers(journal, page); + return try_to_free_buffers(page); } -- cgit From 3d392b2676bf3199863a1e5efb2c087ad9d442a4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Jul 2020 00:14:40 -0400 Subject: ext4: add prefetch_block_bitmaps mount option For file systems where we can afford to keep the buddy bitmaps cached, we can speed up initial writes to large file systems by starting to load the block allocation bitmaps as soon as the file system is mounted. This won't work well for _super_ large file systems, or memory constrained systems, so we only enable this when it is requested via a mount option. Addresses-Google-Bug: 159488342 Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/super.c | 57 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 16 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8c00f0a09f4d..1cbe3f248faf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1521,6 +1521,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, + Opt_prefetch_block_bitmaps, }; static const match_table_t tokens = { @@ -1612,6 +1613,7 @@ static const match_table_t tokens = { {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ + {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1829,6 +1831,8 @@ static const struct mount_opts { {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + MOPT_SET}, {Opt_err, 0, 0} }; @@ -3201,15 +3205,34 @@ static void print_daily_error_info(struct timer_list *t) static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; - ext4_group_t group, ngroups; - struct super_block *sb; + struct super_block *sb = elr->lr_super; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t group = elr->lr_next_group; unsigned long timeout = 0; + unsigned int prefetch_ios = 0; int ret = 0; - sb = elr->lr_super; - ngroups = EXT4_SB(sb)->s_groups_count; + if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { + elr->lr_next_group = ext4_mb_prefetch(sb, group, + EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); + if (prefetch_ios) + ext4_mb_prefetch_fini(sb, elr->lr_next_group, + prefetch_ios); + trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, + prefetch_ios); + if (group >= elr->lr_next_group) { + ret = 1; + if (elr->lr_first_not_zeroed != ngroups && + !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + elr->lr_next_group = elr->lr_first_not_zeroed; + elr->lr_mode = EXT4_LI_MODE_ITABLE; + ret = 0; + } + } + return ret; + } - for (group = elr->lr_next_group; group < ngroups; group++) { + for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; @@ -3227,9 +3250,10 @@ static int ext4_run_li_request(struct ext4_li_request *elr) timeout = jiffies; ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); + trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { timeout = (jiffies - timeout) * - elr->lr_sbi->s_li_wait_mult; + EXT4_SB(elr->lr_super)->s_li_wait_mult; elr->lr_timeout = timeout; } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -3244,15 +3268,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr) */ static void ext4_remove_li_request(struct ext4_li_request *elr) { - struct ext4_sb_info *sbi; - if (!elr) return; - sbi = elr->lr_sbi; - list_del(&elr->lr_request); - sbi->s_li_request = NULL; + EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } @@ -3461,7 +3481,6 @@ static int ext4_li_info_new(void) static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { - struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); @@ -3469,8 +3488,13 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, return NULL; elr->lr_super = sb; - elr->lr_sbi = sbi; - elr->lr_next_group = start; + elr->lr_first_not_zeroed = start; + if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; + else { + elr->lr_mode = EXT4_LI_MODE_ITABLE; + elr->lr_next_group = start; + } /* * Randomize first schedule time of the request to @@ -3500,8 +3524,9 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (first_not_zeroed == ngroups || sb_rdonly(sb) || - !test_opt(sb, INIT_INODE_TABLE)) + if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || sb_rdonly(sb) || + !test_opt(sb, INIT_INODE_TABLE))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); -- cgit From f25391ebb475d3ffb3aa61bb90e3594c841749ef Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 23 Jul 2020 17:05:26 +0200 Subject: ext4: handle option set by mount flags correctly Currently there is a problem with mount options that can be both set by vfs using mount flags or by a string parsing in ext4. i_version/iversion options gets lost after remount, for example $ mount -o i_version /dev/pmem0 /mnt $ grep pmem0 /proc/self/mountinfo | grep i_version 310 95 259:0 / /mnt rw,relatime shared:163 - ext4 /dev/pmem0 rw,seclabel,i_version $ mount -o remount,ro /mnt $ grep pmem0 /proc/self/mountinfo | grep i_version nolazytime gets ignored by ext4 on remount, for example $ mount -o lazytime /dev/pmem0 /mnt $ grep pmem0 /proc/self/mountinfo | grep lazytime 310 95 259:0 / /mnt rw,relatime shared:163 - ext4 /dev/pmem0 rw,lazytime,seclabel $ mount -o remount,nolazytime /mnt $ grep pmem0 /proc/self/mountinfo | grep lazytime 310 95 259:0 / /mnt rw,relatime shared:163 - ext4 /dev/pmem0 rw,lazytime,seclabel Fix it by applying the SB_LAZYTIME and SB_I_VERSION flags from *flags to s_flags before we parse the option and use the resulting state of the same flags in *flags at the end of successful remount. Signed-off-by: Lukas Czerner Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20200723150526.19931-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1cbe3f248faf..a6de88fc7060 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5512,7 +5512,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long old_sb_flags; + unsigned long old_sb_flags, vfs_flags; struct ext4_mount_options old_opts; int enable_quota = 0; ext4_group_t g; @@ -5555,6 +5555,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + /* + * Some options can be enabled by ext4 and/or by VFS mount flag + * either way we need to make sure it matches in both *flags and + * s_flags. Copy those selected flags from *flags to s_flags + */ + vfs_flags = SB_LAZYTIME | SB_I_VERSION; + sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; @@ -5608,9 +5616,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if (*flags & SB_LAZYTIME) - sb->s_flags |= SB_LAZYTIME; - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; @@ -5758,7 +5763,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + /* + * Some options can be enabled by ext4 and/or by VFS mount flag + * either way we need to make sure it matches in both *flags and + * s_flags. Copy those selected flags from s_flags to *flags + */ + *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); return 0; -- cgit From 1cf006ed19a887c22e085189c8b4a3cbf60d2246 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 25 Jul 2020 12:33:13 +0000 Subject: ext4: export msg_count and warning_count via sysfs This numbers can be analized by system automation similar to errors_count. In ideal world it would be nice to have separate counters for different log-levels, but this makes this patch too intrusive. Signed-off-by: Dmitry Monakhov Link: https://lore.kernel.org/r/20200725123313.4467-1-dmtrmonakhov@yandex-team.ru Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a6de88fc7060..7deb2b75fcd0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -744,6 +744,7 @@ void __ext4_msg(struct super_block *sb, struct va_format vaf; va_list args; + atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; @@ -754,9 +755,12 @@ void __ext4_msg(struct super_block *sb, va_end(args); } -#define ext4_warning_ratelimit(sb) \ - ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ - "EXT4-fs warning") +static int ext4_warning_ratelimit(struct super_block *sb) +{ + atomic_inc(&EXT4_SB(sb)->s_warning_count); + return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning"); +} void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) @@ -4840,6 +4844,8 @@ no_journal: ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + atomic_set(&sbi->s_warning_count, 0); + atomic_set(&sbi->s_msg_count, 0); kfree(orig_data); return 0; -- cgit From d176b1f62f242ab259ff665a26fbac69db1aecba Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jul 2020 15:04:32 +0200 Subject: ext4: handle error of ext4_setup_system_zone() on remount ext4_setup_system_zone() can fail. Handle the failure in ext4_remount(). Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200728130437.7804-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7deb2b75fcd0..6507e30e58b5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5747,7 +5747,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_register_li_request(sb, first_not_zeroed); } - ext4_setup_system_zone(sb); + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb, 1); if (err) -- cgit From 0f5bde1db174f6c471f0bd27198575719dabe3e5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jul 2020 15:04:37 +0200 Subject: ext4: correctly restore system zone info when remount fails When remounting filesystem fails late during remount handling and block_validity mount option is also changed during the remount, we fail to restore system zone information to a state matching the mount option. This is mostly harmless, just the block validity checking will not match the situation described by the mount option. Make sure these two are always consistent. Reported-by: Lukas Czerner Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200728130437.7804-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6507e30e58b5..0892d71a52bd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4727,11 +4727,13 @@ no_journal: ext4_set_resv_clusters(sb); - err = ext4_setup_system_zone(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize system " - "zone (%d)", err); - goto failed_mount4a; + if (test_opt(sb, BLOCK_VALIDITY)) { + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } } ext4_ext_init(sb); @@ -5747,9 +5749,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_register_li_request(sb, first_not_zeroed); } - err = ext4_setup_system_zone(sb); - if (err) - goto restore_opts; + /* + * Handle creation of system zone data early because it can fail. + * Releasing of existing data is done when we are sure remount will + * succeed. + */ + if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) { + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb, 1); @@ -5771,6 +5780,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } #endif + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb); /* * Some options can be enabled by ext4 and/or by VFS mount flag @@ -5792,6 +5803,8 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { -- cgit From 377254b2cd2252c7c3151b113cbdf93a7736c2e9 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Fri, 31 Jul 2020 12:10:25 -0400 Subject: fs: prevent BUG_ON in submit_bh_wbc() If a device is hot-removed --- for example, when a physical device is unplugged from pcie slot or a nbd device's network is shutdown --- this can result in a BUG_ON() crash in submit_bh_wbc(). This is because the when the block device dies, the buffer heads will have their Buffer_Mapped flag get cleared, leading to the crash in submit_bh_wbc. We had attempted to work around this problem in commit a17712c8 ("ext4: check superblock mapped prior to committing"). Unfortunately, it's still possible to hit the BUG_ON(!buffer_mapped(bh)) if the device dies between when the work-around check in ext4_commit_super() and when submit_bh_wbh() is finally called: Code path: ext4_commit_super judge if 'buffer_mapped(sbh)' is false, return <== commit a17712c8 lock_buffer(sbh) ... unlock_buffer(sbh) __sync_dirty_buffer(sbh,... lock_buffer(sbh) judge if 'buffer_mapped(sbh))' is false, return <== added by this patch submit_bh(...,sbh) submit_bh_wbc(...,sbh,...) [100722.966497] kernel BUG at fs/buffer.c:3095! <== BUG_ON(!buffer_mapped(bh))' in submit_bh_wbc() [100722.966503] invalid opcode: 0000 [#1] SMP [100722.966566] task: ffff8817e15a9e40 task.stack: ffffc90024744000 [100722.966574] RIP: 0010:submit_bh_wbc+0x180/0x190 [100722.966575] RSP: 0018:ffffc90024747a90 EFLAGS: 00010246 [100722.966576] RAX: 0000000000620005 RBX: ffff8818a80603a8 RCX: 0000000000000000 [100722.966576] RDX: ffff8818a80603a8 RSI: 0000000000020800 RDI: 0000000000000001 [100722.966577] RBP: ffffc90024747ac0 R08: 0000000000000000 R09: ffff88207f94170d [100722.966578] R10: 00000000000437c8 R11: 0000000000000001 R12: 0000000000020800 [100722.966578] R13: 0000000000000001 R14: 000000000bf9a438 R15: ffff88195f333000 [100722.966580] FS: 00007fa2eee27700(0000) GS:ffff88203d840000(0000) knlGS:0000000000000000 [100722.966580] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [100722.966581] CR2: 0000000000f0b008 CR3: 000000201a622003 CR4: 00000000007606e0 [100722.966582] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [100722.966583] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [100722.966583] PKRU: 55555554 [100722.966583] Call Trace: [100722.966588] __sync_dirty_buffer+0x6e/0xd0 [100722.966614] ext4_commit_super+0x1d8/0x290 [ext4] [100722.966626] __ext4_std_error+0x78/0x100 [ext4] [100722.966635] ? __ext4_journal_get_write_access+0xca/0x120 [ext4] [100722.966646] ext4_reserve_inode_write+0x58/0xb0 [ext4] [100722.966655] ? ext4_dirty_inode+0x48/0x70 [ext4] [100722.966663] ext4_mark_inode_dirty+0x53/0x1e0 [ext4] [100722.966671] ? __ext4_journal_start_sb+0x6d/0xf0 [ext4] [100722.966679] ext4_dirty_inode+0x48/0x70 [ext4] [100722.966682] __mark_inode_dirty+0x17f/0x350 [100722.966686] generic_update_time+0x87/0xd0 [100722.966687] touch_atime+0xa9/0xd0 [100722.966690] generic_file_read_iter+0xa09/0xcd0 [100722.966694] ? page_cache_tree_insert+0xb0/0xb0 [100722.966704] ext4_file_read_iter+0x4a/0x100 [ext4] [100722.966707] ? __inode_security_revalidate+0x4f/0x60 [100722.966709] __vfs_read+0xec/0x160 [100722.966711] vfs_read+0x8c/0x130 [100722.966712] SyS_pread64+0x87/0xb0 [100722.966716] do_syscall_64+0x67/0x1b0 [100722.966719] entry_SYSCALL64_slow_path+0x25/0x25 To address this, add the check of 'buffer_mapped(bh)' to __sync_dirty_buffer(). This also has the benefit of fixing this for other file systems. With this addition, we can drop the workaround in ext4_commit_supper(). [ Commit description rewritten by tytso. ] Signed-off-by: Xianting Tian Link: https://lore.kernel.org/r/1596211825-8750-1-git-send-email-xianting_tian@126.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0892d71a52bd..daa94c7f7271 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5235,13 +5235,6 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; - /* - * The superblock bh should be mapped, but it might not be if the - * device was hot-removed. Not much we can do but fail the I/O. - */ - if (!buffer_mapped(sbh)) - return error; - /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock -- cgit From 27bc446e2def38db3244a6eb4bb1d6312936610a Mon Sep 17 00:00:00 2001 From: brookxu Date: Mon, 17 Aug 2020 15:36:15 +0800 Subject: ext4: limit the length of per-inode prealloc list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the scenario of writing sparse files, the per-inode prealloc list may be very long, resulting in high overhead for ext4_mb_use_preallocated(). To circumvent this problem, we limit the maximum length of per-inode prealloc list to 512 and allow users to modify it. After patching, we observed that the sys ratio of cpu has dropped, and the system throughput has increased significantly. We created a process to write the sparse file, and the running time of the process on the fixed kernel was significantly reduced, as follows: Running time on unfixed kernel: [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat real 0m2.051s user 0m0.008s sys 0m2.026s Running time on fixed kernel: [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat real 0m0.471s user 0m0.004s sys 0m0.395s Signed-off-by: Chunguang Xu Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index daa94c7f7271..13bdddc081e0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1127,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); + atomic_set(&ei->i_prealloc_active, 0); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); @@ -1220,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { -- cgit