From fc626fe322f13ecdfac58cbfa6cfea797ed22623 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:57 -0700 Subject: fs/ext4: Change EXT4_MOUNT_DAX to EXT4_MOUNT_DAX_ALWAYS In prep for the new tri-state mount option which then introduces EXT4_MOUNT_DAX_NEVER. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-4-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..1a3daf2d18ef 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1123,9 +1123,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX -#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else -#define EXT4_MOUNT_DAX 0 +#define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ -- cgit From a8ab6d3885ef5e2300d683b79a9e1999403eefd9 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:58 -0700 Subject: fs/ext4: Update ext4_should_use_dax() S_DAX should only be enabled when the underlying block device supports dax. Cache the underlying support for DAX in the super block and modify ext4_should_use_dax() to check for device support prior to the over riding mount option. While we are at it change the function to ext4_should_enable_dax() as this better reflects the ask as well as matches xfs. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-5-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1a3daf2d18ef..0b4db9ce7756 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1979,6 +1979,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) */ #define EXT4_FLAGS_RESIZING 0 #define EXT4_FLAGS_SHUTDOWN 1 +#define EXT4_FLAGS_BDEV_IS_DAX 2 static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) { -- cgit From 043546e46dc70c25ff7e2cf6d09cbb0424fc9978 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:59 -0700 Subject: fs/ext4: Only change S_DAX on inode load To prevent complications with in memory inodes we only set S_DAX on inode load. FS_XFLAG_DAX can be changed at any time and S_DAX will change after inode eviction and reload. Add init bool to ext4_set_inode_flags() to indicate if the inode is being newly initialized. Assert that S_DAX is not set on an inode which is just being loaded. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-6-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0b4db9ce7756..f5291693ce6e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2693,7 +2693,7 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern void ext4_set_inode_flags(struct inode *); +extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -- cgit From 9cb20f94afcd2964944f9468e38da736ee855b19 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:00 -0700 Subject: fs/ext4: Make DAX mount option a tri-state We add 'always', 'never', and 'inode' (default). '-o dax' continues to operate the same which is equivalent to 'always'. This new functionality is limited to ext4 only. Specifically we introduce a 2nd DAX mount flag EXT4_MOUNT2_DAX_NEVER and set it and EXT4_MOUNT_DAX_ALWAYS appropriately for the mode. We also force EXT4_MOUNT2_DAX_NEVER if !CONFIG_FS_DAX. Finally, EXT4_MOUNT2_DAX_INODE is used solely to detect if the user specified that option for printing. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-7-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f5291693ce6e..65ffb831b2b9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1168,6 +1168,8 @@ struct ext4_inode_info { blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ -- cgit From b383a73f2b832491a2f9e6e8ada26aad53b5763d Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:02 -0700 Subject: fs/ext4: Introduce DAX inode flag Add a flag ([EXT4|FS]_DAX_FL) to preserve FS_XFLAG_DAX in the ext4 inode. Set the flag to be user visible and changeable. Set the flag to be inherited. Allow applications to change the flag at any time except if it conflicts with the set of mutually exclusive flags (Currently VERITY, ENCRYPT, JOURNAL_DATA). Furthermore, restrict setting any of the exclusive flags if DAX is set. While conceptually possible, we do not allow setting EXT4_DAX_FL while at the same time clearing exclusion flags (or vice versa) for 2 reasons: 1) The DAX flag does not take effect immediately which introduces quite a bit of complexity 2) There is no clear use case for being this flexible Finally, on regular files, flag the inode to not be cached to facilitate changing S_DAX on the next creation of the inode. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-9-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 65ffb831b2b9..598e00a9453f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -415,13 +415,16 @@ struct flex_groups { #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ @@ -429,14 +432,16 @@ struct flex_groups { EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ - EXT4_PROJINHERIT_FL) + EXT4_PROJINHERIT_FL | \ + EXT4_DAX_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ - EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ @@ -448,6 +453,10 @@ struct flex_groups { /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -488,6 +497,7 @@ enum { EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ -- cgit From 2a12e147da38807f8aab03c92de99a70ab983a96 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 12 Jul 2020 20:10:12 -0700 Subject: ext4: don't hardcode bit values in EXT4_FL_USER_* Define the EXT4_FL_USER_* constants by OR-ing together the appropriate flags, rather than hard-coding a numeric value. This makes it much easier to see which flags are listed. No change in the actual values. Signed-off-by: Eric Biggers Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200713031012.192440-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 42f5060f3cdf..b603a28a3696 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -434,8 +434,34 @@ struct flex_groups { #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ +/* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ + EXT4_UNRM_FL | \ + EXT4_COMPR_FL | \ + EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_JOURNAL_DATA_FL | \ + EXT4_NOTAIL_FL | \ + EXT4_DIRSYNC_FL | \ + EXT4_TOPDIR_FL | \ + EXT4_EXTENTS_FL | \ + 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ + EXT4_DAX_FL | \ + EXT4_PROJINHERIT_FL | \ + EXT4_CASEFOLD_FL) + +/* User visible flags */ +#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ + EXT4_DIRTY_FL | \ + EXT4_COMPRBLK_FL | \ + EXT4_NOCOMPR_FL | \ + EXT4_ENCRYPT_FL | \ + EXT4_INDEX_FL | \ + EXT4_VERITY_FL | \ + EXT4_INLINE_DATA_FL) /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ -- cgit From cb29a02d3a9d51c749f556b3bbf6551fbc0454eb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 14 Jul 2020 16:09:09 -0700 Subject: ext4: use generic names for generic ioctls Don't define EXT4_IOC_* aliases to ioctls that already have a generic FS_IOC_* name. These aliases are unnecessary, and they make it unclear which ioctls are ext4-specific and which are generic. Exception: leave EXT4_IOC_GETVERSION_OLD and EXT4_IOC_SETVERSION_OLD as-is for now, since renaming them to FS_IOC_GETVERSION and FS_IOC_SETVERSION would probably make them more likely to be confused with EXT4_IOC_GETVERSION and EXT4_IOC_SETVERSION which also exist. Signed-off-by: Eric Biggers Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200714230909.56349-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b603a28a3696..dd5e6c645442 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -463,7 +463,7 @@ struct flex_groups { EXT4_VERITY_FL | \ EXT4_INLINE_DATA_FL) -/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +/* Flags we can manipulate with through FS_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ @@ -695,8 +695,6 @@ enum { /* * ioctl commands */ -#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS -#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS #define EXT4_IOC_GETVERSION _IOR('f', 3, long) #define EXT4_IOC_SETVERSION _IOW('f', 4, long) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION @@ -713,17 +711,11 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY -#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT -#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) -#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR - #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) /* @@ -748,8 +740,6 @@ enum { /* * ioctl commands in 32 bit emulation */ -#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) -- cgit From cfd73237722135807967f389bcbda558a60a30d6 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Tue, 21 Apr 2020 10:54:07 +0300 Subject: ext4: add prefetching for block allocation bitmaps This should significantly improve bitmap loading, especially for flex groups as it tries to load all bitmaps within a flex.group instead of one by one synchronously. Prefetching is done in 8 * flex_bg groups, so it should be 8 read-ahead reads for a single allocating thread. At the end of allocation the thread waits for read-ahead completion and initializes buddy information so that read-aheads are not lost in case of memory pressure. At cr=0 the number of prefetching IOs is limited per allocation context to prevent a situation when mballoc loads thousands of bitmaps looking for a perfect group and ignoring groups with good chunks. Together with the patch "ext4: limit scanning of uninitialized groups" the mount time (which includes few tiny allocations) of a 1PB filesystem is reduced significantly: 0% full 50%-full unpatched patched mount time 33s 9279s 563s [ Restructured by tytso; removed the state flags in the allocation context, so it can be used to lazily prefetch the allocation bitmaps immediately after the file system is mounted. Skip prefetching block groups which are uninitialized. Finally pass in the REQ_RAHEAD flag to the block layer while prefetching. ] Signed-off-by: Alex Zhuravlev Reviewed-by: Andreas Dilger Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index dd5e6c645442..4fba138b8722 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1521,6 +1521,8 @@ struct ext4_sb_info { /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; + unsigned int s_mb_prefetch; + unsigned int s_mb_prefetch_limit; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -2462,7 +2464,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, - ext4_group_t block_group); + ext4_group_t block_group, + bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); @@ -3161,6 +3164,7 @@ struct ext4_group_info { (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) @@ -3175,6 +3179,8 @@ struct ext4_group_info { (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 -- cgit From bc71726c725767205757821df364acff87f92ac5 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 20 Jun 2020 10:54:23 +0800 Subject: ext4: abort the filesystem if failed to async write metadata buffer There is a risk of filesystem inconsistency if we failed to async write back metadata buffer in the background. Because of current buffer's end io procedure is handled by end_buffer_async_write() in the block layer, and it only clear the buffer's uptodate flag and mark the write_io_error flag, so ext4 cannot detect such failure immediately. In most cases of getting metadata buffer (e.g. ext4_read_inode_bitmap()), although the buffer's data is actually uptodate, it may still read data from disk because the buffer's uptodate flag has been cleared. Finally, it may lead to on-disk filesystem inconsistency if reading old data from the disk successfully and write them out again. This patch detect bdev mapping->wb_err when getting journal's write access and mark the filesystem error if bdev's mapping->wb_err was increased, this could prevent further writing and potential inconsistency. Signed-off-by: zhangyi (F) Suggested-by: Jan Kara Link: https://lore.kernel.org/r/20200620025427.1756360-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4fba138b8722..26ae31a994a2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1603,6 +1603,9 @@ struct ext4_sb_info { #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif + /* Record the errseq of the backing block device */ + errseq_t s_bdev_wb_err; + spinlock_t s_bdev_wb_lock; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) -- cgit From 3d392b2676bf3199863a1e5efb2c087ad9d442a4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Jul 2020 00:14:40 -0400 Subject: ext4: add prefetch_block_bitmaps mount option For file systems where we can afford to keep the buddy bitmaps cached, we can speed up initial writes to large file systems by starting to load the block allocation bitmaps as soon as the file system is mounted. This won't work well for _super_ large file systems, or memory constrained systems, so we only enable this when it is requested via a mount option. Addresses-Google-Bug: 159488342 Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/ext4.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 26ae31a994a2..e5f0c66a6156 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1188,6 +1188,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -2334,9 +2335,15 @@ struct ext4_lazy_init { struct mutex li_list_mtx; }; +enum ext4_li_mode { + EXT4_LI_MODE_PREFETCH_BBITMAP, + EXT4_LI_MODE_ITABLE, +}; + struct ext4_li_request { struct super_block *lr_super; - struct ext4_sb_info *lr_sbi; + enum ext4_li_mode lr_mode; + ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; @@ -2676,6 +2683,12 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); +extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, + ext4_group_t group, + unsigned int nr, int *cnt); +extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr); + extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); -- cgit From 6dbd300129089ad57de20be89ab7c122b36a4c26 Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Thu, 23 Jul 2020 23:29:54 -0400 Subject: ext4: remove some redundant function declarations ext4 update feature functions do not exist now, remove these useless function declarations. Signed-off-by: Shijie Luo Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20200724032954.22097-1-luoshijie1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e5f0c66a6156..32260ceb9cb5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2959,12 +2959,6 @@ do { \ #endif -extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, - __u32 compat); -extern int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat); -extern int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat); extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, -- cgit From 1cf006ed19a887c22e085189c8b4a3cbf60d2246 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 25 Jul 2020 12:33:13 +0000 Subject: ext4: export msg_count and warning_count via sysfs This numbers can be analized by system automation similar to errors_count. In ideal world it would be nice to have separate counters for different log-levels, but this makes this patch too intrusive. Signed-off-by: Dmitry Monakhov Link: https://lore.kernel.org/r/20200725123313.4467-1-dmtrmonakhov@yandex-team.ru Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 32260ceb9cb5..c73b9a1959a9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1591,6 +1591,8 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + atomic_t s_warning_count; + atomic_t s_msg_count; /* Encryption context for '-o test_dummy_encryption' */ struct fscrypt_dummy_context s_dummy_enc_ctx; -- cgit From ce9f24cccdc019229b70a5c15e2b09ad9c0ab5d1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jul 2020 15:04:34 +0200 Subject: ext4: check journal inode extents more carefully Currently, system zones just track ranges of block, that are "important" fs metadata (bitmaps, group descriptors, journal blocks, etc.). This however complicates how extent tree (or indirect blocks) can be checked for inodes that actually track such metadata - currently the journal inode but arguably we should be treating quota files or resize inode similarly. We cannot run __ext4_ext_check() on such metadata inodes when loading their extents as that would immediately trigger the validity checks and so we just hack around that and special-case the journal inode. This however leads to a situation that a journal inode which has extent tree of depth at least one can have invalid extent tree that gets unnoticed until ext4_cache_extents() crashes. To overcome this limitation, track inode number each system zone belongs to (0 is used for zones not belonging to any inode). We can then verify inode number matches the expected one when verifying extent tree and thus avoid the false errors. With this there's no need to to special-case journal inode during extent tree checking anymore so remove it. Fixes: 0a944e8a6c66 ("ext4: don't perform block validity checks on the journal inode") Reported-by: Wolfgang Frisch Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200728130437.7804-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c73b9a1959a9..3994f012a9de 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3397,9 +3397,9 @@ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); -extern int ext4_data_block_valid(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); -- cgit From 2fe34d2938181b26f86bceb7b87fbf7370bd92c4 Mon Sep 17 00:00:00 2001 From: Kyoungho Koo Date: Mon, 10 Aug 2020 17:07:05 +0900 Subject: ext4: remove unused parameter of ext4_generic_delete_entry function The ext4_generic_delete_entry function does not use the parameter handle, so it can be removed. Signed-off-by: Kyoungho Koo Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20200810080701.GA14160@koo-Z370-HD3 Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3994f012a9de..71b4370a3f91 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2802,8 +2802,7 @@ extern int ext4_search_dir(struct buffer_head *bh, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); -extern int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, +extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, -- cgit From 27bc446e2def38db3244a6eb4bb1d6312936610a Mon Sep 17 00:00:00 2001 From: brookxu Date: Mon, 17 Aug 2020 15:36:15 +0800 Subject: ext4: limit the length of per-inode prealloc list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the scenario of writing sparse files, the per-inode prealloc list may be very long, resulting in high overhead for ext4_mb_use_preallocated(). To circumvent this problem, we limit the maximum length of per-inode prealloc list to 512 and allow users to modify it. After patching, we observed that the sys ratio of cpu has dropped, and the system throughput has increased significantly. We created a process to write the sparse file, and the running time of the process on the fixed kernel was significantly reduced, as follows: Running time on unfixed kernel: [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat real 0m2.051s user 0m0.008s sys 0m2.026s Running time on fixed kernel: [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat real 0m0.471s user 0m0.004s sys 0m0.395s Signed-off-by: Chunguang Xu Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 71b4370a3f91..523e00d7b392 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1070,6 +1070,7 @@ struct ext4_inode_info { struct timespec64 i_crtime; /* mballoc */ + atomic_t i_prealloc_active; struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; @@ -1518,6 +1519,7 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; @@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern int ext4_mb_reserve_blocks(struct super_block *, int); -extern void ext4_discard_preallocations(struct inode *); +extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, -- cgit