Diffstat (limited to 'fs/ext4')
42 files changed, 8293 insertions, 5883 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index e20d59221fc0..01873c2a34ad 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,38 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0-only
-# Ext3 configs are here for backward compatibility with old configs which may
-# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
-# kernels after the removal of ext3 driver.
-config EXT3_FS
-	tristate "The Extended 3 (ext3) filesystem"
-	select EXT4_FS
-	help
-	  This config option is here only for backward compatibility. ext3
-	  filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_POSIX_ACL
-	bool "Ext3 POSIX Access Control Lists"
-	depends on EXT3_FS
-	select EXT4_FS_POSIX_ACL
-	select FS_POSIX_ACL
-	help
-	  This config option is here only for backward compatibility. ext3
-	  filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_SECURITY
-	bool "Ext3 Security Labels"
-	depends on EXT3_FS
-	select EXT4_FS_SECURITY
-	help
-	  This config option is here only for backward compatibility. ext3
-	  filesystem is now handled by the ext4 driver.
-
 config EXT4_FS
 	tristate "The Extended 4 (ext4) filesystem"
 	select BUFFER_HEAD
 	select JBD2
 	select CRC16
-	select CRYPTO
-	select CRYPTO_CRC32C
+	select CRC32
 	select FS_IOMAP
 	select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
 	help
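Note on the Kconfig hunk above: the crypto-API dependencies (CRYPTO, CRYPTO_CRC32C) become a plain CRC32 select because, as the ext4.h hunks further down show, ext4_chksum() stops going through a crypto_shash transform and calls the crc32c() library function directly. The seed chaining ext4 relies on (s_csum_seed feeding each call) works because the CRC update is a pure left-fold over the input bytes. A minimal userspace sketch of that property -- not part of the patch, using the reflected CRC32C (Castagnoli) polynomial with no pre/post inversion so the chaining equality holds exactly:

#include <stdint.h>
#include <stdio.h>

/* Bitwise reflected CRC32C update; illustrative only. */
static uint32_t crc32c_update(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char msg[] = "ext4 metadata";
	uint32_t seed = 0xdeadbeef;	/* stand-in for sbi->s_csum_seed */

	/* One-shot checksum ... */
	uint32_t whole = crc32c_update(seed, msg, sizeof(msg) - 1);
	/* ... equals the same bytes fed through two chained calls. */
	uint32_t part = crc32c_update(seed, msg, 4);
	part = crc32c_update(part, msg + 4, sizeof(msg) - 1 - 4);

	printf("%08x %08x\n", whole, part);	/* prints identical values */
	return 0;
}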
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1f72f977c6db..8040c731b3e4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -22,6 +22,7 @@
 #include "mballoc.h"
 
 #include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
 
 static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
 					    ext4_group_t block_group);
@@ -111,10 +112,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
 	itbl_blk_start = ext4_inode_table(sb, gdp);
 	itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
 	if (itbl_blk_start <= end && itbl_blk_end >= start) {
-		itbl_blk_start = itbl_blk_start >= start ?
-				 itbl_blk_start : start;
-		itbl_blk_end = itbl_blk_end <= end ?
-			       itbl_blk_end : end;
+		itbl_blk_start = max(itbl_blk_start, start);
+		itbl_blk_end = min(itbl_blk_end, end);
 
 		itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
 		itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);
@@ -274,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct buffer_head *bh_p;
 
+	KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
+				   sb, block_group, bh);
+
 	if (block_group >= ngroups) {
 		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
 			   " groups_count = %u", block_group, ngroups);
@@ -468,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
 	ext4_fsblk_t bitmap_blk;
 	int err;
 
+	KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
+				   sb, block_group, ignore_locked);
+
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
 		return ERR_PTR(-EFSCORRUPTED);
@@ -545,7 +550,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
 	trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
 	ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
 			    (ignore_locked ? REQ_RAHEAD : 0),
-			    ext4_end_bitmap_read);
+			    ext4_end_bitmap_read,
+			    ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO));
 	return bh;
 verify:
 	err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -563,13 +569,15 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 {
 	struct ext4_group_desc *desc;
 
+	KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
+				   sb, block_group, bh);
+
 	if (!buffer_new(bh))
 		return 0;
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
 		return -EFSCORRUPTED;
 	wait_on_buffer(bh);
-	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
 	if (!buffer_uptodate(bh)) {
 		ext4_error_err(sb, EIO, "Cannot read block bitmap - "
 			       "block_group = %u, block_bitmap = %llu",
@@ -641,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 	/* Hm, nope.  Are (enough) root reserved clusters available? */
 	if (uid_eq(sbi->s_resuid, current_fsuid()) ||
 	    (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE) ||
-	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+	    capable(CAP_SYS_RESOURCE)) {
 
 		if (free_clusters >= (nclusters + dirty_clusters +
 				      resv_clusters))
@@ -695,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	 * possible we just missed a transaction commit that did so
 	 */
 	smp_mb();
-	if (sbi->s_mb_free_pending == 0) {
+	if (atomic_read(&sbi->s_mb_free_pending) == 0) {
 		if (test_opt(sb, DISCARD)) {
 			atomic_inc(&sbi->s_retry_alloc_pending);
 			flush_work(&sbi->s_discard_work);
@@ -744,7 +752,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	*count = ar.len;
 	/*
 	 * Account for the allocated meta blocks.  We will never
-	 * fail EDQUOT for metdata, but we do account for it.
+	 * fail EDQUOT for metadata, but we do account for it.
 	 */
 	if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
 		dquot_alloc_block_nofail(inode,
@@ -913,11 +921,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 }
 
 /*
- * This function returns the number of file system metadata clusters at
+ * This function returns the number of file system metadata blocks at
  * the beginning of a block group, including the reserved gdt blocks.
  */
-static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
-				     ext4_group_t block_group)
+unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
+				       ext4_group_t block_group)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	unsigned num;
@@ -935,8 +943,15 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
 	} else { /* For META_BG_BLOCK_GROUPS */
 		num += ext4_bg_num_gdb_meta(sb, block_group);
 	}
-	return EXT4_NUM_B2C(sbi, num);
+	return num;
 }
+
+static unsigned int ext4_num_base_meta_clusters(struct super_block *sb,
+						ext4_group_t block_group)
+{
+	return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group));
+}
+
 /**
  * ext4_inode_to_goal_block - return a hint for block allocation
  * @inode: inode for block allocation
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index cd725bebe69e..87760fabdd2e 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -18,17 +18,19 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
 
 int ext4_inode_bitmap_csum_verify(struct super_block *sb,
 				  struct ext4_group_desc *gdp,
-				  struct buffer_head *bh, int sz)
+				  struct buffer_head *bh)
 {
 	__u32 hi;
 	__u32 provided, calculated;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int sz;
 
-	if (!ext4_has_metadata_csum(sb))
+	if (!ext4_has_feature_metadata_csum(sb))
 		return 1;
 
+	sz = EXT4_INODES_PER_GROUP(sb) >> 3;
 	provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
-	calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
 	if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
 		hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
 		provided |= (hi << 16);
@@ -40,15 +42,17 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
 
 void ext4_inode_bitmap_csum_set(struct super_block *sb,
 				struct ext4_group_desc *gdp,
-				struct buffer_head *bh, int sz)
+				struct buffer_head *bh)
 {
 	__u32 csum;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int sz;
 
-	if (!ext4_has_metadata_csum(sb))
+	if (!ext4_has_feature_metadata_csum(sb))
 		return;
 
-	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	sz = EXT4_INODES_PER_GROUP(sb) >> 3;
+	csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
 	gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
 	if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
 		gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
@@ -63,11 +67,11 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
 
-	if (!ext4_has_metadata_csum(sb))
+	if (!ext4_has_feature_metadata_csum(sb))
 		return 1;
 
 	provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
-	calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
 	if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
 		hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
 		provided |= (hi << 16);
@@ -85,10 +89,10 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
 	__u32 csum;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!ext4_has_metadata_csum(sb))
+	if (!ext4_has_feature_metadata_csum(sb))
 		return;
 
-	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+	csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
 	gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
 	if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
 		gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
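The bitmap.c hunks above keep the long-standing on-disk layout: a 32-bit checksum split across the 16-bit bg_*_csum_lo/bg_*_csum_hi descriptor fields, with the hi half present only when the group descriptor is large enough. A rough userspace sketch of that split -- illustrative only; endianness conversions are omitted, and the low-16-bit comparison for small descriptors is an assumption about a branch elided from the hunks:

#include <stdint.h>
#include <stdio.h>

struct fake_gdp {		/* stand-in for the ext4_group_desc fields */
	uint16_t csum_lo;
	uint16_t csum_hi;
};

static void csum_set(struct fake_gdp *gdp, uint32_t csum, int has_hi)
{
	gdp->csum_lo = csum & 0xFFFF;
	if (has_hi)
		gdp->csum_hi = csum >> 16;
}

static int csum_verify(const struct fake_gdp *gdp, uint32_t calculated,
		       int has_hi)
{
	uint32_t provided = gdp->csum_lo;

	if (has_hi)
		provided |= (uint32_t)gdp->csum_hi << 16;
	else
		calculated &= 0xFFFF;	/* only the low half is stored */
	return provided == calculated;
}

int main(void)
{
	struct fake_gdp gdp = { 0 };
	uint32_t csum = 0x89ABCDEF;

	csum_set(&gdp, csum, 1);
	printf("full: %d\n", csum_verify(&gdp, csum, 1));	/* 1 */
	csum_set(&gdp, csum, 0);
	printf("lo16: %d\n", csum_verify(&gdp, csum, 0));	/* 1 */
	return 0;
}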
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 5504f72bbbbe..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
 {
 	struct ext4_system_zone *new_entry, *entry;
 	struct rb_node **n = &system_blks->root.rb_node, *node;
-	struct rb_node *parent = NULL, *new_node = NULL;
+	struct rb_node *parent = NULL, *new_node;
 
 	while (*n) {
 		parent = *n;
@@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_block *sb)
 	struct ext4_system_blocks *system_blks;
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
-	int flex_size = ext4_flex_bg_size(sbi);
 	int ret;
 
 	system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
@@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_block *sb)
 		return -ENOMEM;
 
 	for (i=0; i < ngroups; i++) {
+		unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i);
+
 		cond_resched();
-		if (ext4_bg_has_super(sb, i) &&
-		    ((i < 5) || ((i % flex_size) == 0))) {
+		if (meta_blks != 0) {
 			ret = add_system_zone(system_blks,
 					ext4_group_first_block_no(sb, i),
-					ext4_bg_num_gdb(sb, i) + 1, 0);
+					meta_blks, 0);
 			if (ret)
 				goto err;
 		}
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
 {
 	__le32 *bref = p;
 	unsigned int blk;
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 
-	if (ext4_has_feature_journal(inode->i_sb) &&
-	    (inode->i_ino ==
-	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+	if (journal && inode == journal->j_inode)
 		return 0;
 
 	while (bref < p+max) {
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index e20ac0654b3f..cf0a0970c095 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -31,9 +31,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
 
 	ext4_fname_from_fscrypt_name(fname, &name);
 
-#if IS_ENABLED(CONFIG_UNICODE)
 	err = ext4_fname_setup_ci_filename(dir, iname, fname);
-#endif
+	if (err)
+		ext4_fname_free_filename(fname);
+
 	return err;
 }
 
@@ -49,9 +50,9 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
 
 	ext4_fname_from_fscrypt_name(fname, &name);
 
-#if IS_ENABLED(CONFIG_UNICODE)
 	err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
-#endif
+	if (err)
+		ext4_fname_free_filename(fname);
 
 	return err;
 }
@@ -66,10 +67,7 @@ void ext4_fname_free_filename(struct ext4_filename *fname)
 	fname->usr_fname = NULL;
 	fname->disk_name.name = NULL;
 
-#if IS_ENABLED(CONFIG_UNICODE)
-	kfree(fname->cf_name.name);
-	fname->cf_name.name = NULL;
-#endif
+	ext4_fname_free_ci_filename(fname);
 }
 
 static bool uuid_is_zero(__u8 u[16])
@@ -228,19 +226,16 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
 	return ext4_has_feature_stable_inodes(sb);
 }
 
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
-				       int *ino_bits_ret, int *lblk_bits_ret)
-{
-	*ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
-	*lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
 const struct fscrypt_operations ext4_cryptops = {
-	.key_prefix		= "ext4:",
+	.inode_info_offs	= (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+				  (int)offsetof(struct ext4_inode_info, vfs_inode),
+	.needs_bounce_pages	= 1,
+	.has_32bit_inodes	= 1,
+	.supports_subblock_data_units = 1,
+	.legacy_key_prefix	= "ext4:",
 	.get_context		= ext4_get_context,
 	.set_context		= ext4_set_context,
 	.get_dummy_policy	= ext4_get_dummy_policy,
 	.empty_dir		= ext4_empty_dir,
 	.has_stable_inodes	= ext4_has_stable_inodes,
-	.get_ino_and_lblk_bits	= ext4_get_ino_and_lblk_bits,
 };
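In the fscrypt_operations hunk above, the new inode_info_offs field is the signed distance between the filesystem-private i_crypt_info pointer and the embedded vfs_inode, which lets generic code reach the private slot from a bare struct inode without a field in the generic structure. A hedged userspace sketch of that offset trick -- invented names, not the kernel API:

#include <stddef.h>
#include <stdio.h>

struct vfs_inode { int dummy; };

struct fs_inode {
	void *crypt_info;		/* like ext4_inode_info.i_crypt_info */
	struct vfs_inode vfs;		/* like ext4_inode_info.vfs_inode */
};

int main(void)
{
	struct fs_inode fsi = { .crypt_info = &fsi };
	/* offset of the private field relative to the embedded inode */
	int offs = (int)offsetof(struct fs_inode, crypt_info) -
		   (int)offsetof(struct fs_inode, vfs);

	/* Generic code only holds the embedded inode... */
	struct vfs_inode *inode = &fsi.vfs;
	/* ...and recovers the private slot by applying the offset. */
	void **slot = (void **)((char *)inode + offs);

	printf("%d\n", *slot == fsi.crypt_info ? 1 : 0);	/* prints 1 */
	return 0;
}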
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3985f8c33f95..256fe2c1d4c1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 						dir->i_sb->s_blocksize);
 	const int next_offset = ((char *) de - buf) + rlen;
 	bool fake = is_fake_dir_entry(de);
-	bool has_csum = ext4_has_metadata_csum(dir->i_sb);
+	bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);
 
 	if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
 		error_msg = "rec_len is smaller than minimal";
@@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	else if (unlikely(le32_to_cpu(de->inode) >
 			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
 		error_msg = "inode out of bounds";
+	else if (unlikely(next_offset == size && de->name_len == 1 &&
+			  de->name[0] == '.'))
+		error_msg = "'.' directory cannot be the last in data block";
 	else
 		return 0;
 
@@ -133,6 +136,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bh = NULL;
 	struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
+	struct dir_private_info *info = file->private_data;
 
 	err = fscrypt_prepare_readdir(inode);
 	if (err)
@@ -144,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 		return err;
 
 	/* Can we just clear INDEX flag to ignore htree information? */
-	if (!ext4_has_metadata_csum(sb)) {
+	if (!ext4_has_feature_metadata_csum(sb)) {
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it gets flushed back to the disk.
@@ -188,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 			continue;
 		}
 		if (err > 0) {
-			pgoff_t index = map.m_pblk >>
-					(PAGE_SHIFT - inode->i_blkbits);
+			pgoff_t index = map.m_pblk << inode->i_blkbits >>
+					PAGE_SHIFT;
 			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
-					sb->s_bdev->bd_inode->i_mapping,
-					&file->f_ra, file,
-					index, 1);
+					sb->s_bdev->bd_mapping,
+					&file->f_ra, file, index,
+					1 << EXT4_SB(sb)->s_min_folio_order);
 			file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
 			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
 			if (IS_ERR(bh)) {
@@ -229,7 +233,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure.
 		 */
-		if (!inode_eq_iversion(inode, file->f_version)) {
+		if (!inode_eq_iversion(inode, info->cookie)) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext4_dir_entry_2 *)
 					(bh->b_data + i);
@@ -249,7 +253,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 			offset = i;
 			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			file->f_version = inode_query_iversion(inode);
+			info->cookie = inode_query_iversion(inode);
 		}
 
 		while (ctx->pos < inode->i_size
@@ -279,12 +283,20 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 				struct fscrypt_str de_name =
 						FSTR_INIT(de->name,
 							  de->name_len);
+				u32 hash;
+				u32 minor_hash;
+
+				if (IS_CASEFOLDED(inode)) {
+					hash = EXT4_DIRENT_HASH(de);
+					minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+				} else {
+					hash = 0;
+					minor_hash = 0;
+				}
 
 				/* Directory is encrypted */
 				err = fscrypt_fname_disk_to_usr(inode,
-					EXT4_DIRENT_HASH(de),
-					EXT4_DIRENT_MINOR_HASH(de),
-					&de_name, &fstr);
+					hash, minor_hash, &de_name, &fstr);
 				de_name = fstr;
 				fstr.len = save_len;
 				if (err)
@@ -384,6 +396,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
 static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct dir_private_info *info = file->private_data;
 	int dx_dir = is_dx_dir(inode);
 	loff_t ret, htree_max = ext4_get_htree_eof(file);
 
@@ -392,7 +405,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
 				    htree_max, htree_max);
 	else
 		ret = ext4_llseek(file, offset, whence);
-	file->f_version = inode_peek_iversion(inode) - 1;
+	info->cookie = inode_peek_iversion(inode) - 1;
 	return ret;
 }
 
@@ -408,7 +421,7 @@ struct fname {
 	__u32		inode;
 	__u8		name_len;
 	__u8		file_type;
-	char		name[];
+	char		name[] __counted_by(name_len);
 };
 
 /*
@@ -429,18 +442,15 @@ static void free_rb_tree_fname(struct rb_root *root)
 	*root = RB_ROOT;
 }
 
-
-static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
-							   loff_t pos)
+static void ext4_htree_init_dir_info(struct file *filp, loff_t pos)
 {
-	struct dir_private_info *p;
-
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
-	if (!p)
-		return NULL;
-	p->curr_hash = pos2maj_hash(filp, pos);
-	p->curr_minor_hash = pos2min_hash(filp, pos);
-	return p;
+	struct dir_private_info *p = filp->private_data;
+
+	if (is_dx_dir(file_inode(filp)) && !p->initialized) {
+		p->curr_hash = pos2maj_hash(filp, pos);
+		p->curr_minor_hash = pos2min_hash(filp, pos);
+		p->initialized = true;
+	}
 }
 
 void ext4_htree_free_dir_info(struct dir_private_info *p)
@@ -464,14 +474,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 	struct rb_node **p, *parent = NULL;
 	struct fname *fname, *new_fn;
 	struct dir_private_info *info;
-	int len;
 
 	info = dir_file->private_data;
 	p = &info->root.rb_node;
 
 	/* Create and allocate the fname structure */
-	len = sizeof(struct fname) + ent_name->len + 1;
-	new_fn = kzalloc(len, GFP_KERNEL);
+	new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
+			 GFP_KERNEL);
 	if (!new_fn)
 		return -ENOMEM;
 	new_fn->hash = hash;
@@ -552,12 +561,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 	struct fname *fname;
 	int ret = 0;
 
-	if (!info) {
-		info = ext4_htree_create_dir_info(file, ctx->pos);
-		if (!info)
-			return -ENOMEM;
-		file->private_data = info;
-	}
+	ext4_htree_init_dir_info(file, ctx->pos);
 
 	if (ctx->pos == ext4_get_htree_eof(file))
 		return 0;	/* EOF */
@@ -590,10 +594,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 	 * cached entries.
 	 */
 	if ((!info->curr_node) ||
-	    !inode_eq_iversion(inode, file->f_version)) {
+	    !inode_eq_iversion(inode, info->cookie)) {
 		info->curr_node = NULL;
 		free_rb_tree_fname(&info->root);
-		file->f_version = inode_query_iversion(inode);
+		info->cookie = inode_query_iversion(inode);
 		ret = ext4_htree_fill_tree(file, info->curr_hash,
 					   info->curr_minor_hash,
 					   &info->next_hash);
@@ -664,7 +668,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
 	return 0;
 }
 
+static int ext4_dir_open(struct inode *inode, struct file *file)
+{
+	struct dir_private_info *info;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+	file->private_data = info;
+	return 0;
+}
+
 const struct file_operations ext4_dir_operations = {
+	.open		= ext4_dir_open,
 	.llseek		= ext4_dir_llseek,
 	.read		= generic_read_dir,
 	.iterate_shared	= ext4_readdir,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 481491e892df..56112f201cac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,7 +33,7 @@
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
-#include <crypto/hash.h>
+#include <linux/crc32c.h>
 #include <linux/falloc.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/fiemap.h>
@@ -157,7 +157,7 @@ enum criteria {
 
 	/*
 	 * Reads each block group sequentially, performing disk IO if
-	 * necessary, to find find_suitable block group. Tries to
+	 * necessary, to find suitable block group. Tries to
 	 * allocate goal length but might trim the request if nothing
 	 * is found after enough tries.
	 */
@@ -176,9 +176,6 @@ enum criteria {
 	EXT4_MB_NUM_CRS
 };
 
-/* criteria below which we use fast block scanning and avoid unnecessary IO */
-#define CR_FAST CR_GOAL_LEN_SLOW
-
 /*
  * Flags used in mballoc's allocation_context flags field.
  *
@@ -188,14 +185,8 @@ enum criteria {
 
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		0x0001
-/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED		0x0002
-/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA		0x0004
 /* first blocks in the file */
 #define EXT4_MB_HINT_FIRST		0x0008
-/* search for the best chunk */
-#define EXT4_MB_HINT_BEST		0x0010
 /* data is being allocated */
 #define EXT4_MB_HINT_DATA		0x0020
 /* don't preallocate (for tails) */
@@ -216,12 +207,6 @@ enum criteria {
 #define EXT4_MB_USE_RESERVED		0x2000
 /* Do strict check for free blocks while retrying block allocation */
 #define EXT4_MB_STRICT_CHECK		0x4000
-/* Large fragment size list lookup succeeded at least once for cr = 0 */
-#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED	0x8000
-/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
-#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED	0x00010000
-/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */
-#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED	0x00020000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -255,14 +240,27 @@ struct ext4_allocation_request {
 #define EXT4_MAP_MAPPED		BIT(BH_Mapped)
 #define EXT4_MAP_UNWRITTEN	BIT(BH_Unwritten)
 #define EXT4_MAP_BOUNDARY	BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED	BIT(BH_Delay)
+/*
+ * This is for use in ext4_map_query_blocks() for a special case where we can
+ * have a physically and logically contiguous blocks split across two leaf
+ * nodes instead of a single extent. This is required in case of atomic writes
+ * to know whether the returned extent is last in leaf. If yes, then lookup for
+ * next in leaf block in ext4_map_query_blocks_next_in_leaf().
+ * - This is never going to be added to any buffer head state.
+ * - We use the next available bit after BH_BITMAP_UPTODATE.
+ */
+#define EXT4_MAP_QUERY_LAST_IN_LEAF	BIT(BH_BITMAP_UPTODATE + 1)
 #define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
-				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+				 EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
 
 struct ext4_map_blocks {
 	ext4_fsblk_t m_pblk;
 	ext4_lblk_t m_lblk;
 	unsigned int m_len;
 	unsigned int m_flags;
+	u64 m_seq;
 };
 
 /*
@@ -276,7 +274,10 @@ struct ext4_system_blocks {
 /*
  * Flags for ext4_io_end->flags
  */
-#define	EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_FAILED	0x0002
+
+#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)
 
 struct ext4_io_end_vec {
 	struct list_head list;		/* list of io_end_vec */
@@ -365,7 +366,16 @@ struct ext4_io_submit {
 #define EXT4_MAX_BLOCKS(size, offset, blkbits) \
 	((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
 									blkbits))
-
+#define EXT4_B_TO_LBLK(inode, offset) \
+	(round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
+#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits)
+
+/* Translate a block number to a page index */
+#define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \
+				      PAGE_SHIFT)
+/* Translate a page index to a block number */
+#define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \
+				      (inode)->i_blkbits)
 /* Translate a block number to a cluster number */
 #define EXT4_B2C(sbi, blk)	((blk) >> (sbi)->s_cluster_bits)
 /* Translate a cluster number to a block number */
@@ -692,16 +702,22 @@ enum {
 	/* Caller is from the delayed allocation writeout path
 	 * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
-	/* caller is from the direct IO path, request to creation of an
-	   unwritten extents if not allocated, split the unwritten
-	   extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_PRE_IO			0x0008
-#define EXT4_GET_BLOCKS_CONVERT			0x0010
-#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
-					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
-	/* Convert extent to initialized after IO complete */
-#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
+	/*
+	 * This means that we cannot merge newly allocated extents, and if we
+	 * found an unwritten extent, we need to split it.
	 */
+#define EXT4_GET_BLOCKS_SPLIT_NOMERGE		0x0008
+	/*
+	 * Caller is from the dio or dioread_nolock buffered IO, reqest to
+	 * create an unwritten extent if it does not exist or split the
+	 * found unwritten extent. Also do not merge the newly created
+	 * unwritten extent, io end will convert unwritten to written,
+	 * and try to merge the written extent.
+	 */
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_SPLIT_NOMERGE|\
 					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+	/* Convert unwritten extent to initialized. */
+#define EXT4_GET_BLOCKS_CONVERT			0x0010
 	/* Eventual metadata allocation (due to growing extent tree)
 	 * should not fail, so try to use reserved blocks for that.*/
 #define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
@@ -713,11 +729,23 @@ enum {
 #define EXT4_GET_BLOCKS_ZERO			0x0200
 #define EXT4_GET_BLOCKS_CREATE_ZERO		(EXT4_GET_BLOCKS_CREATE |\
 					EXT4_GET_BLOCKS_ZERO)
-	/* Caller will submit data before dropping transaction handle. This
-	 * allows jbd2 to avoid submitting data before commit. */
+	/* Caller is in the context of data submission, such as writeback,
+	 * fsync, etc. Especially, in the generic writeback path, caller will
+	 * submit data before dropping transaction handle. This allows jbd2
+	 * to avoid submitting data before commit. */
 #define EXT4_GET_BLOCKS_IO_SUBMIT		0x0400
+	/* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT |\
+						 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
+						 EXT4_GET_BLOCKS_IO_SUBMIT)
 	/* Caller is in the atomic contex, find extent if it has been cached */
 #define EXT4_GET_BLOCKS_CACHED_NOWAIT		0x0800
+/*
+ * Atomic write caller needs this to query in the slow path of mixed mapping
+ * case, when a contiguous extent can be split across two adjacent leaf nodes.
+ * Look EXT4_MAP_QUERY_LAST_IN_LEAF.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF	0x1000
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -731,6 +759,13 @@ enum {
 #define EXT4_EX_NOCACHE				0x40000000
 #define EXT4_EX_FORCE_CACHE			0x20000000
 #define EXT4_EX_NOFAIL				0x10000000
+/*
+ * ext4_map_query_blocks() uses this filter mask to filter the flags needed to
+ * pass while lookup/querying of on disk extent tree.
+ */
+#define EXT4_EX_QUERY_FILTER			(EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\
+						 EXT4_EX_NOFAIL |\
+						 EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
 
 /*
  * Flags used by ext4_free_blocks
@@ -894,10 +929,13 @@ do {									       \
 		(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
 } while (0)
 
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)			\
-	EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+#define EXT4_INODE_SET_ATIME(inode, raw_inode)				\
+	EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))
+
+#define EXT4_INODE_SET_MTIME(inode, raw_inode)				\
+	EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))
 
-#define EXT4_INODE_SET_CTIME(inode, raw_inode)			\
+#define EXT4_INODE_SET_CTIME(inode, raw_inode)				\
 	EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
 
 #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)			\
@@ -913,9 +951,16 @@ do {									       \
 			     .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime)	\
 })
 
-#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)			\
+#define EXT4_INODE_GET_ATIME(inode, raw_inode)				\
+do {									\
+	inode_set_atime_to_ts(inode,					\
+		EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode));	\
+} while (0)
+
+#define EXT4_INODE_GET_MTIME(inode, raw_inode)				\
 do {									\
-	(inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode);	\
+	inode_set_mtime_to_ts(inode,					\
+		EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode));	\
 } while (0)
 
 #define EXT4_INODE_GET_CTIME(inode, raw_inode)				\
@@ -1044,14 +1089,16 @@ struct ext4_inode_info {
 	/* End of lblk range that needs to be committed in this fast commit */
 	ext4_lblk_t i_fc_lblk_len;
 
-	/* Number of ongoing updates on this inode */
-	atomic_t  i_fc_updates;
+	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
 
 	/* Fast commit wait queue for this inode */
 	wait_queue_head_t i_fc_wait;
 
-	/* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
-	struct mutex i_fc_lock;
+	/*
+	 * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+	 * and inode's EXT4_FC_STATE_COMMITTING state bit.
+	 */
+	spinlock_t i_fc_lock;
 
 	/*
 	 * i_disksize keeps track of what the inode size is ON DISK, not
@@ -1084,8 +1131,6 @@ struct ext4_inode_info {
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
 
-	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
-
 	/*
 	 * File creation time. Its function is same as that of
 	 * struct timespec64 i_{a,c,m}time in the generic inode.
@@ -1094,6 +1139,10 @@ struct ext4_inode_info {
 
 	/* mballoc */
 	atomic_t i_prealloc_active;
+
+	/* allocation reservation info for delalloc */
+	/* In case of bigalloc, this refer to clusters rather than blocks */
+	unsigned int i_reserved_data_blocks;
 	struct rb_root i_prealloc_node;
 	rwlock_t i_prealloc_lock;
@@ -1106,14 +1155,12 @@ struct ext4_inode_info {
 	ext4_lblk_t i_es_shrink_lblk;	/* Offset where we start searching for
 					   extents to shrink. Protected by
 					   i_es_lock */
+	u64 i_es_seq;			/* Change counter for extents.
+					   Protected by i_es_lock */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
 
-	/* allocation reservation info for delalloc */
-	/* In case of bigalloc, this refer to clusters rather than blocks */
-	unsigned int i_reserved_data_blocks;
-
 	/* pending cluster reservations for bigalloc file systems */
 	struct ext4_pending_tree i_pending_tree;
@@ -1128,6 +1175,7 @@ struct ext4_inode_info {
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
 #endif
+	spinlock_t i_block_reservation_lock;
 
 	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
@@ -1137,9 +1185,6 @@ struct ext4_inode_info {
 	 */
 	struct list_head i_rsv_conversion_list;
 	struct work_struct i_rsv_conversion_work;
-	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
-
-	spinlock_t i_block_reservation_lock;
 
 	/*
 	 * Transactions that contain inode's metadata needed to complete
@@ -1149,13 +1194,21 @@ struct ext4_inode_info {
 	tid_t i_datasync_tid;
 
 #ifdef CONFIG_QUOTA
-	struct dquot *i_dquot[MAXQUOTAS];
+	struct dquot __rcu *i_dquot[MAXQUOTAS];
 #endif
 
 	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
 	__u32 i_csum_seed;
 
 	kprojid_t i_projid;
+
+#ifdef CONFIG_FS_ENCRYPTION
+	struct fscrypt_inode_info *i_crypt_info;
+#endif
+
+#ifdef CONFIG_FS_VERITY
+	struct fsverity_info *i_verity_info;
+#endif
 };
 
 /*
@@ -1241,6 +1294,7 @@ struct ext4_inode_info {
 
 #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN	0x00000080 /* Optimize group
						    * scanning in mballoc
						    */
+#define EXT4_MOUNT2_ABORT		0x00000100 /* Abort filesystem */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1258,10 +1312,8 @@ struct ext4_inode_info {
 #define ext4_test_and_set_bit		__test_and_set_bit_le
 #define ext4_set_bit			__set_bit_le
-#define ext4_set_bit_atomic		ext2_set_bit_atomic
 #define ext4_test_and_clear_bit		__test_and_clear_bit_le
 #define ext4_clear_bit			__clear_bit_le
-#define ext4_clear_bit_atomic		ext2_clear_bit_atomic
 #define ext4_test_bit			test_bit_le
 #define ext4_find_next_zero_bit		find_next_zero_bit_le
 #define ext4_find_next_bit		find_next_bit_le
@@ -1336,7 +1388,7 @@ struct ext4_super_block {
/*60*/	__le32	s_feature_incompat;	/* incompatible feature set */
	__le32	s_feature_ro_compat;	/* readonly-compatible feature set */
/*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
-/*78*/	char	s_volume_name[EXT4_LABEL_MAX];	/* volume name */
+/*78*/	char	s_volume_name[EXT4_LABEL_MAX] __nonstring;	/* volume name */
/*88*/	char	s_last_mounted[64] __nonstring;	/* directory where last mounted */
/*C8*/	__le32	s_algorithm_usage_bitmap; /* For compression */
	/*
@@ -1417,7 +1469,9 @@ struct ext4_super_block {
	__le16	s_encoding;		/* Filename charset encoding */
	__le16	s_encoding_flags;	/* Filename charset encoding flags */
	__le32	s_orphan_file_inum;	/* Inode for tracking orphan inodes */
-	__le32	s_reserved[94];		/* Padding to the end of the block */
+	__le16	s_def_resuid_hi;
+	__le16	s_def_resgid_hi;
+	__le32	s_reserved[93];		/* Padding to the end of the block */
	__le32	s_checksum;		/* crc32c(superblock) */
 };
 
@@ -1498,6 +1552,7 @@ struct ext4_sb_info {
	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
	struct buffer_head * s_sbh;	/* Buffer containing the super block */
	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	/* Array of bh's for the block group descriptors */
	struct buffer_head * __rcu *s_group_desc;
	unsigned int s_mount_opt;
	unsigned int s_mount_opt2;
@@ -1541,7 +1596,7 @@ struct ext4_sb_info {
	unsigned long s_commit_interval;
	u32 s_max_batch_time;
	u32 s_min_batch_time;
-	struct block_device *s_journal_bdev;
+	struct file *s_journal_bdev_file;
 #ifdef CONFIG_QUOTA
	/* Names of quota files with journalled quota */
	char __rcu *s_qf_names[EXT4_MAXQUOTAS];
@@ -1567,16 +1622,14 @@ struct ext4_sb_info {
	unsigned short *s_mb_offsets;
	unsigned int *s_mb_maxs;
	unsigned int s_group_info_size;
-	unsigned int s_mb_free_pending;
-	struct list_head s_freed_data_list;	/* List of blocks to be freed
+	atomic_t s_mb_free_pending;
+	struct list_head s_freed_data_list[2];	/* List of blocks to be freed
						   after commit completed */
	struct list_head s_discard_list;
	struct work_struct s_discard_work;
	atomic_t s_retry_alloc_pending;
-	struct list_head *s_mb_avg_fragment_size;
-	rwlock_t *s_mb_avg_fragment_size_locks;
-	struct list_head *s_mb_largest_free_orders;
-	rwlock_t *s_mb_largest_free_orders_locks;
+	struct xarray *s_mb_avg_fragment_size;
+	struct xarray *s_mb_largest_free_orders;
 
	/* tunables */
	unsigned long s_stripe;
@@ -1588,12 +1641,15 @@ struct ext4_sb_info {
	unsigned int s_mb_order2_reqs;
	unsigned int s_mb_group_prealloc;
	unsigned int s_max_dir_size_kb;
-	/* where last allocation was done - for stream allocation */
-	unsigned long s_mb_last_group;
-	unsigned long s_mb_last_start;
	unsigned int s_mb_prefetch;
	unsigned int s_mb_prefetch_limit;
	unsigned int s_mb_best_avail_max_trim_order;
+	unsigned int s_sb_update_sec;
+	unsigned int s_sb_update_kb;
+
+	/* where last allocation was done - for stream allocation */
+	ext4_group_t *s_mb_last_groups;
+	unsigned int s_mb_nr_global_goals;
 
	/* stats for buddy allocator */
	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -1603,12 +1659,10 @@ struct ext4_sb_info {
	atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS];	/* total extents scanned */
	atomic_t s_bal_groups_scanned;	/* number of groups scanned */
	atomic_t s_bal_goals;	/* goal hits */
+	atomic_t s_bal_stream_goals;	/* stream allocation global goal hits */
	atomic_t s_bal_len_goals;	/* len goal hits */
	atomic_t s_bal_breaks;	/* too long searches */
	atomic_t s_bal_2orders;	/* 2^order hits */
-	atomic_t s_bal_p2_aligned_bad_suggestions;
-	atomic_t s_bal_goal_fast_bad_suggestions;
-	atomic_t s_bal_best_avail_bad_suggestions;
	atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
	atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
	atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS];	/* cX loop didn't find blocks */
@@ -1650,14 +1704,16 @@ struct ext4_sb_info {
	/* record the last minlen when FITRIM is called. */
	unsigned long s_last_trim_minblks;
 
-	/* Reference to checksum algorithm driver via cryptoapi */
-	struct crypto_shash *s_chksum_driver;
+	/* minimum folio order of a page cache allocation */
+	u16 s_min_folio_order;
+	/* supported maximum folio order, 0 means not supported */
+	u16 s_max_folio_order;
 
	/* Precomputed FS UUID checksum for seeding other checksums */
	__u32 s_csum_seed;
 
	/* Reclaim extents from extent status tree */
-	struct shrinker s_es_shrinker;
+	struct shrinker *s_es_shrinker;
	struct list_head s_es_list;	/* List of inodes with reclaimable extents */
	long s_es_nr_inode;
	struct ext4_es_stats s_es_stats;
@@ -1680,7 +1736,8 @@ struct ext4_sb_info {
 
	/*
	 * Barrier between writepages ops and changing any inode's JOURNAL_DATA
-	 * or EXTENTS flag.
+	 * or EXTENTS flag or between writepages ops and changing DELALLOC or
+	 * DIOREAD_NOLOCK mount options on remount.
	 */
	struct percpu_rw_semaphore s_writepages_rwsem;
	struct dax_device *s_daxdev;
@@ -1708,10 +1765,17 @@ struct ext4_sb_info {
	const char *s_last_error_func;
	time64_t s_last_error_time;
	/*
-	 * If we are in a context where we cannot update error information in
-	 * the on-disk superblock, we queue this work to do it.
+	 * If we are in a context where we cannot update the on-disk
+	 * superblock, we queue the work here. This is used to update
+	 * the error information in the superblock, and for periodic
+	 * updates of the superblock called from the commit callback
+	 * function.
	 */
-	struct work_struct s_error_work;
+	struct work_struct s_sb_upd_work;
+
+	/* Atomic write unit values in bytes */
+	unsigned int s_awu_min;
+	unsigned int s_awu_max;
 
	/* Ext4 fast commit sub transaction ID */
	atomic_t s_fc_subtid;
@@ -1732,7 +1796,7 @@ struct ext4_sb_info {
	 * following fields:
	 * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
	 */
-	spinlock_t s_fc_lock;
+	struct mutex s_fc_lock;
	struct buffer_head *s_fc_bh;
	struct ext4_fc_stats s_fc_stats;
	tid_t s_fc_ineligible_tid;
@@ -1782,6 +1846,18 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+static inline int ext4_get_resuid(struct ext4_super_block *es)
+{
+	return le16_to_cpu(es->s_def_resuid) |
+	       le16_to_cpu(es->s_def_resuid_hi) << 16;
+}
+
+static inline int ext4_get_resgid(struct ext4_super_block *es)
+{
+	return le16_to_cpu(es->s_def_resgid) |
+	       le16_to_cpu(es->s_def_resgid_hi) << 16;
+}
+
 /*
 * Returns: sbi->field[index]
 * Used to access an array element from the following sbi fields which require
@@ -1804,8 +1880,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 */
 enum {
	EXT4_MF_MNTDIR_SAMPLED,
-	EXT4_MF_FS_ABORTED,	/* Fatal error detected */
-	EXT4_MF_FC_INELIGIBLE	/* Fast commit ineligible */
+	EXT4_MF_FC_INELIGIBLE,	/* Fast commit ineligible */
+	EXT4_MF_JOURNAL_DESTROY	/* Journal is in process of destroying */
 };
 
 static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@@ -1850,14 +1926,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb,
	return false;
 }
 
-static inline void ext4_simulate_fail_bh(struct super_block *sb,
-					 struct buffer_head *bh,
-					 unsigned long code)
-{
-	if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
-		clear_buffer_uptodate(bh);
-}
-
 /*
 * Error number codes for s_{first,last}_error_errno
 *
@@ -1899,6 +1967,7 @@ enum {
	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
+	EXT4_STATE_FC_FLUSHING_DATA,	/* Fast commit flushing data */
	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
 };
 
@@ -1959,6 +2028,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
 #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
 
 /*
+ * Check whether the inode is tracked as orphan (either in orphan file or
+ * orphan list).
+ */ +static inline bool ext4_inode_orphan_tracked(struct inode *inode) +{ + return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan); +} + +/* * Codes for operating systems */ #define EXT4_OS_LINUX 0 @@ -2224,13 +2303,30 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ -#define EXT4_FLAGS_RESIZING 0 -#define EXT4_FLAGS_SHUTDOWN 1 -#define EXT4_FLAGS_BDEV_IS_DAX 2 +enum { + EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ + EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ + EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ + EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ +}; + +static inline int ext4_forced_shutdown(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); +} -static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +static inline int ext4_emergency_ro(struct super_block *sb) { - return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); +} + +static inline int ext4_emergency_state(struct super_block *sb) +{ + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + if (unlikely(ext4_emergency_ro(sb))) + return -EROFS; + return 0; } /* @@ -2264,10 +2360,19 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) #define EXT4_DEFM_NODELALLOC 0x0800 /* - * Default journal batch times + * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + + +/* + * Default values for superblock update + */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ + /* * Minimum number of groups in a flexgroup before we separate out @@ -2323,9 +2428,9 @@ struct ext4_dir_entry_2 { ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) -#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ - le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { @@ -2391,28 +2496,19 @@ static inline unsigned int ext4_dir_rec_len(__u8 name_len, return (rec_len & ~EXT4_DIR_ROUND); } -/* - * If we ever get support for fs block sizes > page_size, we'll need - * to remove the #if statements in the next two functions... 
- */ static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); -#if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); -#else - return len; -#endif } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); -#if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { @@ -2422,9 +2518,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -#else - return cpu_to_le16(len); -#endif } /* @@ -2447,23 +2540,11 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 +#define DX_HASH_LAST DX_HASH_SIPHASH -static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { - struct { - struct shash_desc shash; - char ctx[4]; - } desc; - - BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); - - desc.shash.tfm = sbi->s_chksum_driver; - *(u32 *)desc.ctx = crc; - - BUG_ON(crypto_shash_update(&desc.shash, address, length)); - - return *(u32 *)desc.ctx; + return crc32c(crc, address, length); } #ifdef __KERNEL__ @@ -2496,7 +2577,7 @@ struct ext4_filename { struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) - struct fscrypt_str cf_name; + struct qstr cf_name; #endif }; @@ -2538,6 +2619,8 @@ struct dir_private_info { __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; + u64 cookie; + bool initialized; }; /* calculate the first block number of the group */ @@ -2678,10 +2761,10 @@ struct mmpd_data { extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); @@ -2708,7 +2791,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); -extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); @@ -2731,8 +2813,25 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, - const struct qstr *iname, - struct ext4_filename *fname); + const struct qstr *iname, + struct ext4_filename *fname); + +static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +} +#else +static inline int ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct ext4_filename *fname) +{ + return 0; +} + +static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ +} #endif /* 
ext4 encryption related stuff goes here crypto.c */ @@ -2755,16 +2854,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir, int lookup, struct ext4_filename *fname) { - int err = 0; fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; -#if IS_ENABLED(CONFIG_UNICODE) - err = ext4_fname_setup_ci_filename(dir, iname, fname); -#endif - - return err; + return ext4_fname_setup_ci_filename(dir, iname, fname); } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2776,10 +2870,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, static inline void ext4_fname_free_filename(struct ext4_filename *fname) { -#if IS_ENABLED(CONFIG_UNICODE) - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; -#endif + ext4_fname_free_ci_filename(fname); } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, @@ -2803,8 +2894,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); -extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); @@ -2864,7 +2954,6 @@ extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); -extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); @@ -2887,8 +2976,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); -void ext4_fc_start_update(struct inode *inode); -void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); @@ -2904,11 +2991,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); -extern int ext4_mb_release(struct super_block *); +extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern int ext4_mb_reserve_blocks(struct super_block *, int); -extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, @@ -2929,12 +3015,17 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void 
ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state); + int len, bool state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); +void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, @@ -2955,6 +3046,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -2983,7 +3075,6 @@ extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); @@ -2993,13 +3084,17 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); -extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); @@ -3011,6 +3106,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +static inline bool is_special_ino(struct super_block *sb, unsigned long ino) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum); +} + /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3023,8 +3129,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); -int 
ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); @@ -3072,16 +3178,17 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); +extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); -extern __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es); +extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); @@ -3090,6 +3197,8 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, @@ -3249,18 +3358,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_metadata_csum(struct super_block *sb) -{ - WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && - !EXT4_SB(sb)->s_chksum_driver); - - return ext4_has_feature_metadata_csum(sb) && - (EXT4_SB(sb)->s_chksum_driver != NULL); -} - static inline int ext4_has_group_desc_csum(struct super_block *sb) { - return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); + return ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ @@ -3345,6 +3446,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) return 1 << sbi->s_log_groups_per_flex; } +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -3409,8 +3517,6 @@ struct ext4_group_info { void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - struct list_head bb_avg_fragment_size_node; - struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. 
* bb_counters[3] = 5 means @@ -3461,23 +3567,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } +static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) +{ + if (!spin_trylock(ext4_group_lock_ptr(sb, group))) + return false; + /* + * We're able to grab the lock right away, so drop the lock + * contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + return true; +} + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - spinlock_t *lock = ext4_group_lock_ptr(sb, group); - if (spin_trylock(lock)) - /* - * We're able to grab the lock right away, so drop the - * lock contention counter. - */ - atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); - else { + if (!ext4_try_lock_group(sb, group)) { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); - spin_lock(lock); + spin_lock(ext4_group_lock_ptr(sb, group)); } } @@ -3531,22 +3642,21 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); +extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep); + struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct page **pagep, - void **fsdata); +extern int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); @@ -3593,10 +3703,10 @@ static inline int ext4_has_inline_data(struct inode *inode) extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); -extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len); +extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *dir_block, + unsigned int parent_ino, void *inline_buf, + int inline_size); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, @@ -3679,6 +3789,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, + struct inode *inode, loff_t offset, ssize_t len); extern int 
ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, @@ -3686,11 +3798,12 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode, extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path **, - struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_insert_extent( + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path **, + struct ext4_ext_path *, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); @@ -3780,34 +3893,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -/* For ioend & aio unwritten conversion wait queues */ -#define EXT4_WQ_HASH_SZ 37 -#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) -extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - struct ext4_io_end *io_end) +static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_unwritten); - } } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - struct inode *inode = io_end->inode; - - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - } } extern const struct iomap_ops ext4_iomap_ops; @@ -3827,6 +3925,17 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +static inline bool ext4_inode_can_atomic_write(struct inode *inode) +{ + + return S_ISREG(inode->i_mode) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + EXT4_SB(inode->i_sb)->s_awu_min > 0; +} + +extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block); #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 26435f3a3094..c484125d963f 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -31,13 +31,6 @@ #define CHECK_BINSEARCH__ /* - * If EXT_STATS is defined then stats numbers are collected. - * These number will be displayed at umount time. - */ -#define EXT_STATS_ - - -/* * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. 
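The ext4.h prototype changes above switch the extent-path helpers to an ERR_PTR()-style calling convention: ext4_find_extent() and ext4_ext_insert_extent() now take the path by value and return either the (possibly reallocated) path or an error pointer, so the caller must always continue with the returned pointer and never reuse the old one. What follows is a minimal user-space sketch of that ownership pattern, not code from this patch; ERR_PTR()/IS_ERR()/PTR_ERR() are simplified stand-ins for the kernel's err.h helpers, and insert_extent()/struct ext_path are hypothetical stand-ins for ext4_ext_insert_extent()/struct ext4_ext_path.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified user-space stand-ins for the kernel's err.h helpers. */
#define MAX_ERRNO 4095
static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

struct ext_path { int depth; };	/* hypothetical stand-in for ext4_ext_path */

/*
 * Hypothetical callee following the new convention: it owns @path for the
 * duration of the call, frees it on failure, and returns either the
 * (possibly reallocated) path or an ERR_PTR() value.
 */
static struct ext_path *insert_extent(struct ext_path *path, int lblk)
{
	if (lblk < 0) {
		free(path);	/* the callee consumes the path on error */
		return ERR_PTR(-EIO);
	}
	path->depth++;		/* pretend the insert grew the tree */
	return path;
}

int main(void)
{
	struct ext_path *path = calloc(1, sizeof(*path));

	if (!path)
		return 1;
	path = insert_extent(path, 42);	/* old pointer is dead after this */
	if (IS_ERR(path)) {
		fprintf(stderr, "insert failed: %ld\n", PTR_ERR(path));
		return 1;
	}
	printf("depth is now %d\n", path->depth);
	free(path);
	return 0;
}

The extents.c hunks later in this diff follow the same shape: every call site that used to check an int error now checks IS_ERR() on the returned path and stops using the previous pointer.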
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b38d59581411..05e5946ed9b3 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -63,15 +63,18 @@ static void ext4_put_nojournal(handle_t *handle) */ static int ext4_journal_check_start(struct super_block *sb) { + int ret; journal_t *journal; might_sleep(); - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; - if (sb_rdonly(sb)) + if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* @@ -205,7 +208,7 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line, static void ext4_check_bdev_write_error(struct super_block *sb) { - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct address_space *mapping = sb->s_bdev->bd_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; @@ -234,8 +237,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - ext4_check_bdev_write_error(sb); - if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { @@ -243,8 +244,10 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle, err); return err; } - } - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + } else + ext4_check_bdev_write_error(sb); + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, @@ -276,9 +279,16 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); - /* In the no journal case, we can just do a bforget and return */ + /* + * In the no journal case, we should wait for any ongoing buffer I/O + * to complete and then do a forget. + */ if (!ext4_handle_valid(handle)) { + if (bh) { + clear_buffer_dirty(bh); + wait_on_buffer(bh); + __bforget(bh); + } return 0; } @@ -331,7 +341,8 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line, err); return err; } - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0c77697d5e90..63d17c5201b5 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,90 +122,6 @@ #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 -/** - * struct ext4_journal_cb_entry - Base structure for callback information. - * - * This struct is a 'seed' structure for a using with your own callback - * structs. If you are using callbacks you must allocate one of these - * or another struct of your own definition which has this struct - * as it's first element and pass it to ext4_journal_callback_add().
- */ -struct ext4_journal_cb_entry { - /* list information for other callbacks attached to the same handle */ - struct list_head jce_list; - - /* Function to call with this callback structure */ - void (*jce_func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int error); - - /* user data goes here */ -}; - -/** - * ext4_journal_callback_add: add a function to call after transaction commit - * @handle: active journal transaction handle to register callback on - * @func: callback function to call after the transaction has committed: - * @sb: superblock of current filesystem for transaction - * @jce: returned journal callback data - * @rc: journal state at commit (0 = transaction committed properly) - * @jce: journal callback data (internal and function private data struct) - * - * The registered function will be called in the context of the journal thread - * after the transaction for which the handle was created has completed. - * - * No locks are held when the callback function is called, so it is safe to - * call blocking functions from within the callback, but the callback should - * not block or run for too long, or the filesystem will be blocked waiting for - * the next transaction to commit. No journaling functions can be used, or - * there is a risk of deadlock. - * - * There is no guaranteed calling order of multiple registered callbacks on - * the same transaction. - */ -static inline void _ext4_journal_callback_add(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - /* Add the jce to transaction's private list */ - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); -} - -static inline void ext4_journal_callback_add(handle_t *handle, - void (*func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc), - struct ext4_journal_cb_entry *jce) -{ - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - /* Add the jce to transaction's private list */ - jce->jce_func = func; - spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, jce); - spin_unlock(&sbi->s_md_lock); -} - - -/** - * ext4_journal_callback_del: delete a registered callback - * @handle: active journal transaction handle on which callback was registered - * @jce: registered journal callback entry to unregister - * Return true if object was successfully removed - */ -static inline bool ext4_journal_callback_try_del(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - bool deleted; - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - spin_lock(&sbi->s_md_lock); - deleted = !list_empty(&jce->jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - return deleted; -} - int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -403,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, revoke_creds, 0); } -static inline int ext4_journal_blocks_per_page(struct inode *inode) +static inline int ext4_journal_blocks_per_folio(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) - return jbd2_journal_blocks_per_page(inode); + return jbd2_journal_blocks_per_folio(inode); return 0; } @@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 1; } +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + /* + * At this point only two things can 
be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. + */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_sb_upd_work); + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 202c76996b62..2cf5759ba689 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } @@ -63,7 +62,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -77,19 +76,18 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); et->et_checksum = ext4_extent_block_csum(inode, eh); } -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - ext4_lblk_t split, - int split_flag, - int flags); +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { @@ -100,27 +98,33 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * i_rwsem. So we can safely drop the i_data_sem here. 
*/ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } +static inline void ext4_ext_path_brelse(struct ext4_ext_path *path) +{ + brelse(path->p_bh); + path->p_bh = NULL; +} + static void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; - if (!path) + if (IS_ERR_OR_NULL(path)) return; depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) { - brelse(path->p_bh); - path->p_bh = NULL; - } + for (i = 0; i <= depth; i++, path++) + ext4_ext_path_brelse(path); } void ext4_free_ext_path(struct ext4_ext_path *path) { + if (IS_ERR_OR_NULL(path)) + return; ext4_ext_drop_refs(path); kfree(path); } @@ -323,19 +327,18 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) return size; } -static inline int +static inline struct ext4_ext_path * ext4_force_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path **ppath, ext4_lblk_t lblk, + struct ext4_ext_path *path, ext4_lblk_t lblk, int nofail) { - struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); - int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; - return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? + return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); } @@ -564,7 +567,7 @@ __read_extent_tree_block(const char *function, unsigned int line, if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); - err = ext4_read_bh(bh, 0, NULL); + err = ext4_read_bh(bh, 0, NULL, false); if (err < 0) goto errout; } @@ -607,6 +610,8 @@ int ext4_ext_precache(struct inode *inode) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ + ext4_check_map_extents_env(inode); + down_read(&ei->i_data_sem); depth = ext_depth(inode); @@ -635,8 +640,7 @@ int ext4_ext_precache(struct inode *inode) */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; continue; } @@ -689,7 +693,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) struct ext4_extent *ex; int i; - if (!path) + if (IS_ERR_OR_NULL(path)) return; eh = path[depth].p_hdr; @@ -881,11 +885,10 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode) struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path **orig_path, int flags) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; - struct ext4_ext_path *path = orig_path ? 
*orig_path : NULL; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; @@ -906,7 +909,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); - *orig_path = path = NULL; + path = NULL; } } if (!path) { @@ -961,8 +964,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, err: ext4_free_ext_path(path); - if (orig_path) - *orig_path = NULL; return ERR_PTR(ret); } @@ -1010,6 +1011,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ix = curp->p_idx; } + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EFSCORRUPTED; + } + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { @@ -1019,11 +1025,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } - if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { - EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); - return -EFSCORRUPTED; - } - ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); @@ -1395,15 +1396,15 @@ out: * finds empty index and adds new leaf. * if no free index is found, then it requests in-depth growing. */ -static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int mb_flags, - unsigned int gb_flags, - struct ext4_ext_path **ppath, - struct ext4_extent *newext) +static struct ext4_ext_path * +ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int mb_flags, unsigned int gb_flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { - struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; + ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block); repeat: i = depth = ext_depth(inode); @@ -1422,42 +1423,38 @@ repeat: * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) - goto out; + goto errout; /* refill path */ - path = ext4_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - ppath, gb_flags); - if (IS_ERR(path)) - err = PTR_ERR(path); - } else { - /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, mb_flags); - if (err) - goto out; + path = ext4_find_extent(inode, ee_block, path, gb_flags); + return path; + } - /* refill path */ - path = ext4_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - ppath, gb_flags); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, mb_flags); + if (err) + goto errout; - /* - * only first (depth 0 -> 1) produces free space; - * in all other cases we have to split the grown tree - */ - depth = ext_depth(inode); - if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { - /* now we need to split */ - goto repeat; - } + /* refill path */ + path = ext4_find_extent(inode, ee_block, path, gb_flags); + if (IS_ERR(path)) + return path; + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; } -out: - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -1534,7 +1531,7 @@ static int 
ext4_ext_search_left(struct inode *inode, static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent *ret_ex) + struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1608,7 +1605,8 @@ got_index: ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, + flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -1616,7 +1614,7 @@ got_index: put_bh(bh); } - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -1749,12 +1747,23 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, break; err = ext4_ext_get_access(handle, inode, path + k); if (err) - break; + goto clean; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) - break; + goto clean; } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. + */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); return err; } @@ -1876,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); - brelse(path[1].p_bh); + ext4_ext_path_brelse(path + 1); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } @@ -1964,16 +1973,15 @@ out: * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. 
*/ -int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_extent *newext, int gb_flags) +struct ext4_ext_path * +ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags) { - struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ - struct ext4_ext_path *npath = NULL; - int depth, len, err; + int depth, len, err = 0; ext4_lblk_t next; int mb_flags = 0, unwritten; @@ -1981,18 +1989,20 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } /* try to insert block into found extent and return */ - if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { + if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) { /* * Try to see whether we should rather test the extent on @@ -2026,7 +2036,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); @@ -2051,7 +2061,7 @@ prepend: err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; @@ -2076,21 +2086,26 @@ prepend: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { + struct ext4_ext_path *npath; + ext_debug(inode, "next leaf block - %u\n", next); - BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); - if (IS_ERR(npath)) - return PTR_ERR(npath); + if (IS_ERR(npath)) { + err = PTR_ERR(npath); + goto errout; + } BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); + ext4_free_ext_path(path); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + ext4_free_ext_path(npath); } /* @@ -2099,10 +2114,10 @@ prepend: */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; - err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, - ppath, newext); - if (err) - goto cleanup; + path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); + if (IS_ERR(path)) + return path; depth = ext_depth(inode); eh = path[depth].p_hdr; @@ -2111,7 +2126,7 @@ has_space: err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto cleanup; + goto errout; if (!nearex) { /* there is no extent in this leaf, create first one */ @@ -2166,20 +2181,23 @@ has_space: merge: /* try to merge extents */ - if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) + if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, nearex); - /* 
time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) - goto cleanup; + goto errout; err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto errout; -cleanup: - ext4_free_ext_path(npath); - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } static int ext4_fill_es_cache_info(struct inode *inode, @@ -2195,7 +2213,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, while (block <= end) { next = 0; flags = 0; - if (!ext4_es_lookup_extent(inode, block, &next, &es)) + if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; @@ -2229,7 +2247,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, /* - * ext4_ext_determine_hole - determine hole around given block + * ext4_ext_find_hole - find hole around given block according to the given path * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole @@ -2241,9 +2259,9 @@ static int ext4_fill_es_cache_info(struct inode *inode, * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. */ -static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t *lblk) +static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; @@ -2271,30 +2289,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, } /* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap - */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, - ext4_lblk_t hole_len) -{ - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, - hole_start + hole_len - 1, &es); - if (es.es_len) { - /* There's delayed extent containing lblock? */ - if (es.es_lblk <= hole_start) - return; - hole_len = min(es.es_lblk - hole_start, hole_len); - } - ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); - ext4_es_insert_extent(inode, hole_start, hole_len, ~0, - EXTENT_STATUS_HOLE); -} - -/* * ext4_ext_rm_idx: * removes index from the index block. 
*/ @@ -2303,27 +2297,26 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, { int err; ext4_fsblk_t leaf; + int k = depth - 1; /* free index block */ - depth--; - path = path + depth; - leaf = ext4_idx_pblock(path->p_idx); - if (unlikely(path->p_hdr->eh_entries == 0)) { - EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + leaf = ext4_idx_pblock(path[k].p_idx); + if (unlikely(path[k].p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k); return -EFSCORRUPTED; } - err = ext4_ext_get_access(handle, inode, path); + err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; - if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { - int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) { + int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx; len *= sizeof(struct ext4_extent_idx); - memmove(path->p_idx, path->p_idx + 1, len); + memmove(path[k].p_idx, path[k].p_idx + 1, len); } - le16_add_cpu(&path->p_hdr->eh_entries, -1); - err = ext4_ext_dirty(handle, inode, path); + le16_add_cpu(&path[k].p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); @@ -2332,18 +2325,29 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - while (--depth >= 0) { - if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + while (--k >= 0) { + if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr)) break; - path--; - err = ext4_ext_get_access(handle, inode, path); + err = ext4_ext_get_access(handle, inode, path + k); if (err) - break; - path->p_idx->ei_block = (path+1)->p_idx->ei_block; - err = ext4_ext_dirty(handle, inode, path); + goto clean; + path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path + k); if (err) - break; + goto clean; } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. + */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); + return err; } @@ -2394,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; - depth = ext_depth(inode); - + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. 
+ */ if (extents <= 1) - index = depth * 2; + index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else - index = depth * 3; + index = (EXT4_MAX_EXTENT_DEPTH * 3) + + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } @@ -2819,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; @@ -2849,8 +2856,7 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, - EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2896,11 +2902,12 @@ again: * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_force_split_extent_at(handle, inode, &path, - end + 1, 1); - if (err < 0) + path = ext4_force_split_extent_at(handle, inode, path, + end + 1, 1); + if (IS_ERR(path)) { + err = PTR_ERR(path); goto out; - + } } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* @@ -2915,7 +2922,7 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - NULL); + NULL, flags); if (err < 0) goto out; if (pblk) { @@ -2958,8 +2965,7 @@ again: err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; continue; } @@ -2992,8 +2998,7 @@ again: i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, - depth - i - 1, - EXT4_EX_NOCACHE); + depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); @@ -3021,8 +3026,7 @@ again: err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; ext_debug(inode, "return to level %d\n", i); } @@ -3137,7 +3141,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); + EXTENT_STATUS_WRITTEN, false); } /* FIXME!! we need to try to merge to left or right after zero-out */ @@ -3171,16 +3175,14 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * a> the extent is split into two extents. * b> no split is needed, and the extent is just marked. * - * return 0 on success. + * Return an extent path pointer on success, or an error pointer on failure.
*/ -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - ext4_lblk_t split, - int split_flag, - int flags) +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags) { - struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; @@ -3222,7 +3224,7 @@ static int ext4_split_extent_at(handle_t *handle, else ext4_ext_mark_initialized(ex); - if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); @@ -3250,10 +3252,31 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); - err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); - if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!IS_ERR(path)) goto out; + err = PTR_ERR(path); + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + return path; + + /* + * Get a new path to try to zeroout or fix the extent length. + * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent() + * will not return -ENOMEM, otherwise -ENOMEM will cause a + * retry in do_writepages(), and a WARN_ON may be triggered + * in ext4_da_update_reserve_space() due to an incorrect + * ee_len causing the i_reserved_data_blocks exception. + */ + path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); + if (IS_ERR(path)) { + EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", + split, PTR_ERR(path)); + return path; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { @@ -3304,14 +3327,17 @@ fix_extent_len: * and err is a non-zero error code. 
*/ ext4_ext_dirty(handle, inode, path + path->p_depth); - return err; out: + if (err) { + ext4_free_ext_path(path); + path = ERR_PTR(err); + } ext4_ext_show_leaf(inode, path); - return err; + return path; } /* - * ext4_split_extents() splits an extent and mark extent which is covered + * ext4_split_extent() splits an extent and marks the extent which is covered * by @map as split_flags indicates * * It may result in splitting the extent into multiple extents (up to three) * c> Splits in three extents: Someone is splitting in the middle of the extent * */ -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_map_blocks *map, - int split_flag, - int flags) +static struct ext4_ext_path *ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, int flags, + unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; - int err = 0; int unwritten; int split_flag1, flags1; - int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3345,34 +3368,33 @@ static int ext4_split_extent(handle_t *handle, if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; - flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; - err = ext4_split_extent_at(handle, inode, ppath, + path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); - if (err) - goto out; - } else { - allocated = ee_len - (map->m_lblk - ee_block); - } - /* - * Update path is required because previous ext4_split_extent_at() may - * result in split of original leaf or extent zeroout. - */ - path = ext4_find_extent(inode, map->m_lblk, ppath, flags); - if (IS_ERR(path)) - return PTR_ERR(path); - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - return -EFSCORRUPTED; + if (IS_ERR(path)) + return path; + /* + * Update path is required because previous ext4_split_extent_at + * may result in split of original leaf or extent zeroout. + */ + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + ext4_free_ext_path(path); + return ERR_PTR(-EFSCORRUPTED); + } + unwritten = ext4_ext_is_unwritten(ex); } - unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; @@ -3381,15 +3403,20 @@ static int ext4_split_extent(handle_t *handle, split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } - err = ext4_split_extent_at(handle, inode, ppath, + path = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); - if (err) - goto out; + if (IS_ERR(path)) + return path; } + if (allocated) { + if (map->m_lblk + map->m_len > ee_block + ee_len) + *allocated = ee_len - (map->m_lblk - ee_block); + else + *allocated = map->m_len; + } ext4_ext_show_leaf(inode, path); -out: - return err ?
err : allocated; + return path; } /* @@ -3412,13 +3439,11 @@ out: * that are allocated and initialized. * It is guaranteed to be >= map->m_len. */ -static int ext4_ext_convert_to_initialized(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, - int flags) +static struct ext4_ext_path * +ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, struct ext4_ext_path *path, + int flags, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; @@ -3426,9 +3451,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; - int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; + unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); @@ -3468,6 +3493,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ + *allocated = 0; if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ @@ -3497,7 +3523,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); @@ -3512,7 +3538,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = map_len; + *allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ @@ -3543,7 +3569,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); @@ -3558,18 +3584,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = map_len; + *allocated = map_len; } } - if (allocated) { + if (*allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; + if (err) + goto errout; goto out; } else - allocated = ee_len - (map->m_lblk - ee_block); + *allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* @@ -3596,21 +3624,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > split_map.m_len)) { - if (allocated <= max_zeroout) { + if (max_zeroout && (*allocated > split_map.m_len)) { + if (*allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = - cpu_to_le16(allocated - split_map.m_len); + cpu_to_le16(*allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = 
ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; - split_map.m_len = allocated; + split_map.m_len = *allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { @@ -3628,22 +3656,24 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - allocated = map->m_len; + *allocated = map->m_len; } } fallback: - err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, - flags); - if (err > 0) - err = 0; + path = ext4_split_extent(handle, inode, path, &split_map, split_flag, + flags, NULL); + if (IS_ERR(path)) + return path; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) { - ext4_zeroout_es(inode, &zero_ex1); - ext4_zeroout_es(inode, &zero_ex2); - } - return err ? err : allocated; + ext4_zeroout_es(inode, &zero_ex1); + ext4_zeroout_es(inode, &zero_ex2); + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -3668,15 +3698,16 @@ out: * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of unwritten extent to be written on success. + * The size of unwritten extent to be written is passed to the caller via the + * allocated pointer. Return an extent path pointer on success, or an error + * pointer on failure. */ -static int ext4_split_convert_extents(handle_t *handle, +static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, - int flags) + struct ext4_ext_path *path, + int flags, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; @@ -3690,10 +3721,6 @@ static int ext4_split_convert_extents(handle_t *handle, >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; - /* - * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully inside i_size or new_size. - */ depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); @@ -3704,20 +3731,24 @@ static int ext4_split_convert_extents(handle_t *handle, split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. + */ split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } - flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); + flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; + return ext4_split_extent(handle, inode, path, map, split_flag, flags, + allocated); } -static int ext4_convert_unwritten_extents_endio(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path **ppath) +static struct ext4_ext_path * +ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) { - struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3745,20 +3776,21 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_convert_extents(handle, inode, map, ppath, - EXT4_GET_BLOCKS_CONVERT); - if (err < 0) - return err; - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT, NULL); if (IS_ERR(path)) - return PTR_ERR(path); + return path; + + path = ext4_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) + return path; depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); @@ -3769,18 +3801,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); -out: + if (err) + goto errout; + ext4_ext_show_leaf(inode, path); - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } -static int +static struct ext4_ext_path * convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, + struct ext4_ext_path *path, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3803,25 +3840,27 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { - err = ext4_split_convert_extents(handle, inode, map, ppath, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); - if (err < 0) - return err; - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); if (IS_ERR(path)) - return PTR_ERR(path); + return path; + + path = ext4_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) + return path; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); @@ -3833,7 +3872,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) - return err; + goto errout; ext4_ext_show_leaf(inode, path); 
ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3842,22 +3881,24 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; - return 0; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } -static int +static struct ext4_ext_path * ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, int flags, - unsigned int allocated, ext4_fsblk_t newblock) + struct ext4_ext_path *path, int flags, + unsigned int *allocated, ext4_fsblk_t newblock) { - struct ext4_ext_path __maybe_unused *path = *ppath; - int ret = 0; int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, - allocated); + *allocated); ext4_ext_show_leaf(inode, path); /* @@ -3867,36 +3908,34 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, - allocated, newblock); + *allocated, newblock); /* get_block() before submitting IO, split the extent */ - if (flags & EXT4_GET_BLOCKS_PRE_IO) { - ret = ext4_split_convert_extents(handle, inode, map, ppath, - flags | EXT4_GET_BLOCKS_CONVERT); - if (ret < 0) { - err = ret; - goto out2; - } + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { + path = ext4_split_convert_extents(handle, inode, map, path, + flags | EXT4_GET_BLOCKS_CONVERT, allocated); + if (IS_ERR(path)) + return path; /* - * shouldn't get a 0 return when splitting an extent unless + * shouldn't get a 0 allocated when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(ret == 0)) { + if (unlikely(*allocated == 0)) { EXT4_ERROR_INODE(inode, - "unexpected ret == 0, m_len = %u", + "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; - goto out2; + goto errout; } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - err = ext4_convert_unwritten_extents_endio(handle, inode, map, - ppath); - if (err < 0) - goto out2; + path = ext4_convert_unwritten_extents_endio(handle, inode, + map, path); + if (IS_ERR(path)) + return path; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } @@ -3928,36 +3967,37 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. 
*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); - if (ret < 0) { - err = ret; - goto out2; - } + path = ext4_ext_convert_to_initialized(handle, inode, map, path, + flags, allocated); + if (IS_ERR(path)) + return path; ext4_update_inode_fsync_trans(handle, inode, 1); /* - * shouldn't get a 0 return when converting an unwritten extent + * shouldn't get a 0 allocated when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(ret == 0)) { - EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", + if (unlikely(*allocated == 0)) { + EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; - goto out2; + goto errout; } out: - allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; + if (*allocated > map->m_len) + *allocated = map->m_len; + map->m_len = *allocated; ext4_ext_show_leaf(inode, path); -out2: - return err ? err : allocated; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -4062,6 +4102,73 @@ static int get_implied_cluster_alloc(struct super_block *sb, return 0; } +/* + * Determine the hole length around the given logical block: first try to + * locate and expand the hole from the given @path, then adjust it if it's + * partially or completely converted to delayed extents, insert it into + * the extent cache tree if it's indeed a hole, and finally return the + * length of the determined extent. + */ +static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t lblk) +{ + ext4_lblk_t hole_start, len; + struct extent_status es; + + hole_start = lblk; + len = ext4_ext_find_hole(inode, path, &hole_start); +again: + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + len - 1, &es); + if (!es.es_len) + goto insert_hole; + + /* + * There's a delalloc extent in the hole; handle the cases where the + * delalloc extent is in front of, behind, or straddles the queried + * range. + */ + if (lblk >= es.es_lblk + es.es_len) { + /* + * The delalloc extent is in front of the queried range, + * find again from the queried start block. + */ + len -= lblk - hole_start; + hole_start = lblk; + goto again; + } else if (in_range(lblk, es.es_lblk, es.es_len)) { + /* + * The delalloc extent contains lblk; it must have been + * added after ext4_map_blocks() checked the extent status + * tree, so we are not holding i_rwsem and the delalloc info + * is only stabilized by the i_data_sem we are going to + * release soon. Don't modify the extent status tree; just + * report the extent as a hole and adjust the length to the + * part of the delalloc extent after lblk. + */ + len = es.es_lblk + es.es_len - lblk; + return len; + } else { + /* + * The delalloc extent is partially or completely behind + * the queried range, update hole length until the + * beginning of the delalloc extent.
+ */ + len = min(es.es_lblk - hole_start, len); + } + +insert_hole: + /* Put just found gap into cache to speed up subsequent requests */ + ext_debug(inode, " -> %u:%u\n", hole_start, len); + ext4_es_insert_extent(inode, hole_start, len, ~0, + EXTENT_STATUS_HOLE, false); + + /* Update hole_len to reflect hole size after lblk */ + if (hole_start != lblk) + len -= lblk - hole_start; + + return len; +} /* * Block allocation/map/preallocation routine for extents based files @@ -4069,10 +4176,10 @@ static int get_implied_cluster_alloc(struct super_block *sb, * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated - * if create == 0 and these are pre-allocated blocks + * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * @@ -4088,7 +4195,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; - int err = 0, depth, ret; + int err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4098,10 +4205,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); - path = NULL; goto out; } @@ -4150,8 +4256,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - err = convert_initialized_extent(handle, - inode, map, &path, &allocated); + path = convert_initialized_extent(handle, + inode, map, path, &allocated); + if (IS_ERR(path)) + err = PTR_ERR(path); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; @@ -4163,38 +4271,26 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out; } - ret = ext4_ext_handle_unwritten_extents( - handle, inode, map, &path, flags, - allocated, newblock); - if (ret < 0) - err = ret; - else - allocated = ret; + path = ext4_ext_handle_unwritten_extents( + handle, inode, map, path, flags, + &allocated, newblock); + if (IS_ERR(path)) + err = PTR_ERR(path); goto out; } } /* * requested block isn't allocated yet; - * we couldn't try to create block if create flag is zero + * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - ext4_lblk_t hole_start, hole_len; + ext4_lblk_t len; - hole_start = map->m_lblk; - hole_len = ext4_ext_determine_hole(inode, path, &hole_start); - /* - * put just found gap into cache to speed up - * subsequent requests - */ - ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); - /* Update hole_len to reflect hole size after map->m_lblk */ - if (hole_start != map->m_lblk) - hole_len -= map->m_lblk - hole_start; map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, hole_len); - + map->m_len = min_t(unsigned int, map->m_len, len); goto out; } @@ -4221,7 
+4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, + &ex2, flags); if (err < 0) goto out; @@ -4231,6 +4328,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; + err = 0; goto got_allocated_blocks; } @@ -4303,8 +4401,9 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_UNWRITTEN; } - err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); - if (err) { + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (IS_ERR(path)) { + err = PTR_ERR(path); if (allocated_clusters) { int fb_flags = 0; @@ -4313,7 +4412,7 @@ got_allocated_blocks: * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, @@ -4324,43 +4423,6 @@ got_allocated_blocks: } /* - * Reduce the reserved cluster count to reflect successful deferred - * allocation of delayed allocated clusters or direct allocation of - * clusters discovered to be delayed allocated. Once allocated, a - * cluster is not included in the reserved count. - */ - if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - /* - * When allocating delayed allocated clusters, simply - * reduce the reserved cluster count and claim quota - */ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); - } else { - ext4_lblk_t lblk, len; - unsigned int n; - - /* - * When allocating non-delayed allocated clusters - * (from fallocate, filemap, DIO, or clusters - * allocated when delalloc has been disabled by - * ext4_nonda_switch), reduce the reserved cluster - * count by the number of allocated clusters that - * have previously been delayed allocated. Quota - * has been claimed by ext4_mb_new_blocks() above, - * so release the quota reservations made for any - * previously delayed allocated clusters. - */ - lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); - len = allocated_clusters << sbi->s_cluster_bits; - n = ext4_es_delayed_clu(inode, lblk, len); - if (n > 0) - ext4_da_update_reserve_space(inode, (int) n, 0); - } - } - - /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. */ @@ -4375,6 +4437,20 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: + /* + * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. + * So we know that the depth used here is correct, since there was no + * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. + * If tomorrow we start using this QUERY flag with CREATE, then we will + * need to re-calculate the depth as it might have changed due to block + * allocation. 
+ */ + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { + WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); + if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) + map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + } + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, @@ -4424,7 +4500,9 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int depth = 0; struct ext4_map_blocks map; unsigned int credits; - loff_t epos; + loff_t epos, old_size = i_size_read(inode); + unsigned int blkbits = inode->i_blkbits; + bool alloc_zero = false; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4438,6 +4516,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* + * Doing the actual write zeroing during a running journal transaction + * costs a lot. First allocate an unwritten extent and then + * convert it to written after zeroing it out. + */ + if (flags & EXT4_GET_BLOCKS_ZERO) { + flags &= ~EXT4_GET_BLOCKS_ZERO; + flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; + alloc_zero = true; + } + + /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); @@ -4473,15 +4562,19 @@ retry: * allow a full retry cycle for any remaining allocations */ retries = 0; - map.m_lblk += ret; - map.m_len = len = len - ret; - epos = (loff_t)map.m_lblk << inode->i_blkbits; + epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) - inode->i_mtime = inode_get_ctime(inode); + inode_set_mtime_to_ts(inode, + inode_get_ctime(inode)); + if (epos > old_size) { + pagecache_isize_extended(inode, old_size, epos); + ext4_zero_partial_blocks(handle, inode, + old_size, epos - old_size); + } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4489,6 +4582,21 @@ retry: ret2 = ret3 ? ret3 : ret2; if (unlikely(ret2)) break; + + if (alloc_zero && + (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { + ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, + map.m_len); + if (likely(!ret2)) + ret2 = ext4_convert_unwritten_extents(NULL, + inode, (loff_t)map.m_lblk << blkbits, + (loff_t)map.m_len << blkbits); + if (ret2) + break; + } + + map.m_lblk += ret; + map.m_len = len = len - ret; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -4504,129 +4612,69 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; - unsigned int max_blocks; loff_t new_size = 0; - int ret = 0; - int flags; - int credits; - int partial_begin, partial_end; - loff_t start, end; - ext4_lblk_t lblk; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + unsigned int blocksize = i_blocksize(inode); unsigned int blkbits = inode->i_blkbits; + int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); - /* - * Round up offset. This is not fallocate, we need to zero out - * blocks, so convert interior block aligned part of the range to - * unwritten and possibly manually zero out unaligned parts of the - * range. 
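EXT4_LBLK_TO_B() above replaces an open-coded shift; the details that matter are that the block number is widened to loff_t before shifting and that the resulting end position is clamped to the requested size before the inode is updated. A small self-contained sketch of that computation (4K blocks and the sample numbers are arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative equivalent of EXT4_LBLK_TO_B(): logical block -> byte
     * offset, widened before the shift so large files don't overflow. */
    static int64_t lblk_to_b(uint32_t lblk, unsigned int blkbits)
    {
        return (int64_t)lblk << blkbits;
    }

    int main(void)
    {
        unsigned int blkbits = 12;           /* 4K blocks */
        uint32_t m_lblk = 1000;
        int ret = 8;                         /* blocks allocated this pass */
        int64_t new_size = 4100000;

        int64_t epos = lblk_to_b(m_lblk + ret, blkbits);  /* 4128768 */
        if (epos > new_size)
            epos = new_size;                 /* clamp, as the loop above does */
        printf("epos = %lld\n", (long long)epos);         /* prints 4100000 */
        return 0;
    }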
- */ - start = round_up(offset, 1 << blkbits); - end = round_down((offset + len), 1 << blkbits); - - if (start < offset || end > offset + len) - return -EINVAL; - partial_begin = offset & ((1 << blkbits) - 1); - partial_end = (offset + len) & ((1 << blkbits) - 1); - - lblk = start >> blkbits; - max_blocks = (end >> blkbits); - if (max_blocks < lblk) - max_blocks = 0; - else - max_blocks -= lblk; - - inode_lock(inode); - - /* - * Indirect files do not support unwritten extents - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out_mutex; - } + /* Indirect files do not support unwritten extents */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) - goto out_mutex; + return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - /* Preallocate the range including the unaligned edges */ - if (partial_begin || partial_end) { - ret = ext4_alloc_file_blocks(file, - round_down(offset, 1 << blkbits) >> blkbits, - (round_up((offset + len), 1 << blkbits) - - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags); - if (ret) - goto out_mutex; + if (!IS_ALIGNED(offset | end, blocksize)) { + ext4_lblk_t alloc_lblk = offset >> blkbits; + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, + new_size, flags); + if (ret) + return ret; } - /* Zero range excluding the unaligned edges */ - if (max_blocks > 0) { - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE); - - /* - * Prevent page faults from reinstantiating pages we have - * released from page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - ret = ext4_update_disksize_before_punch(inode, offset, len); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) + return ret; - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. 
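The rewritten entry path folds the old partial_begin/partial_end bookkeeping into one test: IS_ALIGNED(offset | end, blocksize) is true only when both edges are block-aligned, because OR-ing the two offsets merges their low bits. A userspace sketch of the edge/interior split (4096-byte blocks assumed):

    #include <stdint.h>
    #include <stdio.h>

    #define IS_ALIGNED_(x, a) (((x) & ((a) - 1)) == 0)

    int main(void)
    {
        uint64_t blocksize = 4096, offset = 5000, len = 20000;
        uint64_t end = offset + len;

        /* block-aligned interior: [round_up(offset), round_down(end)) --
         * zeroed via unwritten extents; the edges need explicit zeroing */
        uint64_t start_b = (offset + blocksize - 1) & ~(blocksize - 1);
        uint64_t end_b   = end & ~(blocksize - 1);

        /* one test covers both edges */
        int need_partial = !IS_ALIGNED_(offset | end, blocksize);

        printf("interior [%llu, %llu), partial edges: %s\n",
               (unsigned long long)start_b, (unsigned long long)end_b,
               need_partial ? "yes" : "no");
        return 0;
    }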
- */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - } + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode_set_ctime_current(inode); + /* Zero range excluding the unaligned edges */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t zero_blks = end_lblk - start_lblk; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags); - filemap_invalidate_unlock(mapping); + if (mode & FALLOC_FL_WRITE_ZEROES) + flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; + else + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, + new_size, flags); if (ret) - goto out_mutex; + return ret; } - if (!partial_begin && !partial_end) - goto out_mutex; + /* Finish zeroing out if it doesn't contain partial block */ + if (IS_ALIGNED(offset | end, blocksize)) + return ret; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4639,27 +4687,69 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_mutex; + return ret; } - inode->i_mtime = inode_set_ctime_current(inode); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret) + goto out_handle; + if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); -out_mutex: - inode_unlock(inode); + return ret; +} + +static long ext4_do_fallocate(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + loff_t end = offset + len; + loff_t new_size = 0; + ext4_lblk_t start_lblk, len_lblk; + int ret; + + trace_ext4_fallocate_enter(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + start_lblk = offset >> inode->i_blkbits; + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); + + /* We only support preallocation for extent-based files only. 
*/ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); return ret; } @@ -4673,12 +4763,8 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - loff_t new_size = 0; - unsigned int max_blocks; - int ret = 0; - int flags; - ext4_lblk_t lblk; - unsigned int blkbits = inode->i_blkbits; + struct address_space *mapping = file->f_mapping; + int ret; /* * Encrypted inodes can't handle collapse range or insert @@ -4689,83 +4775,158 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; + /* + * Don't allow writing zeroes if the underlying device does not + * enable the unmap write zeroes operation. + */ + if ((mode & FALLOC_FL_WRITE_ZEROES) && + !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev)) + return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | - FALLOC_FL_INSERT_RANGE)) + FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES)) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_convert_inline_data(inode); - inode_unlock(inode); if (ret) - goto exit; + goto out_inode_lock; - if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(file, offset, len); - goto exit; - } + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); - if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(file, offset, len); - goto exit; - } + ret = file_modified(file); + if (ret) + goto out_inode_lock; - if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(file, offset, len); - goto exit; + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { + ret = ext4_do_fallocate(file, offset, len, mode); + goto out_inode_lock; } - if (mode & FALLOC_FL_ZERO_RANGE) { + /* + * Follow-up operations will drop page cache, hold invalidate lock + * to prevent page faults from reinstantiating pages we have + * released from page cache. 
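With the dispatch restructured, every fallocate mode now shares one prologue, and only the page-cache-touching modes in the switch that follows take the invalidate_lock. The ordering, compressed into a runnable no-op outline (every helper below is a stand-in, not kernel API):

    /* Ordering sketch only: each function is a printing stand-in. */
    #include <stdio.h>

    static void inode_lock_(void)     { puts("1. i_rwsem (exclusive)"); }
    static void dio_wait_(void)       { puts("2. wait for running direct IO"); }
    static void file_modified_(void)  { puts("3. file_modified(): setuid/times"); }
    static void inval_lock_(void)     { puts("4. invalidate_lock (faults blocked)"); }
    static void break_layouts_(void)  { puts("5. ext4_break_layouts()"); }
    static void do_op_(void)          { puts("6. punch/collapse/insert/zero op"); }

    int main(void)
    {
        /* ALLOCATE_RANGE returns before step 4: plain preallocation never
         * touches the page cache, so it skips the invalidate_lock. */
        inode_lock_();
        dio_wait_();
        file_modified_();
        inval_lock_();
        break_layouts_();
        do_op_();
        return 0;
    }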
+ */ + filemap_invalidate_lock(mapping); + + ret = ext4_break_layouts(inode); + if (ret) + goto out_invalidate_lock; + + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: + ret = ext4_punch_hole(file, offset, len); + break; + case FALLOC_FL_COLLAPSE_RANGE: + ret = ext4_collapse_range(file, offset, len); + break; + case FALLOC_FL_INSERT_RANGE: + ret = ext4_insert_range(file, offset, len); + break; + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_WRITE_ZEROES: ret = ext4_zero_range(file, offset, len, mode); - goto exit; + break; + default: + ret = -EOPNOTSUPP; } - trace_ext4_fallocate_enter(inode, offset, len, mode); - lblk = offset >> blkbits; +out_invalidate_lock: + filemap_invalidate_unlock(mapping); +out_inode_lock: + inode_unlock(inode); + return ret; +} + +/* + * This function converts a range of blocks to written extents. The caller of + * this function will pass the start offset and the size. All unwritten extents + * within this range will be converted to written extents. + * + * This function is called from the direct IO end-io callback function for + * atomic writes, to convert the unwritten extents after IO is completed. + * + * Note that the requirement for atomic writes is that all conversion should + * happen atomically in a single fs journal transaction. We mainly allocate + * unwritten extents either over a hole or over a pre-existing unwritten extent + * range in ext4_map_blocks_atomic_write(). The only case where we can have multiple + * unwritten extents in a range [offset, offset+len) is when there is a split + * unwritten extent between two leaf nodes which was cached in extent status + * cache during ext4_iomap_alloc() time. That will allow + * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going + * into the slow path. That means we might need a loop for conversion of this + * unwritten extent split across leaf blocks within a single journal transaction. + * Extents split across leaf nodes are a rare case, but let's still handle that + * to meet the requirements of multi-fsblock atomic writes. + * + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0, ret2 = 0, ret3 = 0; + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; + int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; + + map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - inode_lock(inode); + if (!handle) { + /* + * TODO: An optimization can be added later by having an extent + * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that, + * it can tell if the extent in the cache is a split extent. + * But for now let's assume pextents as 2 always. 
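The conversion loop that follows advances by however many blocks ext4_map_blocks() actually converted, so an unwritten extent split across two leaf blocks simply costs one extra iteration inside the same transaction. The same loop shape in a standalone sketch, where convert_once() is a made-up stand-in that converts at most one extent per call:

    #include <stdio.h>

    /* Pretend layout: [0, 24) is covered by two unwritten extents of
     * 16 and 8 blocks, i.e. a range split across two leaf blocks. */
    static int convert_once(unsigned int lblk, unsigned int len)
    {
        unsigned int left = (lblk < 16) ? 16 - lblk : 24 - lblk;
        return (int)(len < left ? len : left);
    }

    int main(void)
    {
        unsigned int lblk = 0, max_blocks = 24;
        int ret = 0;

        /* mirrors: while (ret >= 0 && ret < max_blocks) { advance; map; } */
        while (ret >= 0 && (unsigned int)ret < max_blocks) {
            lblk += ret;
            max_blocks -= ret;
            ret = convert_once(lblk, max_blocks);
            printf("converted %d blocks at %u\n", ret, lblk);
            if (ret <= 0)
                break;
        }
        return 0;
    }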
+ */ + credits = ext4_meta_trans_blocks(inode, max_blocks, 2); + } - /* - * We only support preallocation for extent-based files only - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out; + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto out; + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret != max_blocks) + ext4_msg(inode->i_sb, KERN_INFO, + "inode #%lu: block %u: len %u: " + "split block mapping found for atomic write, " + "ret = %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + if (ret <= 0) + break; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); + ret2 = ext4_mark_inode_dirty(handle, inode); - ret = file_modified(file); - if (ret) - goto out; + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - if (ret) - goto out; + if (ret <= 0 || ret2) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "returned %d or %d", + inode->i_ino, map.m_lblk, + map.m_len, ret, ret2); - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); - } -out: - inode_unlock(inode); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -exit: - return ret; + return ret > 0 ? ret2 : ret; } /* @@ -4807,8 +4968,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, break; } } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. 
+ */ ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " @@ -4919,12 +5086,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { - u64 maxbytes; - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - maxbytes = inode->i_sb->s_maxbytes; - else - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; @@ -4944,10 +5106,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int error = 0; + inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) - return error; + goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } @@ -4958,15 +5121,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) - return error; + goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - return iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } - - return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); +unlock: + inode_unlock_shared(inode); + return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -4987,7 +5154,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + inode_lock_shared(inode); error = ext4_ext_precache(inode); + inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; @@ -5046,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, credits = depth + 2; } - restart_credits = ext4_writepage_trans_blocks(inode); + restart_credits = ext4_chunk_trans_extent(inode, 0); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { @@ -5148,7 +5317,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { - path = ext4_find_extent(inode, start - 1, &path, + path = ext4_find_extent(inode, start - 1, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5197,7 +5366,7 @@ again: * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { - path = ext4_find_extent(inode, *iterator, &path, + path = ext4_find_extent(inode, *iterator, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5266,109 +5435,74 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t punch_start, punch_stop; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; - loff_t new_size, ioffset; + loff_t start, new_size; int ret; - /* - * We need to test this early because xfstests assumes that a - * collapse range of (0, 1) will return EOPNOTSUPP if the file - * system does not support collapse range. 
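ext4_get_maxbytes() above collapses the old extents-vs-bitmap branch into one helper, and the range check that uses it clamps the request instead of failing when it merely runs past the largest valid offset. Roughly, in userspace form (the maxbytes value is a placeholder):

    #include <stdint.h>
    #include <stdio.h>
    #include <errno.h>

    static int check_ranges(uint64_t maxbytes, uint64_t start, uint64_t *len)
    {
        if (*len == 0)
            return -EINVAL;
        if (start > maxbytes)
            return -EFBIG;
        /* shrink a request that extends beyond the largest valid offset */
        if (*len > maxbytes || start > maxbytes - *len)
            *len = maxbytes - start;
        return 0;
    }

    int main(void)
    {
        uint64_t maxbytes = 1ULL << 32;   /* placeholder for s_maxbytes */
        uint64_t len = 1ULL << 33;
        int err = check_ranges(maxbytes, 1024, &len);
        printf("err=%d len=%llu\n", err, (unsigned long long)len);
        return 0;
    }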
- */ + trace_ext4_collapse_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_collapse_range(inode, offset, len); - - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_mmap; + if (end >= inode->i_size) + return -EINVAL; /* + * Write tail of the last page before removed range and data that + * will be shifted since they will get removed from the page cache + * below. We are also protected from pages becoming dirty by + * i_rwsem and invalidate_lock. * Need to round down offset to be aligned with page size boundary * for page size > block size. */ - ioffset = round_down(offset, PAGE_SIZE); - /* - * Write tail of the last page before removed range since it will get - * removed from the page cache below. - */ - ret = filemap_write_and_wait_range(mapping, ioffset, offset); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, offset); + if (!ret) + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) - goto out_mmap; - /* - * Write data that will be shifted to preserve them when discarding - * page cache below. We are also protected from pages becoming dirty - * by i_rwsem and invalidate_lock. 
- */ - ret = filemap_write_and_wait_range(mapping, offset + len, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + return ret; - credits = ext4_writepage_trans_blocks(inode); + truncate_pagecache(inode, start); + + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + start_lblk = offset >> inode->i_blkbits; + end_lblk = (offset + len) >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start, SHIFT_LEFT); + ret = ext4_ext_shift_extents(inode, handle, end_lblk, + end_lblk - start_lblk, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } new_size = inode->i_size - len; @@ -5376,18 +5510,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5407,99 +5539,65 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret = 0, depth, split_flag = 0; - loff_t ioffset; + int ret, depth, split_flag = 0; + loff_t start; - /* - * We need to test this early because xfstests assumes that an - * insert range of (0, 1) will return EOPNOTSUPP if the file - * system does not support insert range. - */ + trace_ext4_insert_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs cluster size aligned regions. 
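Both collapse and insert reduce to the same block arithmetic once the cluster-alignment and EOF checks pass: compute the boundary blocks, then shift everything above them by the width of the range. A sketch of the collapse-side bookkeeping (4K blocks assumed):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int blkbits = 12;
        uint64_t i_size = 1 << 20;                 /* 1 MiB file */
        uint64_t offset = 64 << 10, len = 128 << 10;

        uint32_t start_lblk = offset >> blkbits;          /* first removed */
        uint32_t end_lblk   = (offset + len) >> blkbits;  /* first kept    */

        /* every extent from end_lblk on moves left by the removed width */
        uint32_t shift = end_lblk - start_lblk;
        uint64_t new_size = i_size - len;

        printf("remove [%u, %u), shift left by %u, new size %llu\n",
               start_lblk, end_lblk, shift, (unsigned long long)new_size);
        return 0;
    }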
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_insert_range(inode, offset, len); - - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Check whether the maximum file size would be exceeded */ - if (len > inode->i_sb->s_maxbytes - inode->i_size) { - ret = -EFBIG; - goto out_mutex; - } - /* Offset must be less than i_size */ - if (offset >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; + if (offset >= inode->i_size) + return -EINVAL; + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) + return -EFBIG; /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. + * Write out all dirty pages. Need to round down to align start offset + * to page size boundary for page size > block size. */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) - goto out_mmap; + return ret; - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + truncate_pagecache(inode, start); - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) - goto out_stop; + goto out_handle; + + start_lblk = offset >> inode->i_blkbits; + len_lblk = len >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - path = ext4_find_extent(inode, offset_lblk, NULL, 0); + path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + ret = PTR_ERR(path); + goto out_handle; } depth = ext_depth(inode); @@ -5509,51 +5607,47 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ee_len = ext4_ext_get_actual_len(extent); /* - * If offset_lblk is not the starting block of extent, split - * the extent @offset_lblk + * If start_lblk is not the starting block of extent, split + * the extent @start_lblk */ - if ((offset_lblk > ee_start_lblk) && - (offset_lblk < (ee_start_lblk + ee_len))) { + if ((start_lblk > ee_start_lblk) && + (start_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; - ret = ext4_split_extent_at(handle, inode, &path, - offset_lblk, split_flag, + 
path = ext4_split_extent_at(handle, inode, path, + start_lblk, split_flag, EXT4_EX_NOCACHE | - EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_SPLIT_NOMERGE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } - ext4_free_ext_path(path); - if (ret < 0) { + if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + ret = PTR_ERR(path); + goto out_handle; } - } else { - ext4_free_ext_path(path); } - ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); + ext4_free_ext_path(path); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); /* - * if offset_lblk lies in a hole which is at start of file, use + * if start_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); - + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5600,25 +5694,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, int e1_len, e2_len, len; int split = 0; - path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); - path1 = NULL; - finish: - count = 0; - goto repeat; + goto errout; } - path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); - path2 = NULL; - goto finish; + goto errout; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) - goto finish; + goto errout; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); @@ -5640,7 +5730,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) - goto finish; + goto errout; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) @@ -5650,28 +5740,32 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, lblk1 += len; lblk2 += len; count -= len; - goto repeat; + continue; } /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode1, - &path1, lblk1, 0); - if (unlikely(*erp)) - goto finish; + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + goto errout; + } } if (e2_blk < lblk2) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode2, - &path2, lblk2, 0); - if (unlikely(*erp)) - goto finish; + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + goto errout; + } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. 
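Each pass of the swap loop may only exchange a run that lies entirely inside one extent on both inodes, so the candidate length is clamped by the remainder of both extents, and any extent that protrudes past the chosen run is force-split first; after a split the paths are stale, which is why the rewritten loop restarts with continue instead of the old repeat label. The length clamp in isolation:

    #include <stdio.h>

    static unsigned int min_(unsigned int a, unsigned int b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned int count = 100;           /* remaining swap request      */
        unsigned int e1_left = 40;          /* extent room past lblk1      */
        unsigned int e2_left = 64;          /* extent room past lblk2      */

        /* the run must fit in the request and in both extents; if len
         * differs from an extent's remainder, that extent is split first */
        unsigned int len = min_(count, min_(e1_left, e2_left));
        printf("swap %u blocks this pass (split inode1: %s, inode2: %s)\n",
               len, len != e1_left ? "yes" : "no",
               len != e2_left ? "yes" : "no");
        return 0;
    }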
*/ if (split) - goto repeat; + continue; /* Prepare right boundary */ len = count; @@ -5682,30 +5776,34 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (len != e1_len) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode1, - &path1, lblk1 + len, 0); - if (unlikely(*erp)) - goto finish; + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1 + len, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + goto errout; + } } if (len != e2_len) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode2, - &path2, lblk2 + len, 0); - if (*erp) - goto finish; + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2 + len, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + goto errout; + } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) - goto repeat; + continue; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; @@ -5723,7 +5821,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* @@ -5733,17 +5831,17 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, * aborted anyway. */ if (unlikely(*erp)) - goto finish; + goto errout; + lblk1 += len; lblk2 += len; replaced_count += len; count -= len; - - repeat: - ext4_free_ext_path(path1); - ext4_free_ext_path(path2); - path1 = path2 = NULL; } + +errout: + ext4_free_ext_path(path1); + ext4_free_ext_path(path2); return replaced_count; } @@ -5778,11 +5876,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out; - } + if (IS_ERR(path)) + return PTR_ERR(path); depth = ext_depth(inode); @@ -5844,7 +5939,7 @@ out: int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk) { - struct ext4_ext_path *path = NULL, *ppath; + struct ext4_ext_path *path; struct ext4_extent *ex; int ret; @@ -5860,30 +5955,34 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, if (le32_to_cpu(ex->ee_block) != start || ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ - ppath = path; down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1); + path = ext4_force_split_extent_at(NULL, inode, path, start, 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; - kfree(path); - path = ext4_find_extent(inode, start, NULL, 0); + } + + path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) - return -1; - ppath = path; + return PTR_ERR(path); + ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); + if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &ppath, - start + len, 1); + path = ext4_force_split_extent_at(NULL, inode, path, + start + len, 1); 
up_write(&EXT4_I(inode)->i_data_sem); - if (ret) + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; - kfree(path); - path = ext4_find_extent(inode, start, NULL, 0); + } + + path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) - return -EINVAL; + return PTR_ERR(path); ex = path[path->p_depth].p_ext; } } @@ -5965,12 +6064,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); + if (!ex) goto out; - } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; @@ -5996,32 +6092,28 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) ret = skip_hole(inode, &cur); if (ret < 0) goto out; - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; numblks += path->p_depth; - ext4_free_ext_path(path); while (cur < end) { - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); - return 0; - } + if (!ex) + goto cleanup; + cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); - if (ret < 0) { - ext4_free_ext_path(path); + if (ret < 0) break; - } - path2 = ext4_find_extent(inode, cur, NULL, 0); - if (IS_ERR(path2)) { - ext4_free_ext_path(path); + + path2 = ext4_find_extent(inode, cur, path2, 0); + if (IS_ERR(path2)) break; - } + for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) @@ -6033,13 +6125,14 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (cmp1 != cmp2 && cmp2 != 0) numblks++; } - ext4_free_ext_path(path); - ext4_free_ext_path(path2); } out: inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); ext4_mark_inode_dirty(NULL, inode); +cleanup: + ext4_free_ext_path(path); + ext4_free_ext_path(path2); return 0; } @@ -6060,12 +6153,9 @@ int ext4_ext_clear_bb(struct inode *inode) if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); - return 0; - } + if (!ex) + goto out; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_free_ext_path(path); cur = 0; while (cur < end) { @@ -6075,23 +6165,25 @@ int ext4_ext_clear_bb(struct inode *inode) if (ret < 0) break; if (ret > 0) { - path = ext4_find_extent(inode, map.m_lblk, NULL, 0); - if (!IS_ERR_OR_NULL(path)) { + path = ext4_find_extent(inode, map.m_lblk, path, 0); + if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) { - ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 0); + path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } - ext4_free_ext_path(path); + } else { + path = NULL; } - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } +out: + ext4_free_ext_path(path); return 0; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9b5b8951afb4..e04fbf10fe4f 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -120,9 +120,40 @@ * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. 
* + * ========================================================================== + * 3. Assurance of Ext4 extent status tree consistency + * + * When mapping blocks, Ext4 queries the extent status tree first and should + * always trust that the extent status tree is consistent and up to date. + * Therefore, it is important to adhere to the following rules when creating, + * modifying and removing extents. + * + * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, + * the extent information should always be processed through the extent + * status tree instead of being organized manually through the on-disk + * extent tree. + * + * 2. When updating the extent tree, Ext4 should acquire the i_data_sem + * exclusively and update the extent status tree atomically. If the extents + * to be modified are large enough to exceed the range that a single + * i_data_sem can process (as ext4_datasem_ensure_credits() may drop + * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() + * does): + * + * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against page faults, as well as reads and writes that may + * concurrently modify the extent status tree. + * b) Evict all page cache in the affected range and recommend rebuilding + * or dropping the extent status tree after modifying the on-disk + * extent tree. This ensures exclusion against concurrent writebacks + * that do not hold those locks but only hold a folio lock. + * + * 3. Based on the rules above, when querying block mappings, Ext4 should at + * least hold the i_rwsem or invalidate_lock or folio lock(s) for the + * specified querying range. * * ========================================================================== - * 3. Performance analysis + * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are @@ -134,7 +165,7 @@ * * * ========================================================================== - * 4. TODO list + * 5. TODO list * * -- Refactor delayed space reservation * @@ -152,8 +183,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc); int __init ext4_init_es(void) { @@ -203,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es) return es->es_lblk + es->es_len - 1; } +static inline void ext4_es_inc_seq(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); +} + /* * search through the tree for a delayed extent with a given offset. If * it can't be found, try to find next extent. 
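The new i_es_seq counter above gives lookups a cheap staleness test: sample the counter together with the answer, and revalidate if it has moved by the time the result is used. A hypothetical consumer, sketched with a plain integer (the kernel side increments under i_es_lock with WRITE_ONCE()):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t i_es_seq;              /* bumped by every tree mutation */

    /* stand-in for ext4_es_lookup_extent() with its new pseq out-param */
    static uint64_t lookup(uint64_t *pseq)
    {
        *pseq = i_es_seq;
        return 42;                         /* some cached mapping */
    }

    int main(void)
    {
        uint64_t seq;
        uint64_t mapping = lookup(&seq);

        i_es_seq++;                        /* concurrent insert/remove */

        if (seq != i_es_seq)
            printf("mapping %llu is stale, retry the lookup\n",
                   (unsigned long long)mapping);
        return 0;
    }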
@@ -309,6 +348,8 @@ void ext4_es_find_extent_range(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { + es->es_lblk = es->es_len = es->es_pblk = 0; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -448,6 +489,19 @@ static void ext4_es_list_del(struct inode *inode) spin_unlock(&sbi->s_es_lock); } +static inline struct pending_reservation *__alloc_pending(bool nofail) +{ + if (!nofail) + return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + + return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); +} + +static inline void __free_pending(struct pending_reservation *pr) +{ + kmem_cache_free(ext4_pending_cachep, pr); +} + /* * Returns true if we cannot fail to allocate memory for this extent_status * entry and cannot reclaim it until its status changes. @@ -542,8 +596,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1, if (ext4_es_is_hole(es1)) return 1; - /* we need to check delayed extent is without unwritten status */ - if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + /* we need to check delayed extent */ + if (ext4_es_is_delayed(es1)) return 1; return 0; @@ -832,74 +886,113 @@ out: */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status) + unsigned int status, bool delalloc_reserve_used) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; - int err1 = 0; - int err2 = 0; + int err1 = 0, err2 = 0, err3 = 0; + int resv_used = 0, pending = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; + struct pending_reservation *pr = NULL; + bool revise_pending = false; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", - lblk, len, pblk, status, inode->i_ino); + es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", + lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) return; BUG_ON(end < lblk); - - if ((status & EXTENT_STATUS_DELAYED) && - (status & EXTENT_STATUS_WRITTEN)) { - ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " - " delayed and written which can potentially " - " cause data loss.", lblk, len); - WARN_ON(1); - } + WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); - trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); + revise_pending = sbi->s_cluster_ratio > 1 && + test_opt(inode->i_sb, DELALLOC) && + (status & (EXTENT_STATUS_WRITTEN | + EXTENT_STATUS_UNWRITTEN)); retry: if (err1 && !es1) es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); + if ((err1 || err2 || err3 < 0) && revise_pending && !pr) + pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. 
*/ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } - if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && - (status & EXTENT_STATUS_WRITTEN || - status & EXTENT_STATUS_UNWRITTEN)) - __revise_pending(inode, lblk, len); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); + if (revise_pending) { + err3 = __revise_pending(inode, lblk, len, &pr); + if (err3 < 0) + goto error; + if (pr) { + __free_pending(pr); + pr = NULL; + } + pending = err3; + } + /* + * TODO: For cache on-disk extents, there is no need to increment + * the sequence counter, this requires future optimization. + */ + ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2) + /* + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. + * + * When direct allocating (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by ext4_nonda_switch()) + * an extent either 1) contains delayed blocks but start with + * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed + * allocated blocks which belong to delayed allocated clusters when + * bigalloc feature is enabled, quota has already been claimed by + * ext4_mb_new_blocks(), so release the quota reservations made for + * any previously delayed allocated clusters instead of claim them + * again. + */ + resv_used += pending; + if (resv_used) + ext4_da_update_reserve_space(inode, resv_used, + delalloc_reserve_used); + + if (err1 || err2 || err3 < 0) goto retry; + trace_ext4_es_insert_extent(inode, &newes); ext4_es_print_tree(inode); return; } @@ -946,8 +1039,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t *next_lblk, - struct extent_status *es) + ext4_lblk_t *next_lblk, struct extent_status *es, + u64 *pseq) { struct ext4_es_tree *tree; struct ext4_es_stats *stats; @@ -1006,6 +1099,8 @@ out: } else *next_lblk = 0; } + if (pseq) + *pseq = EXT4_I(inode)->i_es_seq; } else { percpu_counter_inc(&stats->es_stats_cache_misses); } @@ -1017,7 +1112,7 @@ out: } struct rsvd_count { - int ndelonly; + int ndelayed; bool first_do_lblk_found; ext4_lblk_t first_do_lblk; ext4_lblk_t last_do_lblk; @@ -1043,10 +1138,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct rb_node *node; - rc->ndelonly = 0; + rc->ndelayed = 0; /* - * for bigalloc, note the first delonly block in the range has not + * for bigalloc, note the first delayed block in the range has not * been found, record the extent containing the block to the left of * the region to be removed, if any, and note that there's no partial * cluster to track @@ -1066,9 +1161,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, } /* - * count_rsvd - count the clusters containing delayed and not unwritten - * (delonly) blocks in a range within an extent and add to - * the running tally in rsvd_count + * count_rsvd - count the clusters containing delayed blocks in a range + * within an extent and add to the running tally in rsvd_count * * @inode - file containing extent * @lblk - first block in range @@ -1085,13 
+1179,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t i, end, nclu; - if (!ext4_es_is_delonly(es)) + if (!ext4_es_is_delayed(es)) return; WARN_ON(len <= 0); if (sbi->s_cluster_ratio == 1) { - rc->ndelonly += (int) len; + rc->ndelayed += (int) len; return; } @@ -1101,7 +1195,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, end = lblk + (ext4_lblk_t) len - 1; end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; - /* record the first block of the first delonly extent seen */ + /* record the first block of the first delayed extent seen */ if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; @@ -1115,7 +1209,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, * doesn't start with it, count it and stop tracking */ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { - rc->ndelonly++; + rc->ndelayed++; rc->partial = false; } @@ -1125,7 +1219,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, */ if (EXT4_LBLK_COFF(sbi, i) != 0) { if (end >= EXT4_LBLK_CFILL(sbi, i)) { - rc->ndelonly++; + rc->ndelayed++; rc->partial = false; i = EXT4_LBLK_CFILL(sbi, i) + 1; } @@ -1133,11 +1227,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, /* * if the current cluster starts on a cluster boundary, count the - * number of whole delonly clusters in the extent + * number of whole delayed clusters in the extent */ if ((i + sbi->s_cluster_ratio - 1) <= end) { nclu = (end - i + 1) >> sbi->s_cluster_bits; - rc->ndelonly += nclu; + rc->ndelayed += nclu; i += nclu << sbi->s_cluster_bits; } @@ -1197,10 +1291,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root, * @rc - pointer to reserved count data * * The number of reservations to be released is equal to the number of - * clusters containing delayed and not unwritten (delonly) blocks within - * the range, minus the number of clusters still containing delonly blocks - * at the ends of the range, and minus the number of pending reservations - * within the range. + * clusters containing delayed blocks within the range, minus the number of + * clusters still containing delayed blocks at the ends of the range, and + * minus the number of pending reservations within the range. 
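Under bigalloc the reservation accounting works in clusters, so after walking any partial leading cluster the code counts whole clusters with shifts rather than block by block. The whole-cluster step on its own (a 16-block cluster is assumed):

    #include <stdio.h>

    int main(void)
    {
        unsigned int cluster_bits = 4;           /* 16 blocks per cluster */
        unsigned int i = 32, end = 95;           /* i is cluster-aligned  */

        /* count whole clusters in [i, end], as count_rsvd() does above */
        unsigned int nclu = 0;
        if (i + (1u << cluster_bits) - 1 <= end) {
            nclu = (end - i + 1) >> cluster_bits;
            i += nclu << cluster_bits;
        }
        printf("%u whole clusters, next partial block at %u\n", nclu, i);
        return 0;
    }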
*/ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct extent_status *right_es, @@ -1211,33 +1304,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node *node; ext4_lblk_t first_lclu, last_lclu; - bool left_delonly, right_delonly, count_pending; + bool left_delayed, right_delayed, count_pending; struct extent_status *es; if (sbi->s_cluster_ratio > 1) { /* count any remaining partial cluster */ if (rc->partial) - rc->ndelonly++; + rc->ndelayed++; - if (rc->ndelonly == 0) + if (rc->ndelayed == 0) return 0; first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); /* - * decrease the delonly count by the number of clusters at the - * ends of the range that still contain delonly blocks - + * decrease the delayed count by the number of clusters at the + * ends of the range that still contain delayed blocks - * these clusters still need to be reserved */ - left_delonly = right_delonly = false; + left_delayed = right_delayed = false; es = rc->left_es; while (es && ext4_es_end(es) >= EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { - if (ext4_es_is_delonly(es)) { - rc->ndelonly--; - left_delonly = true; + if (ext4_es_is_delayed(es)) { + rc->ndelayed--; + left_delayed = true; break; } node = rb_prev(&es->rb_node); @@ -1245,7 +1338,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, break; es = rb_entry(node, struct extent_status, rb_node); } - if (right_es && (!left_delonly || first_lclu != last_lclu)) { + if (right_es && (!left_delayed || first_lclu != last_lclu)) { if (end < ext4_es_end(right_es)) { es = right_es; } else { @@ -1255,9 +1348,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, } while (es && es->es_lblk <= EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { - if (ext4_es_is_delonly(es)) { - rc->ndelonly--; - right_delonly = true; + if (ext4_es_is_delayed(es)) { + rc->ndelayed--; + right_delayed = true; break; } node = rb_next(&es->rb_node); @@ -1271,21 +1364,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, /* * Determine the block range that should be searched for * pending reservations, if any. Clusters on the ends of the - * original removed range containing delonly blocks are + * original removed range containing delayed blocks are * excluded. They've already been accounted for and it's not * possible to determine if an associated pending reservation * should be released with the information available in the * extents status tree. 
*/ if (first_lclu == last_lclu) { - if (left_delonly | right_delonly) + if (left_delayed | right_delayed) count_pending = false; else count_pending = true; } else { - if (left_delonly) + if (left_delayed) first_lclu++; - if (right_delonly) + if (right_delayed) last_lclu--; if (first_lclu <= last_lclu) count_pending = true; @@ -1296,16 +1389,16 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, /* * a pending reservation found between first_lclu and last_lclu * represents an allocated cluster that contained at least one - * delonly block, so the delonly total must be reduced by one + * delayed block, so the delayed total must be reduced by one * for each pending reservation found and released */ if (count_pending) { pr = __pr_tree_search(&tree->root, first_lclu); while (pr && pr->lclu <= last_lclu) { - rc->ndelonly--; + rc->ndelayed--; node = rb_next(&pr->rb_node); rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); + __free_pending(pr); if (!node) break; pr = rb_entry(node, struct pending_reservation, @@ -1313,7 +1406,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, } } } - return rc->ndelonly; + return rc->ndelayed; } @@ -1399,8 +1492,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } } if (count_reserved) - count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, - &orig_es, &rc); + count_rsvd(inode, orig_es.es_lblk + len1, + orig_es.es_len - len1 - len2, &orig_es, &rc); goto out_get_reserved; } @@ -1471,7 +1564,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); @@ -1491,15 +1583,23 @@ retry: */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); - if (es && !es->es_len) - __es_free_extent(es); + if (err) + goto error; + /* Free preallocated extent if it didn't get used. 
*/ + if (es) { + if (!es->es_len) + __es_free_extent(es); + es = NULL; + } + ext4_es_inc_seq(inode); +error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; + trace_ext4_es_remove_extent(inode, lblk, len); ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); - return; } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, @@ -1596,7 +1696,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, unsigned long nr; struct ext4_sb_info *sbi; - sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; @@ -1605,8 +1705,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); + struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; @@ -1690,13 +1789,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err3; - sbi->s_es_shrinker.scan_objects = ext4_es_scan; - sbi->s_es_shrinker.count_objects = ext4_es_count; - sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", - sbi->s_sb->s_id); - if (err) + sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); + if (!sbi->s_es_shrinker) { + err = -ENOMEM; goto err4; + } + + sbi->s_es_shrinker->scan_objects = ext4_es_scan; + sbi->s_es_shrinker->count_objects = ext4_es_count; + sbi->s_es_shrinker->private_data = sbi; + + shrinker_register(sbi->s_es_shrinker); return 0; err4: @@ -1716,7 +1819,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); - unregister_shrinker(&sbi->s_es_shrinker); + shrinker_free(sbi->s_es_shrinker); } /* @@ -1897,11 +2000,13 @@ static struct pending_reservation *__get_pending(struct inode *inode, * * @inode - file containing the cluster * @lblk - logical block in the cluster to be added + * @prealloc - preallocated pending entry * - * Returns 0 on successful insertion and -ENOMEM on failure. If the + * Returns 1 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully. 
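__insert_pending() now takes a caller-preallocated node so that nothing needs __GFP_NOFAIL while i_es_lock is held: the first pass tries a cheap atomic allocation, and only a failure triggers a blocking preallocation outside the lock followed by a retry. The shape of that pattern outside the kernel (try_alloc_atomic()/alloc_blocking() stand in for __alloc_pending(false)/__alloc_pending(true)):

    #include <stdio.h>
    #include <stdlib.h>

    static void *try_alloc_atomic(void) { return NULL; }        /* may fail      */
    static void *alloc_blocking(void)   { return malloc(16); }  /* "cannot" fail */

    /* Insert one node, preferring the caller's preallocation; in the
     * kernel this runs under i_es_lock, where blocking is not allowed. */
    static int insert(void **prealloc)
    {
        void *node = *prealloc ? *prealloc : try_alloc_atomic();
        if (!node)
            return -1;              /* tell the caller to preallocate */
        *prealloc = NULL;
        free(node);                 /* stand-in for linking into the tree */
        return 0;
    }

    int main(void)
    {
        void *prealloc = NULL;

        while (insert(&prealloc))
            prealloc = alloc_blocking();  /* blocking alloc, lock dropped */
        free(prealloc);                   /* preallocated but unused */
        puts("inserted");
        return 0;
    }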
*/ -static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, + struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; @@ -1927,15 +2032,21 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) } } - pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); - if (pr == NULL) { - ret = -ENOMEM; - goto out; + if (likely(*prealloc == NULL)) { + pr = __alloc_pending(false); + if (!pr) { + ret = -ENOMEM; + goto out; + } + } else { + pr = *prealloc; + *prealloc = NULL; } pr->lclu = lclu; rb_link_node(&pr->rb_node, parent, p); rb_insert_color(&pr->rb_node, &tree->root); + ret = 1; out: return ret; @@ -1960,7 +2071,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) if (pr != NULL) { tree = &EXT4_I(inode)->i_pending_tree; rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); + __free_pending(pr); } } @@ -2006,34 +2117,47 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) } /* - * ext4_es_insert_delayed_block - adds a delayed block to the extents status - * tree, adding a pending reservation where - * needed + * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents + * status tree, adding a pending reservation + * where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added - * @allocated - indicates whether a physical cluster has been allocated for - * the logical cluster that contains the block + * @lblk - start logical block to be added + * @len - length of blocks to be added + * @lclu_allocated/end_allocated - indicates whether a physical cluster has + * been allocated for the logical cluster + * that contains the start/end block. 
Note that + * end_allocated should always be set to false + * if the start and the end block are in the + * same cluster */ -void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated) +void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status newes; - int err1 = 0; - int err2 = 0; + ext4_lblk_t end = lblk + len - 1; + int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; + struct pending_reservation *pr1 = NULL; + struct pending_reservation *pr2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", - lblk, inode->i_ino); + es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + if (!len) + return; + + WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) && + end_allocated); newes.es_lblk = lblk; - newes.es_len = 1; + newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); - trace_ext4_es_insert_delayed_block(inode, &newes, allocated); ext4_es_insert_extent_check(inode, &newes); @@ -2042,123 +2166,66 @@ retry: es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); + if (err1 || err2 || err3 < 0) { + if (lclu_allocated && !pr1) + pr1 = __alloc_pending(true); + if (end_allocated && !pr2) + pr2 = __alloc_pending(true); + } write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } - if (allocated) - __insert_pending(inode, lblk); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); + if (lclu_allocated) { + err3 = __insert_pending(inode, lblk, &pr1); + if (err3 < 0) + goto error; + if (pr1) { + __free_pending(pr1); + pr1 = NULL; + } + } + if (end_allocated) { + err3 = __insert_pending(inode, end, &pr2); + if (err3 < 0) + goto error; + if (pr2) { + __free_pending(pr2); + pr2 = NULL; + } + } + ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2) + if (err1 || err2 || err3 < 0) goto retry; + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, + end_allocated); ext4_es_print_tree(inode); ext4_print_pending_tree(inode); return; } /* - * __es_delayed_clu - count number of clusters containing blocks that - * are delayed only - * - * @inode - file containing block range - * @start - logical block defining start of range - * @end - logical block defining end of range - * - * Returns the number of clusters containing only delayed (not delayed - * and unwritten) blocks in the range specified by @start and @end. Any - * cluster or part of a cluster within the range and containing a delayed - * and not unwritten block within the range is counted as a whole cluster. 
- */ -static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) -{ - struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; - struct extent_status *es; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct rb_node *node; - ext4_lblk_t first_lclu, last_lclu; - unsigned long long last_counted_lclu; - unsigned int n = 0; - - /* guaranteed to be unequal to any ext4_lblk_t value */ - last_counted_lclu = ~0ULL; - - es = __es_tree_search(&tree->root, start); - - while (es && (es->es_lblk <= end)) { - if (ext4_es_is_delonly(es)) { - if (es->es_lblk <= start) - first_lclu = EXT4_B2C(sbi, start); - else - first_lclu = EXT4_B2C(sbi, es->es_lblk); - - if (ext4_es_end(es) >= end) - last_lclu = EXT4_B2C(sbi, end); - else - last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); - - if (first_lclu == last_counted_lclu) - n += last_lclu - first_lclu; - else - n += last_lclu - first_lclu + 1; - last_counted_lclu = last_lclu; - } - node = rb_next(&es->rb_node); - if (!node) - break; - es = rb_entry(node, struct extent_status, rb_node); - } - - return n; -} - -/* - * ext4_es_delayed_clu - count number of clusters containing blocks that - * are both delayed and unwritten - * - * @inode - file containing block range - * @lblk - logical block defining start of range - * @len - number of blocks in range - * - * Locking for external use of __es_delayed_clu(). - */ -unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_lblk_t end; - unsigned int n; - - if (len == 0) - return 0; - - end = lblk + len - 1; - WARN_ON(end < lblk); - - read_lock(&ei->i_es_lock); - - n = __es_delayed_clu(inode, lblk, end); - - read_unlock(&ei->i_es_lock); - - return n; -} - -/* * __revise_pending - makes, cancels, or leaves unchanged pending cluster * reservations for a specified block range depending * upon the presence or absence of delayed blocks @@ -2168,21 +2235,27 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, * @inode - file containing the range * @lblk - logical block defining the start of range * @len - length of range in blocks + * @prealloc - preallocated pending entry * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten - * status. Must be called while holding i_es_lock. + * status. Must be called while holding i_es_lock. Returns the number of + * pending cluster reservations that were newly inserted, 0 if pending + * reservations were only removed or left unchanged, or -ENOMEM on failure.
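The __es_delayed_clu() helper removed above relied on a sentinel, last_counted_lclu = ~0ULL, which can never equal a 32-bit cluster number, to avoid counting a cluster twice when two adjacent extents share it. The same counting trick isolated in plain C (sorted, non-overlapping, inclusive block ranges are assumed):

#include <stdint.h>
#include <stdio.h>

struct extent { uint32_t start, end; };        /* inclusive block range */

/* Count distinct clusters covered by a sorted list of extents. */
static unsigned int count_clusters(const struct extent *ex, int n,
                                   unsigned int cluster_bits)
{
        uint64_t last_counted = UINT64_MAX;     /* matches no 32-bit cluster */
        unsigned int count = 0;
        int i;

        for (i = 0; i < n; i++) {
                uint32_t first = ex[i].start >> cluster_bits;
                uint32_t last = ex[i].end >> cluster_bits;

                /* Don't recount a cluster shared with the previous extent. */
                if (first == last_counted)
                        count += last - first;
                else
                        count += last - first + 1;
                last_counted = last;
        }
        return count;
}

int main(void)
{
        /* Blocks 0..17 and 18..40 share cluster 1 (16 blocks/cluster). */
        struct extent ex[] = { { 0, 17 }, { 18, 40 } };

        printf("%u\n", count_clusters(ex, 2, 4));       /* prints 3 */
        return 0;
}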
*/ -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t end = lblk + len - 1; ext4_lblk_t first, last; bool f_del = false, l_del = false; + int pendings = 0; + int ret = 0; if (len == 0) - return; + return 0; /* * Two cases - block range within single cluster and block range @@ -2200,39 +2273,53 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) - f_del = __es_scan_range(inode, &ext4_es_is_delonly, + f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { - __insert_pending(inode, first); + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; + pendings += ret; } else { last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, - &ext4_es_is_delonly, + &ext4_es_is_delayed, end + 1, last); - if (l_del) - __insert_pending(inode, last); - else + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + pendings += ret; + } else __remove_pending(inode, last); } } else { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) - f_del = __es_scan_range(inode, &ext4_es_is_delonly, + f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); - if (f_del) - __insert_pending(inode, first); - else + if (f_del) { + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; + pendings += ret; + } else __remove_pending(inode, first); last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) - l_del = __es_scan_range(inode, &ext4_es_is_delonly, + l_del = __es_scan_range(inode, &ext4_es_is_delayed, end + 1, last); - if (l_del) - __insert_pending(inode, last); - else + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + pendings += ret; + } else __remove_pending(inode, last); } +out: + return (ret < 0) ? ret : pendings; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index d9847a4a25db..f3396cf32b44 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -42,6 +42,10 @@ enum { #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) +/* + * Besides EXTENT_STATUS_REFERENCED, all these extent type masks + * are exclusive, only one type can be set at a time. 
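The exclusivity promised by the new comment is what the ES_TYPE_VALID() macro added just below enforces: a valid type value has exactly one bit set, detected with the classic x & (x - 1) trick. In isolation (the mask names here are local stand-ins, not the ext4 definitions):

#include <assert.h>

#define T_WRITTEN       (1u << 0)
#define T_UNWRITTEN     (1u << 1)
#define T_DELAYED       (1u << 2)
#define T_HOLE          (1u << 3)

/* True iff exactly one type bit is set (the ES_TYPE_VALID() idea). */
#define ONE_TYPE(t)     ((t) && !((t) & ((t) - 1)))

int main(void)
{
        assert(ONE_TYPE(T_WRITTEN));
        assert(!ONE_TYPE(0));                           /* no type at all */
        assert(!ONE_TYPE(T_WRITTEN | T_DELAYED));       /* two types at once */
        return 0;
}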
+ */ #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) @@ -51,7 +55,9 @@ enum { #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ - EXTENT_STATUS_HOLE) << ES_SHIFT) + EXTENT_STATUS_HOLE)) + +#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1))) struct ext4_sb_info; struct ext4_extent; @@ -129,7 +135,8 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status); + unsigned int status, + bool delalloc_reserve_used); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); @@ -141,7 +148,7 @@ extern void ext4_es_find_extent_range(struct inode *inode, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, - struct extent_status *es); + struct extent_status *es, u64 *pseq); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); @@ -156,7 +163,7 @@ static inline unsigned int ext4_es_status(struct extent_status *es) static inline unsigned int ext4_es_type(struct extent_status *es) { - return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; + return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK; } static inline int ext4_es_is_written(struct extent_status *es) @@ -184,11 +191,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es) return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } -static inline int ext4_es_is_delonly(struct extent_status *es) -{ - return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); -} - static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; @@ -224,17 +226,12 @@ static inline void ext4_es_store_pblock(struct extent_status *es, es->es_pblk = block; } -static inline void ext4_es_store_status(struct extent_status *es, - unsigned int status) -{ - es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | - (es->es_pblk & ~ES_MASK); -} - static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { + WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } @@ -249,10 +246,9 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); -extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated); -extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); +extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b06de728b3b6..fa66b08de999 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -12,6 +12,7 @@ #include "ext4_extents.h" #include "mballoc.h" +#include <linux/lockdep.h> /* * Ext4 Fast Commits * 
----------------- @@ -49,19 +50,27 @@ * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * - * [1] Lock inodes for any further data updates by setting COMMITTING state - * [2] Submit data buffers of all the inodes - * [3] Wait for [2] to complete - * [4] Commit all the directory entry updates in the fast commit space - * [5] Commit all the changed inode structures - * [6] Write tail tag (this tag ensures the atomicity, please read the following + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that an inode cannot be + * deleted while it is being flushed. + * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" + * state. + * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that + * all the existing handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit + * by setting "EXT4_STATE_FC_COMMITTING" state. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows + * starting of new handles. If new handles try to start an update on + * any of the inodes that are being committed, ext4_fc_track_inode() + * will block until those inodes have finished the fast commit. + * [6] Commit all the directory entry updates in the fast commit space. + * [7] Commit all the changed inodes in the fast commit space and clear + * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). - * [7] Wait for [4], [5] and [6] to complete. * - * All the inode updates must call ext4_fc_start_update() before starting an - * update. If such an ongoing update is present, fast commit waits for it to - * complete. The completion of such an update is marked by - * ext4_fc_stop_update(). + * All the inode updates must be enclosed within jbd2_journal_start() + * and jbd2_journal_stop(), similar to JBD2 journaling. * * Fast Commit Ineligibility * ------------------------- @@ -142,6 +151,13 @@ * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * + * Locking + * ------- + * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit + * dentry queue. ei->i_fc_lock protects the fast commit related info in a given + * inode. Most of the code avoids acquiring both the locks, but if one must do + * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. + * * TODOs * ----- * @@ -156,13 +172,12 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Fast commit's commit path locks the entire file system during fast - * commit. This has significant performance penalty. Instead of that, we - * should use ext4_fc_start/stop_update functions to start inode level - * updates from ext4_journal_start/stop. Once we do that we can drop file - * system locking during commit path. + * 1) Handle more ineligible cases. - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to look up the logical to physical mapping using + * the extent status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory.
*/ #include <trace/events/ext4.h> @@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); -} - -/* This function must be called with sbi->s_fc_lock held. */ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) -{ - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) @@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb) } /* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); -} - -/* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ @@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. 
However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + mutex_unlock(&sbi->s_fc_lock); + schedule(); + mutex_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -320,14 +298,10 @@ restart: list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -339,23 +313,28 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; + bool has_transaction = true; + bool is_ineligible; if (ext4_fc_disabled(sb)) return; - ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (handle && !IS_ERR(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); - tid = sbi->s_journal->j_running_transaction ? 
- sbi->s_journal->j_running_transaction->t_tid : 0; + if (sbi->s_journal->j_running_transaction) + tid = sbi->s_journal->j_running_transaction->t_tid; + else + has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - spin_lock(&sbi->s_fc_lock); - if (sbi->s_fc_ineligible_tid < tid) + mutex_lock(&sbi->s_fc_lock); + is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; - spin_unlock(&sbi->s_fc_lock); + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -372,7 +351,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl */ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, - int (*__fc_track_fn)(struct inode *, void *, bool), + int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), void *args, int enqueue) { bool update = false; @@ -382,27 +361,26 @@ static int ext4_fc_track_template( int ret; tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } - ret = __fc_track_fn(inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + ret = __fc_track_fn(handle, inode, args, update); + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -413,7 +391,8 @@ struct __track_dentry_update_args { }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. 
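The tid_gt() introduced above (and tid_geq() later in this patch) replaces plain integer comparison because transaction IDs are 32-bit counters that eventually wrap around. The jbd2 idiom is a signed-difference comparison; a standalone model, assuming two's-complement int32_t:

#include <assert.h>
#include <stdint.h>

typedef uint32_t tid_t;

/* Wraparound-safe "a is after b", same idea as jbd2's tid_gt()/tid_geq(). */
static int tid_gt(tid_t a, tid_t b)
{
        return (int32_t)(a - b) > 0;
}

int main(void)
{
        assert(tid_gt(5, 3));
        /* Still correct just after the 32-bit counter wraps. */
        assert(tid_gt(2, 0xfffffff0u));
        assert(!tid_gt(0xfffffff0u, 2));
        return 0;
}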
*/ -static int __track_dentry_update(struct inode *inode, void *arg, bool update) +static int __track_dentry_update(handle_t *handle, struct inode *inode, + void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); @@ -424,43 +403,29 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, - NULL); - mutex_lock(&ei->i_fc_lock); + handle); + spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); - mutex_lock(&ei->i_fc_lock); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; - if (dentry->d_name.len > DNAME_INLINE_LEN) { - node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); - if (!node->fcd_name.name) { - kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); - mutex_lock(&ei->i_fc_lock); - return -ENOMEM; - } - memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, - dentry->d_name.len); - } else { - memcpy(node->fcd_iname, dentry->d_name.name, - dentry->d_name.len); - node->fcd_name.name = node->fcd_iname; - } - node->fcd_name.len = dentry->d_name.len; + take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); - spin_lock(&sbi->s_fc_lock); + INIT_LIST_HEAD(&node->fcd_list); + mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -481,8 +446,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + mutex_unlock(&sbi->s_fc_lock); + spin_lock(&ei->i_fc_lock); return 0; } @@ -569,7 +534,8 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) } /* __track_fn for inode tracking */ -static int __track_inode(struct inode *inode, void *arg, bool update) +static int __track_inode(handle_t *handle, struct inode *inode, void *arg, + bool update) { if (update) return -EEXIST; @@ -581,6 +547,8 @@ static int __track_inode(struct inode *inode, void *arg, bool update) void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -598,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. 
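take_dentry_name_snapshot()/release_dentry_name_snapshot() replace the hand-rolled copy that this hunk deletes, which kept short names in an inline buffer and fell back to kmalloc() for long ones. That small-string pattern, reduced to standalone C (illustrative only, not the VFS name_snapshot implementation):

#include <stdlib.h>
#include <string.h>

#define INLINE_LEN 32           /* stand-in for DNAME_INLINE_LEN */

struct name_copy {
        char *name;             /* points at iname or at the heap copy */
        char iname[INLINE_LEN];
};

static int name_copy_init(struct name_copy *nc, const char *src)
{
        size_t len = strlen(src) + 1;

        if (len <= INLINE_LEN) {
                memcpy(nc->iname, src, len);    /* short name: no allocation */
                nc->name = nc->iname;
        } else {
                nc->name = malloc(len);
                if (!nc->name)
                        return -1;
                memcpy(nc->name, src, len);
        }
        return 0;
}

static void name_copy_release(struct name_copy *nc)
{
        if (nc->name != nc->iname)              /* only free heap copies */
                free(nc->name);
        nc->name = NULL;
}

Moving to the shared VFS helper removes this duplicated bookkeeping (and the matching conditional kfree on every cleanup path) from fast commit itself.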
+ */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. + */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } @@ -607,7 +604,8 @@ struct __track_range_args { }; /* __track_fn for tracking data updates */ -static int __track_range(struct inode *inode, void *arg, bool update) +static int __track_range(handle_t *handle, struct inode *inode, void *arg, + bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; @@ -649,6 +647,12 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + if (ext4_has_inline_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, + handle); + return; + } + args.start = start; args.end = end; @@ -659,7 +663,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { - blk_opf_t write_flags = REQ_SYNC; + blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ @@ -730,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); - *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -777,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); - crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); @@ -818,7 +822,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; - int dlen = fc_dentry->fcd_name.len; + int dlen = fc_dentry->fcd_name.name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); @@ -833,7 +837,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - memcpy(dst, fc_dentry->fcd_name.name, dlen); + memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } @@ -896,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); 
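The open-coded DEFINE_WAIT_BIT/prepare_to_wait/schedule loop above, together with the matching wake_up_bit() plus smp_mb() later in this patch, is the kernel's wait-on-flag idiom: arm the waiter first, then re-check the flag, so a concurrent clear can never be missed. A rough userspace equivalent, in which the mutex provides the ordering that smp_mb() supplies in the patch:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool committing;

/* Waiter side (the ext4_fc_track_inode() loop above). */
static void wait_not_committing(void)
{
        pthread_mutex_lock(&lock);
        while (committing)                      /* re-check after every wakeup */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

/* Waker side (the ext4_fc_cleanup() wake_up_bit() later in the patch). */
static void clear_committing(void)
{
        pthread_mutex_lock(&lock);
        committing = false;                     /* clear before waking ... */
        pthread_cond_broadcast(&cond);          /* ... lock orders the two */
        pthread_mutex_unlock(&lock);
}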
return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", @@ -913,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; @@ -957,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. */ +static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; - spin_unlock(&sbi->s_fc_lock); - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1033,26 +1001,22 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - spin_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the - * corresponding inode pointer + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. 
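ext4_fc_flush_data() above is deliberately two passes: start every flush first so the I/O can overlap at the device, and only then wait for completions, rather than a submit-wait-submit-wait loop. The same shape in generic form (pthreads standing in for jbd2_submit_inode_data()/jbd2_wait_inode_data()):

#include <pthread.h>

#define NR_INODES 4

static void *flush_one(void *arg)
{
        /* stand-in for one inode's data writeback */
        (void)arg;
        return NULL;
}

static int flush_all(void)
{
        pthread_t tid[NR_INODES];
        int i, started;

        /* Pass 1: kick off every flush so the work overlaps. */
        for (i = 0; i < NR_INODES; i++)
                if (pthread_create(&tid[i], NULL, flush_one, NULL))
                        break;
        started = i;

        /* Pass 2: only now block, once per inode. */
        for (i = 0; i < started; i++)
                pthread_join(tid[i], NULL);

        return started == NR_INODES ? 0 : -1;
}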
*/ - WARN_ON(list_empty(&fc_dentry->fcd_dilist)); + if (list_empty(&fc_dentry->fcd_dilist)) + continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - spin_unlock(&sbi->s_fc_lock); - /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first @@ -1062,23 +1026,14 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - spin_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - spin_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) @@ -1092,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + mutex_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + mutex_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); + /* + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + mutex_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. 
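Step 4 above leans on jbd2_journal_lock_updates() as a drain barrier: it waits for every open handle to finish and holds off new ones until the matching unlock. Functionally that is a writer acquiring a reader-writer lock against many handle-holders; a minimal model of the relationship:

#include <pthread.h>

static pthread_rwlock_t journal = PTHREAD_RWLOCK_INITIALIZER;

/* Each filesystem modification runs under a "handle" (reader side). */
static void with_handle(void (*update)(void))
{
        pthread_rwlock_rdlock(&journal);        /* jbd2_journal_start() analogue */
        update();
        pthread_rwlock_unlock(&journal);        /* jbd2_journal_stop() analogue */
}

/* The commit path drains all handles before marking inodes. */
static void mark_committing(void (*mark)(void))
{
        pthread_rwlock_wrlock(&journal);        /* jbd2_journal_lock_updates() */
        mark();                                 /* no handle can run here */
        pthread_rwlock_unlock(&journal);        /* jbd2_journal_unlock_updates() */
}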
*/ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1123,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal) } } - spin_lock(&sbi->s_fc_lock); + /* Step 6.2: Now write all the dentry updates. */ + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - spin_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } @@ -1194,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); @@ -1201,13 +1210,14 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); + old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && - commit_tid > journal->j_commit_sequence) + tid_gt(commit_tid, journal->j_commit_sequence)) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); @@ -1231,6 +1241,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. 
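The ioprio handling added around the commit is a plain save/override/restore bracket; note that the patch restores old_ioprio on both the normal path and the fallback path. Sketched with hypothetical stubs (get_prio()/set_prio()/fast_commit()/full_commit() are not kernel APIs, just placeholders for the calls in this hunk):

static int current_prio = 4;                    /* stand-in task state */

static int get_prio(void) { return current_prio; }
static void set_prio(int p) { current_prio = p; }

static int fast_commit(void) { return 0; }      /* stub for the real work */
static int full_commit(void) { return 0; }      /* stub fallback */

static int do_commit(int journal_prio)
{
        int old = get_prio();
        int ret;

        set_prio(journal_prio);         /* run at the journal thread's priority */
        ret = fast_commit();
        if (ret < 0) {
                set_prio(old);          /* restore on the fallback path too */
                return full_commit();
        }
        set_prio(old);                  /* ... and on the normal path */
        return ret;
}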
+ */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1245,6 +1264,7 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time @@ -1254,6 +1274,7 @@ restart_fc: return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; @@ -1267,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) @@ -1276,20 +1297,39 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (iter->i_sync_tid <= tid) - ext4_fc_reset_inode(&iter->vfs_inode); - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); + } else if (full) { + /* + * We are called after a full commit; the inode has been + * modified while the commit was running. Re-enqueue + * the inode into STAGING, which will then be spliced + * back into MAIN. This cannot happen during + * a fast commit because the journal is locked all the + * time in that case (and the tid doesn't increase, so the + * tid check above isn't reliable). + */ + list_add_tail(&ei->i_fc_list, + &sbi->s_fc_q[FC_Q_STAGING]); + } + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } @@ -1299,13 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - spin_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1313,14 +1349,14 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); - if (tid >= sbi->s_fc_ineligible_tid) { + if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } @@ -1766,7 +1802,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ret == 0) { /* Range is not mapped */ - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); @@ -1777,11 +1813,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_insert_extent( - NULL, inode, &path, &newex, 0); + path = ext4_ext_insert_extent(NULL, inode, + path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); - ext4_free_ext_path(path); - if (ret) + if (IS_ERR(path)) goto out; goto next; } @@ -1806,7 +1841,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, * at the end of the FC replay using our array of * modified inodes. */ - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } @@ -1830,6 +1865,7 @@ next: ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: + ext4_free_ext_path(path); iput(inode); return 0; } @@ -1875,7 +1911,7 @@ ext4_fc_replay_del_range(struct super_block *sb, if (ret > 0) { remaining -= ret; cur += ret; - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; @@ -1930,22 +1966,25 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) break; if (ret > 0) { - path = ext4_find_extent(inode, map.m_lblk, NULL, 0); + path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 1); - ext4_free_ext_path(path); + path[j].p_block, 1, true); + } else { + path = NULL; } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, - map.m_len, 1); + map.m_len, true); } else { cur = cur + (map.m_len ? 
map.m_len : 1); } } iput(inode); } + + ext4_free_ext_path(path); } /* @@ -2094,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); @@ -2127,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 2fadb2c4780c..3bd534e4dbbf 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -109,8 +109,7 @@ struct ext4_fc_dentry_update { int fcd_op; /* Type of update create / unlink / link */ int fcd_parent; /* Parent inode number */ int fcd_ino; /* Inode number */ - struct qstr fcd_name; /* Dirent name */ - unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct name_snapshot fcd_name; /* Dirent name */ struct list_head fcd_list; struct list_head fcd_dilist; }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2dc3f8301225..7a8b30932189 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -131,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) @@ -153,7 +153,7 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, { struct inode *inode = file_inode(in); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) @@ -306,80 +306,38 @@ out: } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, - ssize_t written, size_t count) + ssize_t written, ssize_t count) { handle_t *handle; - bool truncate = false; - u8 blkbits = inode->i_blkbits; - ext4_lblk_t written_blk, end_blk; - int ret; - - /* - * Note that EXT4_I(inode)->i_disksize can get extended up to - * inode->i_size while the I/O was running due to writeback of delalloc - * blocks. But, the code in ext4_iomap_alloc() is careful to use - * zeroed/unwritten extents if this is possible; thus we won't leave - * uninitialized blocks in a file even if we didn't succeed in writing - * as much as we intended. - */ - WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); - if (offset + count <= EXT4_I(inode)->i_disksize) { - /* - * We need to ensure that the inode is removed from the orphan - * list if it has been added prematurely, due to writeback of - * delalloc blocks. 
- */ - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - return PTR_ERR(handle); - } - - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - } - - return written; - } - - if (written < 0) - goto truncate; + lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - written = PTR_ERR(handle); - goto truncate; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); if (ext4_update_inode_size(inode, offset + written)) { - ret = ext4_mark_inode_dirty(handle, inode); + int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { - written = ret; ext4_journal_stop(handle); - goto truncate; + return ret; } } - /* - * We may need to truncate allocated but not written blocks beyond EOF. - */ - written_blk = ALIGN(offset + written, 1 << blkbits); - end_blk = ALIGN(offset + count, 1 << blkbits); - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - - /* - * Remove the inode from the orphan list if it has been extended and - * everything went OK. - */ - if (!truncate && inode->i_nlink) + if ((written == count) && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); - if (truncate) { -truncate: + return written; +} + +/* + * Clean up the inode after DIO or DAX extending write has completed and the + * inode size has been updated using ext4_handle_inode_extension(). + */ +static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) +{ + lockdep_assert_held_write(&inode->i_rwsem); + if (need_trunc) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may @@ -388,9 +346,29 @@ truncate: */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); + return; } + /* + * If i_disksize got extended either due to writeback of delalloc + * blocks or extending truncate while the DIO was running we could fail + * to cleanup the orphan list in ext4_handle_inode_extension(). Do it + * now. + */ + if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { + handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - return written; + if (IS_ERR(handle)) { + /* + * The write has successfully completed. Not much to + * do with the error here so just cleanup the orphan + * list and hope for the best. + */ + ext4_orphan_del(NULL, inode); + return; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, @@ -399,31 +377,29 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - if (error) - return error; - if (size && flags & IOMAP_DIO_UNWRITTEN) { + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); - if (error < 0) - return error; - } + if (error) + return error; /* - * If we are extending the file, we have to update i_size here before - * page cache gets invalidated in iomap_dio_rw(). Otherwise racing - * buffered reads could zero out too much from page cache pages. 
Update - * of on-disk size will happen later in ext4_dio_write_iter() where - * we have enough information to also perform orphan list handling etc. - * Note that we perform all extending writes synchronously under - * i_rwsem held exclusively so i_size update is safe here in that case. - * If the write was not extending, we cannot see pos > i_size here - * because operations reducing i_size like truncate wait for all - * outstanding DIO before updating i_size. + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. Also we can race with truncate or write + * expanding the file so we have to be a bit careful here. */ - pos += size; - if (pos > i_size_read(inode)) - i_size_write(inode, pos); - - return 0; + if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && + pos + size <= i_size_read(inode)) + return 0; + error = ext4_handle_inode_extension(inode, pos, size, size); + return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { @@ -476,6 +452,11 @@ restart: * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || @@ -492,21 +473,12 @@ restart: /* * Now that locking is settled, determine dio flags and exclusivity - * requirements. Unaligned writes are allowed under shared lock so long - * as they are pure overwrites. Set the iomap overwrite only flag as an - * added precaution in this case. Even though this is unnecessary, we - * can detect and warn on unexpected -EAGAIN if an unsafe unaligned - * write is ever submitted. - * - * Otherwise, concurrent unaligned writes risk data corruption due to - * partial block zeroing in the dio layer, and so the I/O must occur - * exclusively. The inode lock is already held exclusive if the write is - * non-overwrite or extending, so drain all outstanding dio and set the - * force wait dio flag. + * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce + * behavior already. The inode lock is already held exclusive if the + * write is non-overwrite or extending, so drain all outstanding dio and + * set the force wait dio flag. */ - if (*ilock_shared && unaligned_io) { - *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; - } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -573,18 +545,20 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_buffered_write_iter(iocb, from); } + /* + * Prevent inline data from being created since we are going to allocate + * blocks for DIO. 
We know the inode does not currently have inline data + * because ext4_should_use_dio() checked for it, but we have to clear + * the state flag before the write checks because a lock cycle could + * introduce races with other writers. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags); if (ret <= 0) return ret; - /* - * Make sure inline data cannot be created anymore since we are going - * to allocate blocks for DIO. We know the inode does not have any - * inline data now because ext4_dio_supported() checked for that. - */ - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - offset = iocb->ki_pos; count = ret; @@ -596,24 +570,27 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - ext4_journal_stop(handle); + if (ret) + goto out; } if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); - WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; - - if (extend) - ret = ext4_handle_inode_extension(inode, offset, ret, count); + if (extend) { + /* + * We always perform extending DIO write synchronously so by + * now the IO is completed and ext4_handle_inode_extension() + * was called. Cleanup the inode in case of error or race with + * writeback of delalloc blocks. + */ + WARN_ON_ONCE(ret == -EIOCBQUEUED); + ext4_inode_extension_cleanup(inode, ret < 0); + } out: if (ilock_shared) @@ -625,6 +602,13 @@ out: ssize_t err; loff_t endbyte; + /* + * There is no support for atomic writes on buffered-io yet, + * we should never fallback to buffered-io for DIO atomic + * writes. 
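+		 *
+		 * e.g. a well-formed atomic write is issued as
+		 *
+		 *	pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
+		 *
+		 * on an O_DIRECT fd; generic_atomic_write_valid() rejects
+		 * buffered requests long before this point, so reaching
+		 * this fallback with IOCB_ATOMIC set would be a bug.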
+ */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) @@ -694,8 +678,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); - if (extend) + if (extend) { ret = ext4_handle_inode_extension(inode, offset, ret, count); + ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); + } out: inode_unlock(inode); if (ret > 0) @@ -707,15 +693,30 @@ out: static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { + int ret; struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else @@ -746,7 +747,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; - pfn_t pfn; + unsigned long pfn; if (write) { sb_start_pagefault(sb); @@ -803,28 +804,33 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .page_mkwrite = ext4_page_mkwrite, }; -static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +static int ext4_file_mmap_prepare(struct vm_area_desc *desc) { + int ret; + struct file *file = desc->file; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct dax_device *dax_dev = sbi->s_daxdev; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(sbi))) - return -EIO; + if (file->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. 
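 	 *
 	 * e.g. an application requesting a synchronous mapping with
 	 *
 	 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
 	 *	     MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
 	 *
 	 * must get EOPNOTSUPP unless the check below succeeds.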
*/ - if (!daxdev_mapping_supported(vma, dax_dev)) + if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { - vma->vm_ops = &ext4_dax_vm_ops; - vm_flags_set(vma, VM_HUGEPAGE); + desc->vm_ops = &ext4_dax_vm_ops; + desc->vm_flags |= VM_HUGEPAGE; } else { - vma->vm_ops = &ext4_file_vm_ops; + desc->vm_ops = &ext4_file_vm_ops; } return 0; } @@ -841,7 +847,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; - if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); @@ -869,8 +876,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (err) goto out_journal; lock_buffer(sbi->s_sbh); - strncpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); @@ -885,8 +891,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + if (filp->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) @@ -910,8 +920,10 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | - FMODE_DIO_PARALLEL_WRITE; + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } @@ -923,12 +935,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; + loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: @@ -962,8 +969,7 @@ const struct file_operations ext4_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = ext4_file_mmap, - .mmap_supported_flags = MAP_SYNC, + .mmap_prepare = ext4_file_mmap_prepare, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -971,6 +977,9 @@ const struct file_operations ext4_file_operations = { .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | + FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index cdf9bfe10137..22fc333244ef 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -74,7 +74,8 @@ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { - return rec->fmr_physical < info->gfi_low.fmr_physical; + return rec->fmr_physical + rec->fmr_length <= + info->gfi_low.fmr_physical; } /* @@ -185,6 +186,59 @@ static inline ext4_fsblk_t 
ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) return fmr->fmr_physical + fmr->fmr_length; } +static int ext4_getfsmap_meta_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb, fs_start, fs_end; + int error; + + fs_start = fsb = (EXT4_C2B(sbi, start) + + ext4_group_first_block_no(sb, agno)); + fs_end = fs_start + EXT4_C2B(sbi, len); + + /* + * Return relevant extents from the meta_list. We emit all extents that + * partially/fully overlap with the query range + */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + continue; + } + if (p->fmr_physical <= fs_end && + p->fmr_physical + p->fmr_length > fs_start) { + /* Emit the retained free extent record if present */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, + &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + fsb = p->fmr_physical + p->fmr_length; + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + list_del(&p->fmr_list); + kfree(p); + continue; + } + } + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + + return 0; +} + + /* Transform a blockgroup's free record into a fsmap */ static int ext4_getfsmap_datadev_helper(struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, @@ -343,6 +397,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, /* Reserved GDT blocks */ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + + /* + * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases, + * check for that. + */ + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_RESV_GDT); if (error) @@ -476,6 +538,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_group_t end_ag; ext4_grpblk_t first_cluster; ext4_grpblk_t last_cluster; + struct ext4_fsmap irec; int error = 0; bofs = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -539,6 +602,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, error = ext4_mballoc_query_range(sb, info->gfi_agno, EXT4_B2C(sbi, info->gfi_low.fmr_physical), EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_meta_helper, ext4_getfsmap_datadev_helper, info); if (error) goto err; @@ -558,9 +622,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb, goto err; } - /* Report any gaps at the end of the bg */ + /* + * The dummy record below will cause ext4_getfsmap_helper() to report + * any allocated blocks at the end of the range. 
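+	 *
+	 * e.g. for a query range ending at physical block 1000, the
+	 * sentinel is { .fmr_physical = 1001, .fmr_length = 0 }: it
+	 * sorts after every real extent in the range and so flushes out
+	 * any retained record, while its zero length keeps it from
+	 * being reported itself.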
+ */ + irec.fmr_device = 0; + irec.fmr_physical = end_fsb + 1; + irec.fmr_length = 0; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + info->gfi_last = true; - error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); + error = ext4_getfsmap_helper(sb, info, &irec); if (error) goto err; @@ -576,8 +649,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->s_journal_bdev && - fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev)) + if (EXT4_SB(sb)->s_journal_bdev_file && + fm->fmr_device == + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) return true; return false; } @@ -647,9 +721,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev) { + if (EXT4_SB(sb)->s_journal_bdev_file) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev->bd_dev); + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0c56f3a011a1..e476c6de3074 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -131,24 +131,19 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(sbi))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); - if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_mount_flags value */ - smp_rmb(); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) - ret = -EROFS; + if (sb_rdonly(inode->i_sb)) goto out; - } - if (!sbi->s_journal) { + if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 46c3423ddfa1..48483cd015d3 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -268,7 +268,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, combined_hash = fscrypt_fname_siphash(dir, &qname); } else { ext4_warning_inode(dir, "Siphash requires key"); - return -1; + return -EINVAL; } hash = (__u32)(combined_hash >> 32); @@ -300,9 +300,9 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um && + if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { - buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + buff = kzalloc(PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 48abef5f23e7..b20a1bf866ab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -87,10 +87,10 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; - grp = ext4_get_group_info(sb, block_group); - if (buffer_verified(bh)) return 0; + + grp = ext4_get_group_info(sb, block_group); if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) 
return -EFSCORRUPTED; @@ -98,8 +98,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); - if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8) || + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " @@ -194,8 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); + ext4_read_bh(bh, REQ_META | REQ_PRIO, + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO)); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + icount_read(inode)); return; } if (inode->i_nlink) { @@ -327,8 +327,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } - ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -514,6 +513,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, if (min_inodes < 1) min_inodes = 1; min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; + if (min_clusters < 0) + min_clusters = 0; /* * Start looking in the flex group where we last allocated an @@ -690,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it - * must have been written out. + * must have been written out, or, most unlikely, is + * being migrated - false failure should be OK here. 
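+	 * recently_deleted() only steers the inode allocator away from
+	 * inodes freed a moment ago, so a wrong answer here costs a
+	 * little allocation quality, never correctness.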
*/ goto out; @@ -755,10 +757,10 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) struct ext4_group_desc *gdp; ext4_group_t group; int bit; - int err = -EFSCORRUPTED; + int err; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) - goto out; + return -EFSCORRUPTED; group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); @@ -772,7 +774,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); - if (!gdp || !group_desc_bh) { + if (!gdp) { err = -EINVAL; goto out; } @@ -851,8 +853,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } @@ -860,6 +861,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); sync_dirty_buffer(group_desc_bh); out: + brelse(inode_bitmap_bh); return err; } @@ -950,8 +952,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) - return ERR_PTR(-EIO); + ret2 = ext4_emergency_state(sb); + if (unlikely(ret2)) + return ERR_PTR(ret2); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -1053,14 +1056,14 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ - if (((!(sbi->s_mount_state & EXT4_FC_REPLAY)) - && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || - IS_ERR(inode_bitmap_bh)) { + if (IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; } + if (!(sbi->s_mount_state & EXT4_FC_REPLAY) && + EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; -repeat_in_this_group: ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; @@ -1110,8 +1113,6 @@ repeat_in_this_group: if (!ret2) goto got; /* we grabbed the inode! 
*/ - if (ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; next_group: if (++group == ngroups) group = 0; @@ -1224,8 +1225,7 @@ got: } } if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -1250,8 +1250,8 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - ei->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + ei->i_crtime = inode_get_mtime(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1284,23 +1284,21 @@ got: inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && - (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); @@ -1336,10 +1334,9 @@ got: } } - if (ext4_handle_valid(handle)) { - ei->i_sync_tid = handle->h_transaction->t_tid; - ei->i_datasync_tid = handle->h_transaction->t_tid; - } + ext4_set_inode_mapping_order(inode); + + ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -1523,12 +1520,6 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; - /* This should not happen, but just to be sure check this */ - if (sb_rdonly(sb)) { - ret = 1; - goto out; - } - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a9f3716119d3..da76353b3a57 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -170,7 +170,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, } if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { put_bh(bh); goto failure; } @@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, int indirect_blks; int blocks_to_boundary = 0; int depth; - int count = 0; + u64 count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); @@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, count++; /* Fill in size of a hole we found */ map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, count); + map->m_len = umin(map->m_len, count); goto cleanup; } @@ -652,13 +652,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ext4_update_inode_fsync_trans(handle, inode, 1); count = 
ar.len; - /* - * Update reserved blocks/metadata blocks after successful block - * allocation which had been deferred till now. - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, count, 1); - got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); @@ -714,7 +707,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -1032,7 +1025,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } /* Go read the buffer for the next level down */ - bh = ext4_sb_bread(inode->i_sb, nr, 0); + bh = ext4_sb_bread_nofail(inode->i_sb, nr); /* * A read failure? Report error and clear slot diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 003861037374..1f6bc05593df 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -20,6 +20,11 @@ #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + void **fsdata); + static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) @@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); @@ -298,7 +303,11 @@ static int ext4_create_inline_data(handle_t *handle, if (error) goto out; - BUG_ON(!is.s.not_found); + if (!is.s.not_found) { + EXT4_ERROR_INODE(inode, "unexpected inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { @@ -349,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, if (error) goto out; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, "missing inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); @@ -392,7 +405,7 @@ out: } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) + loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); @@ -405,7 +418,12 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); - + /* + * ei->i_inline_size may have changed since the initial check + * if other xattrs were added. Recalculate to ensure + * ext4_update_inline_data() validates against current capacity. 
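+	 *
+	 * For example: if the initial check saw 60 bytes of inline
+	 * capacity and another xattr added meanwhile shrank it to 40, a
+	 * 50-byte write would pass the stale check; refreshing
+	 * i_inline_off/i_inline_size lets the update path catch it.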
+ */ + (void) ext4_find_inline_data_nolock(inode); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else @@ -433,9 +451,13 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (!ei->i_inline_off) return 0; + down_write(&ei->i_data_sem); + error = ext4_get_inode_loc(inode, &is.iloc); - if (error) + if (error) { + up_write(&ei->i_data_sem); return error; + } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) @@ -474,6 +496,7 @@ out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; + up_write(&ei->i_data_sem); return error; } @@ -502,9 +525,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_folio(folio); + kaddr = folio_zero_tail(folio, len, kaddr + len); kunmap_local(kaddr); - folio_zero_segment(folio, len, folio_size(folio)); folio_mark_uptodate(folio); brelse(iloc.bh); @@ -558,7 +580,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -597,15 +619,18 @@ retry: goto out; } + ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(&folio->page, from, to, - ext4_get_block_unwritten); + ret = ext4_block_write_begin(handle, folio, from, to, + ext4_get_block_unwritten); } else - ret = __block_write_begin(&folio->page, from, to, ext4_get_block); + ret = ext4_block_write_begin(handle, folio, from, to, + ext4_get_block); + clear_buffer_new(folio_buffers(folio)); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -637,7 +662,7 @@ retry: goto retry; if (folio) - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); out: if (folio) { folio_unlock(folio); @@ -653,91 +678,109 @@ out_nofolio: } /* - * Try to write data in the inode. - * If the inode has inline data, check whether the new write can be - * in the inode also. If not, create the page the handle, move the data - * to the page make it update and let the later codes create extent for it. + * Prepare the write for the inline data. + * If the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, makes it dirty so that it can be + * handle in writepages(the i_disksize update is left to the + * normal ext4_da_write_end). */ -int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct page **pagep) +int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; - - if (pos + len > ext4_get_max_inline_size(inode)) - goto convert; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; - /* - * The possible write could happen in the inode, - * so try to reserve the space in inode first. 
- */ +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; - goto out; + goto out_release_bh; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_stop_journal; - /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); - brelse(iloc.bh); - goto convert; - } + if (!da) { + brelse(iloc.bh); + /* Retry inside */ + return ext4_convert_inline_data_to_extent(mapping, inode); + } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out; + ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + goto out_release_bh; + } folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); - goto out; + goto out_stop_journal; } - *pagep = &folio->page; down_read(&EXT4_I(inode)->xattr_sem); + /* Someone else had converted it to extent */ if (!ext4_has_inline_data(inode)) { ret = 0; - folio_unlock(folio); - folio_put(folio); - goto out_up_read; + goto out_release_folio; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) { - folio_unlock(folio); - folio_put(folio); - goto out_up_read; - } + if (ret < 0) + goto out_release_folio; } - ret = 1; - handle = NULL; -out_up_read: + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); + if (ret) + goto out_release_folio; + *foliop = folio; up_read(&EXT4_I(inode)->xattr_sem); -out: - if (handle && (ret != 1)) - ext4_journal_stop(handle); + brelse(iloc.bh); + return 1; + +out_release_folio: + up_read(&EXT4_I(inode)->xattr_sem); + folio_unlock(folio); + folio_put(folio); +out_stop_journal: + ext4_journal_stop(handle); +out_release_bh: brelse(iloc.bh); return ret; -convert: - return ext4_convert_inline_data_to_extent(mapping, inode); +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. + */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop) +{ + if (pos + len > ext4_get_max_inline_size(inode)) + return ext4_convert_inline_data_to_extent(mapping, inode); + return ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, NULL, false); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, @@ -857,8 +900,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = __block_write_begin(&folio->page, 0, inline_size, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(NULL, folio, 0, inline_size, + ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); @@ -867,6 +910,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, return ret; } + clear_buffer_new(folio_buffers(folio)); folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); @@ -881,94 +925,6 @@ out: return ret; } -/* - * Prepare the write for the inline data. 
- * If the data can be written into the inode, we just read - * the page and make it uptodate, and start the journal. - * Otherwise read the page, makes it dirty so that it can be - * handle in writepages(the i_disksize update is left to the - * normal ext4_da_write_end). - */ -int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct page **pagep, - void **fsdata) -{ - int ret; - handle_t *handle; - struct folio *folio; - struct ext4_iloc iloc; - int retries = 0; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - - if (ret == -ENOSPC) { - ext4_journal_stop(handle); - ret = ext4_da_convert_inline_data_to_extent(mapping, - inode, - fsdata); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - goto out; - } - - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - goto out_journal; - } - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out_release_page; - } - - if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) - goto out_release_page; - } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out_release_page; - - up_read(&EXT4_I(inode)->xattr_sem); - *pagep = &folio->page; - brelse(iloc.bh); - return 1; -out_release_page: - up_read(&EXT4_I(inode)->xattr_sem); - folio_unlock(folio); - folio_put(folio); -out_journal: - ext4_journal_stop(handle); -out: - brelse(iloc.bh); - return ret; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) @@ -1012,7 +968,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + err = ext4_find_dest_de(dir, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; @@ -1037,7 +993,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; @@ -1059,7 +1015,7 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode, } /* Set the final de to cover the whole block. 
*/ -static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; @@ -1123,51 +1079,6 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } -static int ext4_finish_convert_inline_dir(handle_t *handle, - struct inode *inode, - struct buffer_head *dir_block, - void *buf, - int inline_size) -{ - int err, csum_size = 0, header_size = 0; - struct ext4_dir_entry_2 *de; - void *target = dir_block->b_data; - - /* - * First create "." and ".." and then copy the dir information - * back to the block. - */ - de = target; - de = ext4_init_dot_dotdot(inode, de, - inode->i_sb->s_blocksize, csum_size, - le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); - header_size = (void *)de - target; - - memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, - inline_size - EXT4_INLINE_DOTDOT_SIZE); - - if (ext4_has_metadata_csum(inode->i_sb)) - csum_size = sizeof(struct ext4_dir_entry_tail); - - inode->i_size = inode->i_sb->s_blocksize; - i_size_write(inode, inode->i_sb->s_blocksize); - EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - ext4_update_final_de(dir_block->b_data, - inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, - inode->i_sb->s_blocksize - csum_size); - - if (csum_size) - ext4_initialize_dirent_tail(dir_block, - inode->i_sb->s_blocksize); - set_buffer_uptodate(dir_block); - unlock_buffer(dir_block); - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); - if (err) - return err; - set_buffer_verified(dir_block); - return ext4_mark_inode_dirty(handle, inode); -} - static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) @@ -1239,8 +1150,17 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { - error = ext4_finish_convert_inline_dir(handle, inode, data_bh, - buf, inline_size); + unlock_buffer(data_bh); + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + + error = ext4_init_dirblock(handle, inode, data_bh, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), + buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + if (!error) + error = ext4_mark_inode_dirty(handle, inode); } out_restore: @@ -1379,7 +1299,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; - strcpy(fake.name, "."); + memcpy(fake.name, ".", 2); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); @@ -1389,7 +1309,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; - strcpy(fake.name, ".."); + memcpy(fake.name, "..", 3); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); @@ -1411,7 +1331,11 @@ int ext4_inlinedir_to_tree(struct file *dir_file, hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (err) { + ret = err; + goto out; + } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && @@ -1457,6 +1381,7 @@ int 
ext4_read_inline_dir(struct file *file, struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; + struct dir_private_info *info = file->private_data; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1500,12 +1425,12 @@ int ext4_read_inline_dir(struct file *file, extra_size = extra_offset + inline_size; /* - * If the version has changed since the last call to + * If the cookie has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and @@ -1537,7 +1462,7 @@ int ext4_read_inline_dir(struct file *file, } offset = i; ctx->pos = offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < extra_size) { @@ -1661,24 +1586,36 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; int ret; - struct ext4_iloc iloc; void *inline_start; int inline_size; - if (ext4_get_inode_loc(dir, &iloc)) - return NULL; + ret = ext4_get_inode_loc(dir, &is.iloc); + if (ret) + return ERR_PTR(ret); down_read(&EXT4_I(dir)->xattr_sem); + + ret = ext4_xattr_ibody_find(dir, &i, &is); + if (ret) + goto out; + if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; @@ -1688,20 +1625,23 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: - brelse(iloc.bh); - iloc.bh = NULL; + brelse(is.iloc.bh); + if (ret < 0) + is.iloc.bh = ERR_PTR(ret); + else + is.iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); - return iloc.bh; + return is.iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, @@ -1908,7 +1848,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) }; - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1947,7 +1887,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, + "missing inline data xattr"); + err = -EFSCORRUPTED; + goto out_error; + } value_len = 
le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); @@ -1991,7 +1936,7 @@ out: ext4_orphan_del(handle, inode); if (err == 0) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -2023,7 +1968,7 @@ int ext4_convert_inline_data(struct inode *inode) return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index f0c0fd507fbc..749af7ad4e09 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = { kunit_test_suites(&ext4_inode_test_suite); +MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding"); MODULE_LICENSE("GPL v2"); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89737d5a1614..0c466ccbed69 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -49,32 +50,35 @@ #include <trace/events/ext4.h> +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to); + static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -88,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -109,7 +113,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -136,16 +140,13 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int 
ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + if (!ext4_has_feature_ea_inode(inode->i_sb)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; @@ -176,6 +177,8 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); + dax_break_layout_final(inode); + if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { @@ -199,8 +202,7 @@ void ext4_evict_inode(struct inode *inode) * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ - if (!list_empty_careful(&inode->i_io_list)) - inode_io_list_del(inode); + inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any @@ -371,17 +373,18 @@ void ext4_da_update_reserve_space(struct inode *inode, */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -407,6 +410,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. + */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -453,6 +482,191 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + unsigned int status; + int retval; + unsigned int orig_mlen = map->m_len; + + flags &= EXT4_EX_QUERY_FILTER; + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, flags); + else + retval = ext4_ind_map_blocks(handle, inode, map, flags); + if (retval < 0) + return retval; + + /* A hole? */ + if (retval == 0) + goto out; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } else { + retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); + } +out: + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); + return retval; +} + +static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + unsigned int status; + int err, retval = 0; + + /* + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE + * indicates that the blocks and quotas has already been + * checked when the data was copied into the page cache. + */ + if (map->m_flags & EXT4_MAP_DELAYED) + flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + /* + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * We need to check for EXT4 here because migrate could have + * changed the inode type in between. 
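+	 * (The online migration paths, e.g. EXT4_IOC_MIGRATE, convert an
+	 * inode between the indirect and extents formats, so the
+	 * EXT4_INODE_EXTENTS flag is re-tested here under i_data_sem
+	 * rather than trusted from an earlier unlocked check.)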
+ */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + /* + * We allocated new blocks which will result in i_data's + * format changing. Force the migrate to fail by clearing + * migrate flags. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode %lu: " + "retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, + map->m_len); + if (err) + return err; + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { + if (ext4_es_is_written(&es)) + return retval; + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, + status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); + + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -465,9 +679,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. if - * create==0 and the blocks are pre-allocated and unwritten, the resulting @map - * is marked as unwritten. If the create == 1, it will mark @map as mapped. + * On success, it returns the number of blocks being mapped or allocated. + * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are + * pre-allocated and unwritten, the resulting @map is marked as unwritten. + * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to @@ -481,6 +696,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -501,9 +717,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. 
+ */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ - if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -515,6 +740,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; + map->m_flags |= ext4_es_is_delayed(&es) ? + EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; @@ -530,7 +757,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -544,32 +775,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -587,8 +793,7 @@ found: * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns the create = 0 - * with buffer head unmapped. + * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* @@ -599,12 +804,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; - /* - * Here we clear m_flags because after allocating an new extent, - * it will be set again. - */ - map->m_flags &= ~EXT4_MAP_FLAGS; + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -612,78 +813,15 @@ found: * with create == 1 flag. 
*/ down_write(&EXT4_I(inode)->i_data_sem); + retval = ext4_map_create_blocks(handle, inode, map, flags); + up_write((&EXT4_I(inode)->i_data_sem)); - /* - * We need to check for EXT4 here because migrate - * could have changed the inode type in between - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { - /* - * We allocated new blocks which will result in - * i_data's format changing. Force the migrate - * to fail by clearing migrate flags - */ - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - } - } - - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - /* - * We have to zeroout blocks before inserting them into extent - * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. We also have to - * unmap metadata before zeroing as otherwise writeback can - * overwrite zeros with stale data from block device. - */ - if (flags & EXT4_GET_BLOCKS_ZERO && - map->m_flags & EXT4_MAP_MAPPED && - map->m_flags & EXT4_MAP_NEW) { - ret = ext4_issue_zeroout(inode, map->m_lblk, - map->m_pblk, map->m_len); - if (ret) { - retval = ret; - goto out_sem; - } - } - - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - if (ext4_es_is_written(&es)) - goto out_sem; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } + if (retval < 0) + ext_debug(inode, "failed with err %d\n", retval); + if (retval <= 0) + return retval; -out_sem: - up_write((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + if (map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; @@ -698,9 +836,8 @@ out_sem: !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { - loff_t start_byte = - (loff_t)map->m_lblk << inode->i_blkbits; - loff_t length = (loff_t)map->m_len << inode->i_blkbits; + loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); + loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, @@ -712,12 +849,8 @@ out_sem: return ret; } } - if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || - map->m_flags & EXT4_MAP_MAPPED)) - ext4_fc_track_range(handle, inode, map->m_lblk, - map->m_lblk + map->m_len - 1); - if (retval < 0) - ext_debug(inode, "failed with err %d\n", retval); + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + + map->m_len - 1); return retval; } @@ -733,7 +866,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. 
 */
- if (!bh->b_page) {
+ if (!bh->b_folio) {
 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
 return;
 }
@@ -748,6 +881,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
 } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
}
+/*
+ * Make sure that the current journal transaction has enough credits to map
+ * one extent. Return -EAGAIN if it cannot extend the current running
+ * transaction.
+ */
+static inline int ext4_journal_ensure_extent_credits(handle_t *handle,
+ struct inode *inode)
+{
+ int credits;
+ int ret;
+
+ /* Called from ext4_da_write_begin() which has no handle started? */
+ if (!handle)
+ return 0;
+
+ credits = ext4_chunk_trans_blocks(inode, 1);
+ ret = __ext4_journal_ensure_credits(handle, credits, credits, 0);
+ return ret <= 0 ? ret : -EAGAIN;
+}
+
 static int _ext4_get_block(struct inode *inode, sector_t iblock,
 struct buffer_head *bh, int flags)
{
@@ -789,10 +942,22 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 struct buffer_head *bh_result, int create)
{
+ int ret = 0;
+
 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
 inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
+ ret = _ext4_get_block(inode, iblock, bh_result,
 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+
+ /*
+ * If the buffer is marked unwritten, mark it as new to make sure it is
+ * zeroed out correctly in case of partial writes. Otherwise, there is
+ * a chance of stale data getting exposed.
+ */
+ if (ret == 0 && buffer_unwritten(bh_result))
+ set_buffer_new(bh_result);
+
+ return ret;
}
 /* Maximum number of blocks we map for direct IO at once. */
@@ -826,7 +991,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 if (nowait)
 return sb_find_get_block(inode->i_sb, map.m_pblk);
- bh = sb_getblk(inode->i_sb, map.m_pblk);
+ /*
+ * Since the bh can carry extra references, e.g. one taken by an
+ * attached journal_head, avoid __GFP_MOVABLE here as the migration
+ * may fail while the journal_head reference remains.
+ */
+ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk,
+ inode->i_sb->s_blocksize);
+
 if (unlikely(!bh))
 return ERR_PTR(-ENOMEM);
 if (map.m_flags & EXT4_MAP_NEW) {
@@ -968,64 +1140,50 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
 */
 static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- folio_mark_dirty(bh->b_folio);
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
 return ext4_handle_dirty_metadata(handle, NULL, bh);
}
 int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 struct buffer_head *bh)
{
- int dirty = buffer_dirty(bh);
- int ret;
-
 if (!buffer_mapped(bh) || buffer_freed(bh))
 return 0;
- /*
- * __block_write_begin() could have dirtied some buffers. Clean
- * the dirty bit as jbd2_journal_get_write_access() could complain
- * otherwise about fs integrity issues. Setting of the dirty bit
- * by __block_write_begin() isn't a real problem here as we clear
- * the bit before releasing a page lock and thus writeback cannot
- * ever write the buffer.
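(The -EAGAIN convention established by ext4_journal_ensure_extent_credits() pushes the restart decision out to the top-level caller, which is expected to stop the handle, start a fresh one and retry. A schematic of that caller pattern in plain C, with a fake credit budget standing in for the journal handle; every name here is hypothetical, not kernel API:)

	#include <errno.h>
	#include <stdio.h>

	static int budget = 2;		/* credits left in the fake handle */

	/* 0: enough credits; -EAGAIN: caller must restart the handle */
	static int ensure_credits(void)
	{
		return budget > 0 ? 0 : -EAGAIN;
	}

	static int map_one_extent(void)
	{
		int err = ensure_credits();

		if (err)
			return err;
		budget--;		/* this extent consumes credits */
		return 0;
	}

	int main(void)
	{
		for (int i = 0; i < 5; i++) {
			int err = map_one_extent();

			if (err == -EAGAIN) {
				budget = 2;	/* "restart" the transaction */
				err = map_one_extent();
			}
			printf("extent %d -> %d\n", i, err);
		}
		return 0;
	}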
- */ - if (dirty) - clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (!ret && dirty) - ret = ext4_dirty_journalled_data(handle, bh); - return ret; } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, - get_block_t *get_block) +int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; - unsigned blocksize = inode->i_sb->s_blocksize; - unsigned bbits; + unsigned int blocksize = i_blocksize(inode); struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; + bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); + WARN_ON_ONCE(blocksize > folio_size(folio)); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } - bbits = ilog2(blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + block = EXT4_PG_TO_LBLK(inode, folio->index); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { @@ -1036,18 +1194,32 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, } continue; } - if (buffer_new(bh)) + if (WARN_ON_ONCE(buffer_new(bh))) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); + err = ext4_journal_ensure_extent_credits(handle, inode); + if (!err) + err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { + /* + * We may be zeroing partial buffers or all new + * buffers in case of failure. Prepare JBD2 for + * that. + */ + if (should_journal_data) + do_journal_get_write_access(handle, + inode, bh); if (folio_test_uptodate(folio)) { - clear_buffer_new(bh); + /* + * Unlike __block_write_begin() we leave + * dirtying of new uptodate buffers to + * ->write_end() time or + * folio_zero_new_buffers(). + */ set_buffer_uptodate(bh); - mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) @@ -1077,7 +1249,11 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - folio_zero_new_buffers(folio, from, to); + if (should_journal_data) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + else + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1093,7 +1269,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, return err; } -#endif /* * To preserve ordering, it is essential that the hole instantiation and @@ -1102,9 +1277,10 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. 
*/ -static int ext4_write_begin(struct file *file, struct address_space *mapping, +static int ext4_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; @@ -1114,22 +1290,22 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ - needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + needed_blocks = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - pagep); + foliop); if (ret < 0) return ret; if (ret == 1) @@ -1137,23 +1313,29 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * __filemap_get_folio() can take a long time if the + * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (len > folio_next_pos(folio) - pos) + len = folio_next_pos(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ if (!folio_buffers(folio)) - create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); + create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); @@ -1175,19 +1357,12 @@ retry_journal: /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); -#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(folio, pos, len, + ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); -#else - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, - ext4_get_block_unwritten); - else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); -#endif + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -1200,7 +1375,7 @@ retry_journal: folio_unlock(folio); /* - * __block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. 
* @@ -1223,13 +1398,14 @@ retry_journal: ext4_orphan_del(NULL, inode); } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (ret == -EAGAIN || + (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -1244,22 +1420,22 @@ static int write_end_fn(handle_t *handle, struct inode *inode, ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); + clear_buffer_new(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). + * `iocb` can be NULL - eg, when called from page_symlink(). * - * ext4 never places buffers on inode->i_mapping->private_list. metadata + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ -static int ext4_write_end(struct file *file, +static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1274,7 +1450,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1287,8 +1463,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. 
Second, it forces lock @@ -1348,9 +1526,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; folio_zero_range(folio, start, size); - write_end_fn(handle, inode, bh); } clear_buffer_new(bh); + write_end_fn(handle, inode, bh); } } block_start = block_end; @@ -1358,12 +1536,11 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, } while (bh != head); } -static int ext4_journalled_write_end(struct file *file, +static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1404,8 +1581,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1438,9 +1617,9 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve space for a single cluster + * Reserve space for 'nr_resv' clusters */ -static int ext4_da_reserve_space(struct inode *inode) +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1451,18 +1630,18 @@ static int ext4_da_reserve_space(struct inode *inode) * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); - if (ext4_claim_free_clusters(sbi, 1, 0)) { + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } - ei->i_reserved_data_blocks++; - trace_ext4_da_reserve_space(inode); + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1514,11 +1693,12 @@ struct mpage_da_data { unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ - pgoff_t first_page; /* The first page to write */ - pgoff_t next_page; /* Current page to examine */ - pgoff_t last_page; /* Last page to examine */ + loff_t start_pos; /* The start pos to write */ + loff_t next_pos; /* Current pos to examine */ + loff_t end_pos; /* Last pos to examine */ + /* - * Extent to map - this can be after first_page because that can be + * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ @@ -1538,38 +1718,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - /* This is necessary when next_page == 0. */ - if (mpd->first_page >= mpd->next_page) + /* This is necessary when next_pos == 0. 
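(mpage_da_data now tracks byte positions instead of page indices, and the scan code converts back to page indices and logical blocks with plain shifts, as the later hunks do with start_pos >> PAGE_SHIFT and pos >> inode->i_blkbits. A standalone illustration, with example shift widths only:)

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* 4 KiB pages (example) */
	#define BLKBITS    10	/* 1 KiB fs blocks (example) */

	int main(void)
	{
		unsigned long long start_pos = 0x5400;	/* byte position */

		printf("page index:    %llu\n", start_pos >> PAGE_SHIFT); /* 5 */
		printf("logical block: %llu\n", start_pos >> BLKBITS);    /* 21 */
		return 0;
	}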
 */
+ if (mpd->start_pos >= mpd->next_pos)
 return;
 mpd->scanned_until_end = 0;
- index = mpd->first_page;
- end = mpd->next_page - 1;
 if (invalidate) {
 ext4_lblk_t start, last;
- start = index << (PAGE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_SHIFT - inode->i_blkbits);
+ start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
+ last = mpd->next_pos >> inode->i_blkbits;
 /*
 * avoid racing with extent status tree scans made by
 * ext4_insert_delayed_block()
 */
 down_write(&EXT4_I(inode)->i_data_sem);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ ext4_es_remove_extent(inode, start, last - start);
 up_write(&EXT4_I(inode)->i_data_sem);
 }
 folio_batch_init(&fbatch);
- while (index <= end) {
- nr = filemap_get_folios(mapping, &index, end, &fbatch);
+ index = mpd->start_pos >> PAGE_SHIFT;
+ end = mpd->next_pos >> PAGE_SHIFT;
+ while (index < end) {
+ nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
 if (nr == 0)
 break;
 for (i = 0; i < nr; i++) {
 struct folio *folio = fbatch.folios[i];
- if (folio->index < mpd->first_page)
+ if (folio_pos(folio) < mpd->start_pos)
 continue;
- if (folio_next_index(folio) - 1 > end)
+ if (folio_next_index(folio) > end)
 continue;
 BUG_ON(!folio_test_locked(folio));
 BUG_ON(folio_test_writeback(folio));
@@ -1609,24 +1789,58 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 /*
- * ext4_insert_delayed_block - adds a delayed block to the extents status
- * tree, incrementing the reserved cluster/block
- * count or making a pending reservation
- * where needed
+ * Check whether the cluster containing lblk has been allocated or has
+ * delalloc reservation.
+ *
+ * Returns 0 if the cluster doesn't have either, 1 if it has delalloc
+ * reservation, 2 if it's already been allocated, negative error code on
+ * failure.
+ */
+static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ /* Has delalloc reservation? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
+ return 1;
+
+ /* Already been allocated? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+ return 2;
+ ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 2;
+
+ return 0;
+}
+
+/*
+ * ext4_insert_delayed_blocks - adds multiple delayed blocks to the extents
+ * status tree, incrementing the reserved
+ * cluster/block count or making pending
+ * reservations where needed
 *
 * @inode - file containing the newly added block
- * @lblk - logical block to be added
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
-static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 int ret;
- bool allocated = false;
+ bool lclu_allocated = false;
+ bool end_allocated = false;
+ ext4_lblk_t resv_clu;
+ ext4_lblk_t end = lblk + len - 1;
 /*
- * If the cluster containing lblk is shared with a delayed,
+ * If the cluster containing lblk or end is shared with a delayed,
 * written, or unwritten extent in a bigalloc file system, it's
 * already been accounted for and does not need to be reserved.
 * A pending reservation must be made for the cluster if it's
 * shared with a written or unwritten extent and doesn't already
 * have one. Two pending reservations must be made if the cluster
 * is shared with two written or unwritten extents or one of each
 * when the extents status tree doesn't get a match.
 */
 if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode);
+ ret = ext4_da_reserve_space(inode, len);
 if (ret != 0) /* ENOSPC */
 return ret;
 } else { /* bigalloc */
- if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
- if (!ext4_es_scan_clu(inode,
- &ext4_es_is_mapped, lblk)) {
- ret = ext4_clu_mapped(inode,
- EXT4_B2C(sbi, lblk));
- if (ret < 0)
- return ret;
- if (ret == 0) {
- ret = ext4_da_reserve_space(inode);
- if (ret != 0) /* ENOSPC */
- return ret;
- } else {
- allocated = true;
- }
- } else {
- allocated = true;
+ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1;
+
+ ret = ext4_clu_alloc_state(inode, lblk);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ lclu_allocated = (ret == 2);
+ }
+
+ if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
+ ret = ext4_clu_alloc_state(inode, end);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ end_allocated = (ret == 2);
+ }
 }
+
+ if (resv_clu) {
+ ret = ext4_da_reserve_space(inode, resv_clu);
+ if (ret != 0) /* ENOSPC */
+ return ret;
+ }
 }
- ext4_es_insert_delayed_block(inode, lblk, allocated);
+ ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
+ end_allocated);
 return 0;
}
 /*
- * This function is grabs code from the very beginning of
- * ext4_map_blocks, but assumes that the caller is from delayed write
- * time. This function looks up the requested blocks and sets the
- * buffer delay bit under the protection of i_data_sem.
+ * Looks up the requested blocks and sets the delalloc extent map.
+ * First try to look up the extent entry that contains the requested
+ * blocks in the extent status tree without holding i_data_sem, then
+ * try to look up the ondisk extent mapping with i_data_sem held in
+ * read mode, and finally hold i_data_sem in write mode, look up again
+ * and add a delalloc extent entry if it still couldn't find any extent.
+ * Pass out the mapped extent through @map and return 0 on success.
 */
-static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
- struct ext4_map_blocks *map,
- struct buffer_head *bh)
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
{
 struct extent_status es;
 int retval;
- sector_t invalid_block = ~((sector_t) 0xffff);
 #ifdef ES_AGGRESSIVE_TEST
 struct ext4_map_blocks orig_map;
 memcpy(&orig_map, map, sizeof(*map));
 #endif
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
-
 map->m_flags = 0;
 ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
 (unsigned long) map->m_lblk);
+ ext4_check_map_extents_env(inode);
+
 /* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
- if (ext4_es_is_hole(&es)) {
- retval = 0;
- down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
+ if (ext4_es_is_hole(&es))
 goto add_delayed;
- }
+found:
 /*
 * Delayed extent could be allocated by fallocate.
 * So we need to check it.
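(For bigalloc, the reservation count in ext4_insert_delayed_blocks() starts from the number of clusters spanned by [lblk, end] and drops by one for each boundary cluster that is already allocated or already carries a delalloc reservation. The span arithmetic, sketched standalone; the cluster-state checks are stubbed here, whereas the patch gets them from ext4_clu_alloc_state():)

	#include <stdio.h>

	#define CLUSTER_BITS 4			/* 16 blocks per cluster (example) */
	#define B2C(b) ((b) >> CLUSTER_BITS)	/* block to cluster, like EXT4_B2C */

	int main(void)
	{
		unsigned int lblk = 30, len = 40;	/* blocks [30, 70) */
		unsigned int end = lblk + len - 1;	/* last block: 69 */
		unsigned int resv = B2C(end) - B2C(lblk) + 1;	/* clusters 1..4 */

		/* pretend the first cluster already has a delalloc reservation */
		int lclu_state = 1;

		if (lclu_state > 0)
			resv--;

		printf("clusters to reserve: %u\n", resv);	/* prints 3 */
		return 0;
	}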
*/ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delayed(&es)) { + map->m_flags |= EXT4_MAP_DELAYED; return 0; } - map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; - retval = es.es_len - (iblock - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1725,7 +1941,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif - return retval; + return 0; } /* @@ -1735,48 +1951,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; - else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); + retval = ext4_map_query_blocks(NULL, inode, map, 0); + up_read(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval < 0 ? retval : 0; add_delayed: - if (retval == 0) { - int ret; - - /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? - */ + down_write(&EXT4_I(inode)->i_data_sem); + /* + * Page fault path (ext4_page_mkwrite does not take i_rwsem) + * and fallocate path (no folio lock) can race. Make sure we + * lookup the extent status tree here again while i_data_sem + * is held in write mode, before inserting a new da entry in + * the extent status tree. + */ + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); - ret = ext4_insert_delayed_block(inode, map->m_lblk); - if (ret != 0) { - retval = ret; - goto out_unlock; + if (!ext4_es_is_hole(&es)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto found; } - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - } else if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); + } else if (!ext4_has_inline_data(inode)) { + retval = ext4_map_query_blocks(NULL, inode, map, 0); + if (retval) { + up_write(&EXT4_I(inode)->i_data_sem); + return retval < 0 ? retval : 0; } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
 }
-out_unlock:
- up_read((&EXT4_I(inode)->i_data_sem));
+ map->m_flags |= EXT4_MAP_DELAYED;
+ retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
+ if (!retval)
+ map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ up_write(&EXT4_I(inode)->i_data_sem);
 return retval;
}
@@ -1797,11 +2007,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 struct buffer_head *bh, int create)
{
 struct ext4_map_blocks map;
+ sector_t invalid_block = ~((sector_t) 0xffff);
 int ret = 0;
 BUG_ON(create == 0);
 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
 map.m_lblk = iblock;
 map.m_len = 1;
@@ -1810,10 +2024,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 * preallocated blocks are unmapped but should be treated
 * the same as allocated blocks.
 */
- ret = ext4_da_map_blocks(inode, iblock, &map, bh);
- if (ret <= 0)
+ ret = ext4_da_map_blocks(inode, &map);
+ if (ret < 0)
 return ret;
+ if (map.m_flags & EXT4_MAP_DELAYED) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
 map_bh(bh, inode->i_sb, map.m_pblk);
 ext4_update_bh_state(bh, map.m_flags);
@@ -1832,7 +2053,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
- mpd->first_page += folio_nr_pages(folio);
+ mpd->start_pos += folio_size(folio);
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
 folio_unlock(folio);
}
@@ -1842,7 +2064,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
 loff_t size;
 int err;
- BUG_ON(folio->index != mpd->first_page);
+ WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
 folio_clear_dirty_for_io(folio);
 /*
 * We have to be very careful here!
Nothing protects writeback path @@ -1861,10 +2083,8 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); - if (!err) - mpd->wbc->nr_to_write--; return err; } @@ -2012,7 +2232,6 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; - int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); @@ -2038,7 +2257,8 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, err = PTR_ERR(io_end_vec); goto out; } - io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, + mpd->map.m_lblk); } *map_bh = true; goto out; @@ -2048,7 +2268,7 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - io_end_size += (1 << blkbits); + io_end_size += i_blocksize(mpd->inode); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; @@ -2078,16 +2298,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; - int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; - start = mpd->map.m_lblk >> bpp_bits; - end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; + start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); + end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2098,6 +2316,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = EXT4_PG_TO_LBLK(inode, folio->index); err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2131,6 +2350,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) int get_blocks_flags; int err, dioread_nolock; + /* Make sure transaction has enough credits for this extent */ + err = ext4_journal_ensure_extent_credits(handle, inode); + if (err < 0) + return err; + trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or @@ -2140,21 +2364,18 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if - * the blocks in question are delalloc blocks. This indicates - * that the blocks and quotas has already been checked when - * the data was copied into the page cache. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. 
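(mpage_submit_folio() now derives the tail length with size & (len - 1) rather than size & ~PAGE_MASK, which generalizes the mask from PAGE_SIZE to any power-of-two folio size: for a folio that straddles i_size it is simply i_size modulo the folio size. A standalone check of that identity, sizes chosen arbitrarily:)

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long long size = 0x12345;	/* i_size inside the folio */
		unsigned long long len = 0x4000;	/* 16 KiB folio, power of two */

		/* for power-of-two len, masking with len - 1 is a modulo */
		assert((size & (len - 1)) == size % len);
		printf("tail bytes in final folio: %llu\n", size & (len - 1));
		return 0;
	}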
 */
 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
 EXT4_GET_BLOCKS_METADATA_NOFAIL |
- EXT4_GET_BLOCKS_IO_SUBMIT;
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE;
+
 dioread_nolock = ext4_should_dioread_nolock(inode);
 if (dioread_nolock)
 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (map->m_flags & BIT(BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
 if (err < 0)
@@ -2165,7 +2386,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
 handle->h_rsv_handle = NULL;
 }
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(mpd->io_submit.io_end);
 }
 BUG_ON(map->m_len == 0);
@@ -2173,6 +2394,47 @@
 return 0;
}
 /*
+ * This is used to submit mapped buffers in a single folio that is not fully
+ * mapped for various reasons, such as insufficient space or journal credits.
+ */
+static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct folio *folio;
+ loff_t pos;
+ int ret;
+
+ folio = filemap_get_folio(inode->i_mapping,
+ mpd->start_pos >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ /*
+ * The mapped position should be within the folio currently being
+ * processed, but must not be the folio's start position.
+ */
+ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits;
+ if (WARN_ON_ONCE((folio_pos(folio) == pos) ||
+ !folio_contains(folio, pos >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = mpage_submit_folio(mpd, folio);
+ if (ret)
+ goto out;
+ /*
+ * Update start_pos to prevent this folio from being released in
+ * mpage_release_unused_pages(); it will be reset to the aligned folio
+ * pos when this folio is written again in the next round. Additionally,
+ * do not update wbc->nr_to_write here, as it will be updated once the
+ * entire folio has finished processing.
+ */
+ mpd->start_pos = pos;
+out:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
+/*
 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
 * mpd->len and submit pages underlying it for IO
 *
@@ -2207,24 +2469,31 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 io_end_vec = ext4_alloc_io_end_vec(io_end);
 if (IS_ERR(io_end_vec))
 return PTR_ERR(io_end_vec);
- io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
 do {
 err = mpage_map_one_extent(handle, mpd);
 if (err < 0) {
 struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(EXT4_SB(sb)) ||
- ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ if (ext4_emergency_state(sb))
 goto invalidate_dirty_pages;
 /*
 * Let the upper layers retry transient errors.
 * In the case of ENOSPC, if ext4_count_free_clusters()
 * is non-zero, a commit should free up blocks.
 */
- if ((err == -ENOMEM) ||
+ if ((err == -ENOMEM) || (err == -EAGAIN) ||
 (err == -ENOSPC &&
 ext4_count_free_clusters(sb))) {
- if (progress)
+ /*
+ * We may have already allocated extents for
+ * some bhs inside the folio; issue the
+ * corresponding data to prevent exposing
+ * stale data.
+ */
+ if (progress) {
+ if (mpage_submit_partial_folio(mpd))
+ goto invalidate_dirty_pages;
 goto update_disksize;
+ }
 return err;
 }
 ext4_msg(sb, KERN_CRIT,
@@ -2258,7 +2527,7 @@ update_disksize:
 * Update on-disk size after IO is submitted. Races with
 * truncate are avoided by checking i_size under i_data_sem.
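(mpage_submit_partial_folio() leans on one invariant: the first unmapped block, mpd->map.m_lblk, must point inside the folio being processed but not at its first byte, otherwise nothing was mapped and there is nothing to submit. The check reduces to byte arithmetic; a standalone sketch with example geometry:)

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int blkbits = 12;			/* 4 KiB blocks (example) */
		unsigned long long folio_pos = 0x10000;		/* 64 KiB folio */
		unsigned long long folio_size = 0x10000;
		unsigned long long m_lblk = 0x12;		/* first unmapped block */
		unsigned long long pos = m_lblk << blkbits;	/* 0x12000 */

		bool inside = pos >= folio_pos && pos < folio_pos + folio_size;
		bool at_start = pos == folio_pos;

		/* prints "valid partial submit: yes" */
		printf("valid partial submit: %s\n",
		       inside && !at_start ? "yes" : "no");
		return 0;
	}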
*/ - disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; + disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; @@ -2282,21 +2551,6 @@ update_disksize: return err; } -/* - * Calculate the total number of credits to reserve for one writepages - * iteration. This is called from ext4_writepages(). We map an extent of - * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping - * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + - * bpp - 1 blocks in bpp different extents. - */ -static int ext4_da_writepages_trans_blocks(struct inode *inode) -{ - int bpp = ext4_journal_blocks_per_page(inode); - - return ext4_meta_trans_blocks(inode, - MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); -} - static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { @@ -2327,11 +2581,11 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size - folio_pos(folio); + len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } @@ -2361,23 +2615,19 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - pgoff_t index = mpd->first_page; - pgoff_t end = mpd->last_page; + pgoff_t index = mpd->start_pos >> PAGE_SHIFT; + pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; - int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; - mpd->next_page = index; + mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); @@ -2404,11 +2654,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= - mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len)) goto out; /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != folio->index) + if (mpd->map.m_len > 0 && + mpd->next_pos != folio_pos(folio)) goto out; if (handle) { @@ -2454,8 +2705,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) } if (mpd->map.m_len == 0) - mpd->first_page = folio->index; - mpd->next_page = folio_next_index(folio); + mpd->start_pos = folio_pos(folio); + mpd->next_pos = folio_next_pos(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. 
For data=journal mode we @@ -2481,8 +2732,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)folio->index) << - (PAGE_SHIFT - blkbits); + lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); @@ -2534,17 +2784,15 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because + * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || - ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { - ret = -EROFS; + ret = ext4_emergency_state(mapping->host->i_sb); + if (unlikely(ret)) goto out_writepages; - } /* * If we have inline data and arrive here, it means that @@ -2585,12 +2833,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { + int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in - * the page and we may dirty the inode. + * the folio and we may dirty the inode. */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) @@ -2600,18 +2848,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd->first_page = writeback_index; - mpd->last_page = -1; + mpd->start_pos = writeback_index << PAGE_SHIFT; + mpd->end_pos = LLONG_MAX; } else { - mpd->first_page = wbc->range_start >> PAGE_SHIFT; - mpd->last_page = wbc->range_end >> PAGE_SHIFT; + mpd->start_pos = wbc->range_start; + mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd->first_page, - mpd->last_page); + tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, + mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* @@ -2654,8 +2902,14 @@ retry: * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = ext4_da_writepages_trans_blocks(inode); - + /* + * Calculate the number of credits needed to reserve for one + * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will + * attempt to extend the transaction or start a new iteration + * if the reserved credits are insufficient. 
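(Several of the reworked credit formulas are stated per folio rather than per page; the underlying count is just how many fs blocks one folio spans, folio size divided by block size, which is what a blocks-per-folio helper has to return and what grows with large folios. Illustrative values:)

	#include <stdio.h>

	int main(void)
	{
		unsigned int blkbits = 10;		/* 1 KiB blocks (example) */
		unsigned long folio_size = 64 * 1024;	/* 64 KiB folio */

		printf("blocks per folio: %lu\n", folio_size >> blkbits); /* 64 */
		return 0;
	}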
+ */ + needed_blocks = ext4_chunk_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); @@ -2671,7 +2925,8 @@ retry: } mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + trace_ext4_da_write_folios_start(inode, mpd->start_pos, + mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, @@ -2709,6 +2964,8 @@ retry: } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; + trace_ext4_da_write_folios_end(inode, mpd->start_pos, + mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2720,6 +2977,8 @@ retry: ret = 0; continue; } + if (ret == -EAGAIN) + ret = 0; /* Fatal error - ENOMEM, EIO... */ if (ret) break; @@ -2728,8 +2987,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd->last_page = writeback_index - 1; - mpd->first_page = 0; + mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; + mpd->start_pos = 0; goto retry; } @@ -2739,7 +2998,7 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd->first_page; + mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, @@ -2759,8 +3018,9 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); @@ -2798,16 +3058,17 @@ static int ext4_dax_writepages(struct address_space *mapping, int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); + ret = dax_writeback_mapping_range(mapping, + EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); @@ -2848,31 +3109,33 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -static int ext4_da_write_begin(struct file *file, struct address_space *mapping, +static int ext4_da_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; - return ext4_write_begin(file, mapping, pos, - len, pagep, fsdata); + return ext4_write_begin(iocb, mapping, pos, + len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = 
ext4_da_write_inline_data_begin(mapping, inode, pos, len,
- pagep, fsdata);
+ ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, fsdata, true);
 if (ret < 0)
 return ret;
 if (ret == 1)
@@ -2880,24 +3143,20 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 }
 retry:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ folio = write_begin_get_folio(iocb, mapping, index, len);
 if (IS_ERR(folio))
 return PTR_ERR(folio);
- /* In case writeback began while the folio was unlocked */
- folio_wait_stable(folio);
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
-#ifdef CONFIG_FS_ENCRYPTION
- ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
-#else
- ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);
-#endif
+ ret = ext4_block_write_begin(NULL, folio, pos, len,
+ ext4_da_get_block_prep);
 if (ret < 0) {
 folio_unlock(folio);
 folio_put(folio);
 /*
- * block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
 * outside i_size. Trim these off again. Don't need
 * i_size_read because we hold inode lock.
 */
@@ -2910,7 +3169,7 @@ retry:
 return ret;
 }
- *pagep = &folio->page;
+ *foliop = folio;
 return ret;
}
@@ -2937,20 +3196,87 @@ static int ext4_da_should_update_i_disksize(struct folio *folio,
 return 1;
}
-static int ext4_da_write_end(struct file *file,
+static int ext4_da_do_write_end(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
+ bool disksize_changed = false;
+ loff_t new_i_size, zero_len = 0;
+ handle_t *handle;
+
+ if (unlikely(!folio_buffers(folio))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
+ }
+ /*
+ * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
+ * flag, which is all that's needed to trigger page writeback.
+ */
+ copied = block_write_end(pos, len, copied, folio);
+ new_i_size = pos + copied;
+
+ /*
+ * It's important to update i_size while still holding folio lock,
+ * because folio writeout could otherwise come in and zero beyond
+ * i_size.
+ *
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range up to i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize up to i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block which ext4_da_should_update_i_disksize()
+ * checked, we need to update i_disksize here as certain
+ * ext4_writepages() paths not allocating blocks update i_disksize.
+ */ + if (new_i_size > inode->i_size) { + unsigned long end; + + i_size_write(inode, new_i_size); + end = offset_in_folio(folio, new_i_size - 1); + if (copied && ext4_da_should_update_i_disksize(folio, end)) { + ext4_update_i_disksize(inode, new_i_size); + disksize_changed = true; + } + } + + folio_unlock(folio); + folio_put(folio); + + if (pos > old_size) { + pagecache_isize_extended(inode, old_size, pos); + zero_len = pos - old_size; + } + + if (!disksize_changed && !zero_len) + return copied; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return copied; +} + +static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; - loff_t new_i_size; - unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; - struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) - return ext4_write_end(file, mapping, pos, - len, copied, &folio->page, fsdata); + return ext4_write_end(iocb, mapping, pos, + len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -2960,33 +3286,10 @@ static int ext4_da_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - if (unlikely(copied < len) && !PageUptodate(page)) + if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * Since we are holding inode lock, we are sure i_disksize <= - * i_size. We also know that if i_disksize < i_size, there are - * delalloc writes pending in the range upto i_size. If the end of - * the current write is <= i_size, there's no need to touch - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as certain - * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). - */ - new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(folio, end)) - ext4_update_i_disksize(inode, new_i_size); - - return generic_write_end(file, mapping, pos, len, copied, &folio->page, - fsdata); + return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* @@ -3169,9 +3472,9 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } /* Any metadata buffers to write? 
 */
- if (!list_empty(&inode->i_mapping->private_list))
+ if (!list_empty(&inode->i_mapping->i_private_list))
 return true;
- return inode->i_state & I_DIRTY_DATASYNC;
+ return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
}
 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
@@ -3193,12 +3496,16 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 if (map->m_flags & EXT4_MAP_NEW)
 iomap->flags |= IOMAP_F_NEW;
+ /* HW-offload atomics are always used */
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
 if (flags & IOMAP_DAX)
 iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
 else
 iomap->bdev = inode->i_sb->s_bdev;
- iomap->offset = (u64) map->m_lblk << blkbits;
- iomap->length = (u64) map->m_len << blkbits;
+ iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
+ iomap->length = EXT4_LBLK_TO_B(inode, map->m_len);
 if ((map->m_flags & EXT4_MAP_MAPPED) &&
 !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3223,18 +3530,157 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 iomap->addr = (u64) map->m_pblk << blkbits;
 if (flags & IOMAP_DAX)
 iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+ } else if (map->m_flags & EXT4_MAP_DELAYED) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
 } else {
 iomap->type = IOMAP_HOLE;
 iomap->addr = IOMAP_NULL_ADDR;
 }
}
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ unsigned int mapped_len = 0, m_flags = 0;
+ ext4_fsblk_t next_pblk = 0;
+ bool check_next_pblk = false;
+ int ret = 0;
+
+ WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+ /*
+ * This is a slow path in case of mixed mapping. We use the
+ * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single
+ * contiguous mapped extent. This will ensure any unwritten or hole
+ * regions within the requested range are zeroed out and we return
+ * a single contiguous mapped extent.
+ */
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+ do {
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out_err;
+ /*
+ * This should never happen, but let's return an error code to
+ * avoid an infinite loop in here.
+ */
+ if (ret == 0) {
+ ret = -EFSCORRUPTED;
+ ext4_warning_inode(inode,
+ "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+ m_flags, ret);
+ goto out_err;
+ }
+ /*
+ * With bigalloc we should never get ENOSPC nor discontiguous
+ * physical extents.
+ */
+ if ((check_next_pblk && next_pblk != map->m_pblk) ||
+ ret == -ENOSPC) {
+ ext4_warning_inode(inode,
+ "Non-contiguous allocation detected: expected %llu, got %llu, "
+ "or ext4_map_blocks() returned out of space ret: %d",
+ next_pblk, map->m_pblk, ret);
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+ next_pblk = map->m_pblk + map->m_len;
+ check_next_pblk = true;
+
+ mapped_len += map->m_len;
+ map->m_lblk += map->m_len;
+ map->m_len = m_len - mapped_len;
+ } while (mapped_len < m_len);
+
+ /*
+ * We might have done some work in the above loop, so we need to query
+ * the start of the physical extent, based on the original m_lblk and
+ * m_len. Let's also ensure we were able to allocate the required
+ * range for the mixed mapping case.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+ if (ret != m_len) {
+ ext4_warning_inode(inode,
+ "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+ m_lblk, m_len, ret);
+ ret = -EINVAL;
+ }
+ return ret;
+
+out_err:
+ /* reset map before returning an error */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+ return ret;
+}
+
+/*
+ * ext4_map_blocks_atomic_write: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying
+ * physical extent for the requested range does not have a single contiguous
+ * mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten extents / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk + m_len). Note that this is only possible because we know
+ * this can be called only with a bigalloc enabled filesystem where the
+ * underlying cluster is already allocated. This avoids allocating
+ * discontiguous extents in the slow path due to multiple calls to
+ * ext4_map_blocks().
+ * The slow path is mostly a non-performance-critical path, so it should be ok
+ * to loop using ext4_map_blocks() with appropriate flags to allocate & zero
+ * the underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int m_flags,
+ bool *force_commit)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ int ret = 0;
+
+ WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 || ret == m_len)
+ goto out;
+ /*
+ * This is a mixed mapping case where we were not able to allocate
+ * a single contiguous extent. In that case let's reset the requested
+ * mapping and call the slow path.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ /*
+ * Taking the slow path means we have a mixed mapping; we will need
+ * to force a transaction commit.
+ */
+ *force_commit = true;
+ return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+ return ret;
+}
+
 static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
 unsigned int flags)
{
 handle_t *handle;
- u8 blkbits = inode->i_blkbits;
 int ret, dio_credits, m_flags = 0, retries = 0;
+ bool force_commit = false;
 /*
 * Trim the mapping request to the maximum value that we can map at
@@ -3242,7 +3688,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
 */
 if (map->m_len > DIO_MAX_BLOCKS)
 map->m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+ /*
+ * Journal credits estimation for atomic writes. We call
+ * ext4_map_blocks() to find out if there could be a mixed mapping. If
+ * so, let's assume the no. of pextents required can be m_len, i.e.
+ * every alternate block can be an unwritten extent or a hole.
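(The credit estimate above hinges on whether a read-only mapping query already covers the whole requested range; if it comes back short, the range may be a mixed mapping and the worst case of one physical extent per block is budgeted. A compact restatement of that decision with a stubbed query; the real code calls ext4_map_blocks() and the two trans-blocks helpers:)

	#include <stdio.h>

	/* pretend the query mapped only 3 of the requested blocks */
	static unsigned int query_mapped_len(unsigned int req)
	{
		return req < 3 ? req : 3;
	}

	int main(void)
	{
		unsigned int orig_mlen = 8;
		unsigned int got = query_mapped_len(orig_mlen);

		if (got < orig_mlen)
			/* possibly mixed: budget for up to orig_mlen extents */
			printf("worst case: credits for %u pextents\n", orig_mlen);
		else
			printf("single extent: normal chunk credits\n");
		return 0;
	}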
+ */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3268,12 +3737,16 @@ retry: * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ - else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) + else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3287,6 +3760,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in a + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. + */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3297,6 +3786,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3310,6 +3800,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3320,11 +3811,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for the overwrite path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3338,6 +3841,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes.
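+ * A mapping shorter than the requested range would let iomap issue the bio + * for only part of that range, tearing the atomic write; warning and failing + * with -EINVAL keeps the failure all-or-nothing.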
+ */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3359,61 +3872,19 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } -static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) -{ - /* - * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. - */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) - return -ENOTBLK; - - return 0; -} - const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, - .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, - .iomap_end = ext4_iomap_end, }; -static bool ext4_iomap_is_delalloc(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct extent_status es; - ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map->m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) - return false; - - if (es.es_lblk > map->m_lblk) { - map->m_len = es.es_lblk - map->m_lblk; - return false; - } - - offset = map->m_lblk - es.es_lblk; - map->m_len = es.es_len - offset; - - return true; -} - static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; - bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; @@ -3454,13 +3925,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; - if (ret == 0) - delalloc = ext4_iomap_is_delalloc(inode, &map); - set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); - if (delalloc && iomap->type == IOMAP_HOLE) - iomap->type = IOMAP_DELALLOC; return 0; } @@ -3517,10 +3983,9 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3534,10 +3999,9 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3551,16 +4015,14 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = 
generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, @@ -3586,12 +4048,16 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } +/* + * Here we can't skip an unwritten buffer even though it usually reads zero + * because it might have data in the pagecache (e.g., if called from + * ext4_zero_range, ext4_punch_hole, etc) which needs to be properly zeroed + * out. Otherwise a racing writeback can come later and flush the stale + * pagecache to disk. + */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3606,15 +4072,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = EXT4_PG_TO_LBLK(inode, folio->index); bh = folio_buffers(folio); - if (!bh) { - create_empty_buffers(&folio->page, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3692,9 +4157,8 @@ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; - unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; - unsigned max = blocksize - (offset & (blocksize - 1)); + unsigned int max = blocksize - (from & (blocksize - 1)); /* * correct length if it does not fall between @@ -3719,7 +4183,6 @@ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { - unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; @@ -3728,8 +4191,8 @@ static int ext4_block_truncate_page(handle_t *handle, if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); + blocksize = i_blocksize(inode); + length = blocksize - (from & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } @@ -3787,7 +4250,11 @@ int ext4_can_truncate(struct inode *inode) * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but - * that will never happen after we truncate page cache. + * that will never happen if we remove the folio containing i_size from the + * page cache. Also, if we punch a hole within i_size but above i_disksize, + * a subsequent ext4_page_mkwrite() may mistakenly allocate written blocks + * over the hole and thus introduce allocated blocks beyond i_disksize, which + * is not allowed (e2fsck would complain in case of crash).
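+ * + * For example (illustrative sizes, assuming 4K blocks): with i_size = 8192 + * and i_disksize = 2048, punching the hole [4096, 6144) must first push + * i_disksize out to 6144, so that written blocks later allocated over the + * hole by ext4_page_mkwrite() stay within i_disksize.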
*/ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) @@ -3798,9 +4265,11 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); - if (offset > size || offset + len < size) + if (offset > size) return 0; + if (offset + len < size) + size = offset + len; if (EXT4_I(inode)->i_disksize >= size) return 0; @@ -3814,6 +4283,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. */ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsistent data on disk + * in case of a crash before the transaction that frees blocks or + * converts unwritten extents is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios.
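+ * + * For example (illustrative sizes), with 1K blocks on a 4K page, punching + * [1024, 3072) frees blocks 1 and 2 of the first folio while blocks 0 and 3 + * stay mapped; ext4_truncate_folio() write-protects that folio via + * folio_mkclean(), so a later write to the surviving blocks faults into + * ext4_page_mkwrite().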
+ */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3823,24 +4354,10 @@ static void ext4_wait_dax_page(struct inode *inode) int ext4_break_layouts(struct inode *inode) { - struct page *page; - int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; - do { - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - error = ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, - TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(inode)); - } while (error == 0); - - return error; + return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* @@ -3858,146 +4375,112 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset, max_length; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = sb->s_maxbytes; + loff_t end = offset + length; handle_t *handle; unsigned int credits; - int ret = 0, ret2 = 0; + int ret; trace_ext4_punch_hole(inode, offset, length, 0); + WARN_ON_ONCE(!inode_is_locked(inode)); /* - * Write out all dirty pages to avoid race conditions - * Then release them. + * For indirect-block based inodes, make sure the hole ends at least + * one block before the maximum file size (s_bitmap_maxbytes). */ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - - inode_lock(inode); + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; + if (offset >= inode->i_size || offset >= max_end) + return 0; /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size + * If the hole extends beyond i_size, set the hole to end after + * the block that contains i_size to save pointless tail block zeroing. */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - - offset; - } + if (end >= inode->i_size) + end = round_up(inode->i_size, sb->s_blocksize); + if (end > max_end) + end = max_end; + length = end - offset; /* - * For punch hole the length + offset needs to be within one block - * before last range. Adjust the length if it goes beyond that limit. + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block.
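+ * + * For example (illustrative sizes, assuming 4K blocks), punching [512, 9728) + * zeroes the partial ranges [512, 4096) and [8192, 9728) under the journal + * and only frees the fully covered block at [4096, 8192).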
*/ - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; - if (offset + length > max_length) - length = max_length - offset; - - if (offset & (sb->s_blocksize - 1) || - (offset + length) & (sb->s_blocksize - 1)) { - /* - * Attach jinode to inode for jbd2 if we do any zeroing of - * partial block - */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) - goto out_mutex; - + return ret; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - ret = file_modified(file); + ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_dio; - - first_block_offset = round_up(offset, sb->s_blocksize); - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + return ret; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) { - ret = ext4_update_disksize_before_punch(inode, offset, length); - if (ret) - goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - } + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 2); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); - goto out_dio; + return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, - length); + ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) - goto out_stop; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + goto out_handle; /* If there are blocks to remove, do it */ - if (stop_block > first_block) { + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; + + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, - stop_block - first_block); + ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, + EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(handle, inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - - inode->i_mtime = inode_set_ctime_current(inode); - ret2 = 
ext4_mark_inode_dirty(handle, inode); - if (unlikely(ret2)) - ret = ret2; - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_dio: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -4067,7 +4550,7 @@ int ext4_truncate(struct inode *inode) * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ - if (!(inode->i_state & (I_NEW|I_FREEING))) + if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); @@ -4093,7 +4576,7 @@ int ext4_truncate(struct inode *inode) } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); @@ -4119,9 +4602,11 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; - down_write(&EXT4_I(inode)->i_data_sem); + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); - ext4_discard_preallocations(inode, 0); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); @@ -4146,7 +4631,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; @@ -4232,7 +4717,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ - if (ei->i_dtime && list_empty(&ei->i_orphan)) { + if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { @@ -4250,8 +4735,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); @@ -4439,10 +4924,10 @@ make_io: * Read the block from disk. 
*/ trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, + ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; @@ -4584,6 +5069,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode, *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; + err = xattr_check_inode(inode, IHDR(inode, raw_inode), + ITAIL(inode, raw_inode)); + if (err) + return err; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) @@ -4615,22 +5105,62 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) { + const char *err_str; + if (flags & EXT4_IGET_EA_INODE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "missing EA_INODE flag"; + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - EXT4_I(inode)->i_file_acl) - return "ea_inode with extended attributes"; + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } } else { - if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "unexpected EA_INODE flag"; + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } } - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) - return "unexpected bad inode w/o EXT4_IGET_BAD"; - return NULL; + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; + } + return 0; + +error: + ext4_error_inode(inode, function, line, 0, "%s", err_str); + return -EFSCORRUPTED; +} + +void ext4_set_inode_mapping_order(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + u16 min_order, max_order; + + max_order = EXT4_SB(sb)->s_max_folio_order; + if (!max_order) + return; + + min_order = EXT4_SB(sb)->s_min_folio_order; + if (!min_order && !S_ISREG(inode->i_mode)) + return; + + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + max_order = min_order; + + mapping_set_folio_order_range(inode->i_mapping, min_order, max_order); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4642,7 +5172,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; - const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4651,12 +5180,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == 
le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) @@ -4670,11 +5194,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); + if (!(inode_state_read_once(inode) & I_NEW)) { + ret = check_igot_inode(inode, flags, function, line); + if (ret) { iput(inode); - return ERR_PTR(-EFSCORRUPTED); + return ERR_PTR(ret); } return inode; } @@ -4710,15 +5234,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || @@ -4749,7 +5272,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); @@ -4780,13 +5302,22 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); + /* Detect invalid flag combination - can't have both inline data and extents */ + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_error_inode(inode, function, line, 0, + "inode has both inline data and extents flags"); + ret = -EFSCORRUPTED; + goto bad_inode; + } inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4797,7 +5328,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. 
*/ - if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + if (!ext4_has_feature_dir_index(sb) && + ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); @@ -4859,8 +5391,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } EXT4_INODE_GET_CTIME(inode, raw_inode); - EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_GET_ATIME(inode, raw_inode); + EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { @@ -4916,10 +5448,19 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } @@ -4940,16 +5481,32 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; goto bad_inode; } + ext4_set_inode_mapping_order(inode); + + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. 
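+ * + * For example, a stale file handle can name an inode number that has since + * been reused for an ea_inode; returning -ESTALE lets open_by_handle_at() + * fail cleanly instead of reporting the filesystem as corrupted.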
+ */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); + } + if (ret) + goto bad_inode; brelse(iloc.bh); + /* Initialize the "no ACL's" state for the simple cases */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl) + cache_no_acl(inode); unlock_new_inode(inode); return inode; @@ -4977,13 +5534,13 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); @@ -5131,12 +5688,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || - sb_rdonly(inode->i_sb)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { @@ -5186,8 +5743,9 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = 0; + tid_t commit_tid; int ret; + bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* @@ -5212,12 +5770,14 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) folio_put(folio); if (ret != -EBUSY) return; - commit_tid = 0; + has_transaction = false; read_lock(&journal->j_state_lock); - if (journal->j_committing_transaction) + if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } read_unlock(&journal->j_state_lock); - if (commit_tid) + if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } @@ -5255,8 +5815,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + error = ext4_emergency_state(inode->i_sb); + if (unlikely(error)) + return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -5363,6 +5924,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto out_mmap_sem; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5373,11 +5942,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. 
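+ * + * For example (illustrative sizes, assuming 4K blocks): growing a + * file from i_size = 5120 to 8192 zeroes the old EOF tail + * [5120, 8192) in the page cache, so stale data in the partial + * block is never exposed.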
*/ - if (!shrink) - inode->i_mtime = inode_set_ctime_current(inode); + if (!shrink) { + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } if (shrink) ext4_fc_track_range(handle, inode, @@ -5395,9 +5970,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5408,6 +5981,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; @@ -5514,6 +6090,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; @@ -5592,8 +6180,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5601,13 +6188,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and leaf blocks need to touch to map @lblocks + * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account @@ -5620,7 +6205,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); @@ -5629,25 +6214,19 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, } /* - * Calculate the total number of credits to reserve to fit - * the modification of a single pages into a single transaction, - * which may include multiple chunks of block allocations. - * - * This could be called via ext4_write_begin() - * - * We need to consider the worse case, when - * one new block per extent. + * Calculate the journal credits for modifying the number of blocks + * in a single extent within one transaction. 'nrblocks' is used only + * for non-extent inodes. For extent type inodes, 'nrblocks' can be + * zero if the exact number of blocks is unknown. 
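+ * + * For example, in data=journal mode a one-extent modification of nrblocks + * data blocks needs the metadata credits returned by + * ext4_meta_trans_blocks(inode, nrblocks, 1) plus nrblocks more credits for + * journalling the data blocks themselves.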
*/ -int ext4_writepage_trans_blocks(struct inode *inode) +int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) { - int bpp = ext4_journal_blocks_per_page(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, bpp); - + ret = ext4_meta_trans_blocks(inode, nrblocks, 1); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret += bpp; + ret += nrblocks; return ret; } @@ -5674,9 +6253,10 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) { put_bh(iloc->bh); - return -EIO; + return err; } ext4_fc_track_inode(handle, inode); @@ -5700,8 +6280,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { @@ -5712,6 +6293,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; @@ -5955,14 +6537,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ - if (val) { - filemap_invalidate_lock(inode->i_mapping); - err = filemap_write_and_wait(inode->i_mapping); - if (err < 0) { - filemap_invalidate_unlock(inode->i_mapping); - return err; - } + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + filemap_invalidate_unlock(inode->i_mapping); + return err; } + /* Before switch the inode journalling mode evict all the page cache. */ + truncate_pagecache(inode, 0); alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); @@ -5982,17 +6564,17 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); + filemap_invalidate_unlock(inode->i_mapping); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + ext4_set_inode_mapping_order(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); - - if (val) - filemap_invalidate_unlock(inode->i_mapping); + filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ @@ -6016,6 +6598,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, return !buffer_mapped(bh); } +static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, + get_block_t get_block) +{ + handle_t *handle; + loff_t size; + unsigned long len; + int credits; + int ret; + + credits = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + folio_lock(folio); + size = i_size_read(inode); + /* Page got truncated from under us? 
*/ + if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { + ret = -EFAULT; + goto out_error; + } + + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); + + ret = ext4_block_write_begin(handle, folio, 0, len, get_block); + if (ret) + goto out_error; + + if (!ext4_should_journal_data(inode)) { + block_commit_write(folio, 0, len); + folio_mark_dirty(folio); + } else { + ret = ext4_journal_folio_buffers(handle, folio, len); + if (ret) + goto out_error; + } + ext4_journal_stop(handle); + folio_wait_stable(folio); + return ret; + +out_error: + folio_unlock(folio); + ext4_journal_stop(handle); + return ret; +} + vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -6027,8 +6658,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - handle_t *handle; - get_block_t *get_block; + get_block_t *get_block = ext4_get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) @@ -6096,46 +6726,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; - else - get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - /* - * Data journalling can't use block_page_mkwrite() because it - * will set_buffer_dirty() before do_journal_get_write_access() - * thus might hit warning messages for dirty metadata buffers. - */ - if (!ext4_should_journal_data(inode)) { - err = block_page_mkwrite(vma, vmf, get_block); - } else { - folio_lock(folio); - size = i_size_read(inode); - /* Page got truncated from under us? 
*/ - if (folio->mapping != mapping || folio_pos(folio) > size) { - ret = VM_FAULT_NOPAGE; - goto out_error; - } - - len = folio_size(folio); - if (folio_pos(folio) + len > size) - len = size - folio_pos(folio); - - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); - if (!err) { - ret = VM_FAULT_SIGBUS; - if (ext4_journal_folio_buffers(handle, folio, len)) - goto out_error; - } else { - folio_unlock(folio); - } - } - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + /* Start journal and allocate blocks */ + err = ext4_block_page_mkwrite(inode, folio, get_block); + if (err == -EAGAIN || + (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_alloc; out_ret: ret = vmf_fs_error(err); @@ -6143,8 +6738,4 @@ out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; -out_error: - folio_unlock(folio); - ext4_journal_stop(handle); - goto out; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b0349f451863..7ce0fc40aec2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,14 +27,16 @@ #include "fsmap.h" #include <trace/events/ext4.h> -typedef void ext4_update_sb_callback(struct ext4_super_block *es, - const void *arg); +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); /* * Superblock modification callback function for changing file system * label */ -static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); @@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) * Superblock modification callback function for changing file system * UUID. 
*/ -static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } @@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, goto out_err; lock_buffer(bh); - func(es, arg); + func(sbi, es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); @@ -142,16 +145,16 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); - if (ext4_has_metadata_csum(sb) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + if (ext4_has_feature_metadata_csum(sb) && + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); goto out_bh; } - func(es, arg); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + func(EXT4_SB(sb), es, arg); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -312,13 +315,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; + struct timespec64 ts1, ts2; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_atime, inode2->i_atime); - swap(inode1->i_mtime, inode2->i_mtime); + + ts1 = inode_get_atime(inode1); + ts2 = inode_get_atime(inode2); + inode_set_atime_to_ts(inode1, ts2); + inode_set_atime_to_ts(inode2, ts1); + + ts1 = inode_get_mtime(inode1); + ts2 = inode_get_mtime(inode2); + inode_set_mtime_to_ts(inode1, ts2); + inode_set_mtime_to_ts(inode2, ts1); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; @@ -342,11 +354,11 @@ void ext4_reset_inode_seed(struct inode *inode) __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* @@ -458,7 +470,7 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { @@ -802,7 +814,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; - if (ext4_forced_shutdown(sbi)) + if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); @@ -810,11 +822,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); @@ -971,7 +983,7 @@ group_add_out: return err; } -int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ext4_fileattr_get(struct dentry *dentry, struct 
file_kattr *fa) { struct inode *inode = d_inode(dentry); struct ext4_inode_info *ei = EXT4_I(inode); @@ -988,7 +1000,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ext4_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 flags = fa->flags; @@ -1141,9 +1153,8 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label */ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); - memset(label, 0, sizeof(label)); lock_buffer(sbi->s_sbh); - strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); + memtostr_pad(label, sbi->s_es->s_volume_name); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) @@ -1197,7 +1208,8 @@ static int ext4_ioctl_setuuid(struct file *filp, * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. */ - if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) + if (((ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; @@ -1221,6 +1233,299 @@ static int ext4_ioctl_setuuid(struct file *filp, return ret; } + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) +{ + struct ext4_tune_sb_params ret; + struct ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = 
le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + es->s_encoding_flags = cpu_to_le16(params->encoding_flags); + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + 
es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&params, in, sizeof(params))) + return -EFAULT; + + if (strnlen(params.mount_opts, sizeof(params.mount_opts)) == + sizeof(params.mount_opts)) + return -E2BIG; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || + (params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask.
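+ * + * For example, a request to set DIR_INDEX on a filesystem that + * already has it enabled is dropped here, so the hash seed set-up + * below runs only when the feature actually changes state.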
+ */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1245,7 +1550,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!inode_owner_or_capable(idmap, inode)) return -EPERM; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; } @@ -1322,7 +1627,6 @@ group_extend_out: case EXT4_IOC_MOVE_EXT: { struct move_extent me; - struct fd donor; int err; if (!(filp->f_mode & FMODE_READ) || !(filp->f_mode & FMODE_WRITE)) return -EBADF; if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; me.moved_len = 0; - donor = fdget(me.donor_fd); - if (!donor.file) + CLASS(fd, donor)(me.donor_fd); + if (fd_empty(donor)) return -EBADF; - if (!(donor.file->f_mode & FMODE_WRITE)) { - err = -EBADF; - goto mext_out; - } - - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online defrag not supported with bigalloc"); - err = -EOPNOTSUPP; - goto mext_out; - } else if (IS_DAX(inode)) { - ext4_msg(sb, KERN_ERR, - "Online defrag not supported with DAX"); - err = -EOPNOTSUPP; - goto mext_out; - } + if (!(fd_file(donor)->f_mode & FMODE_WRITE)) + return -EBADF; err = mnt_want_write_file(filp); if (err) - goto mext_out; + return err; - err = ext4_move_extents(filp, donor.file, me.orig_start, + err = ext4_move_extents(filp, fd_file(donor), me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); if (copy_to_user((struct move_extent __user *)arg, &me,
sizeof(me))) err = -EFAULT; -mext_out: - fdput(donor); return err; } @@ -1503,8 +1791,14 @@ resizefs_out: return 0; } case EXT4_IOC_PRECACHE_EXTENTS: - return ext4_ext_precache(inode); + { + int ret; + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; @@ -1608,6 +1902,11 @@ resizefs_out: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1695,7 +1994,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } #endif -static void set_overhead(struct ext4_super_block *es, const void *arg) +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } @@ -1704,7 +2004,7 @@ int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sb_rdonly(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c new file mode 100644 index 000000000000..a9416b20ff64 --- /dev/null +++ b/fs/ext4/mballoc-test.c @@ -0,0 +1,999 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 multiblock allocation. + */ + +#include <kunit/test.h> +#include <kunit/static_stub.h> +#include <linux/random.h> + +#include "ext4.h" + +struct mbt_grp_ctx { + struct buffer_head bitmap_bh; + /* desc and gd_bh are just placeholders for now */ + struct ext4_group_desc desc; + struct buffer_head gd_bh; +}; + +struct mbt_ctx { + struct mbt_grp_ctx *grp_ctx; +}; + +struct mbt_ext4_super_block { + struct ext4_super_block es; + struct ext4_sb_info sbi; + struct mbt_ctx mbt_ctx; +}; + +#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi)) +#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) +#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) + +static struct inode *mbt_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (!ei) + return NULL; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); + + return &ei->vfs_inode; +} + +static void mbt_free_inode(struct inode *inode) +{ + kfree(EXT4_I(inode)); +} + +static const struct super_operations mbt_sops = { + .alloc_inode = mbt_alloc_inode, + .free_inode = mbt_free_inode, +}; + +static void mbt_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type mbt_fs_type = { + .name = "mballoc test", + .kill_sb = mbt_kill_sb, +}; + +static int mbt_mb_init(struct super_block *sb) +{ + ext4_fsblk_t block; + int ret; + + /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL); + if (sb->s_bdev == NULL) + return -ENOMEM; + + sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL); + if (sb->s_bdev->bd_queue == NULL) { + kfree(sb->s_bdev); + return -ENOMEM; + } + + /* + * needed by
ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache = + * new_inode(sb); + */ + INIT_LIST_HEAD(&sb->s_inodes); + sb->s_op = &mbt_sops; + + ret = ext4_mb_init(sb); + if (ret != 0) + goto err_out; + + block = ext4_count_free_clusters(sb); + ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block, + GFP_KERNEL); + if (ret != 0) + goto err_mb_release; + + ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (ret != 0) + goto err_freeclusters; + + return 0; + +err_freeclusters: + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); +err_mb_release: + ext4_mb_release(sb); +err_out: + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); + return ret; +} + +static void mbt_mb_release(struct super_block *sb) +{ + percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter); + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); + ext4_mb_release(sb); + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); +} + +static int mbt_set(struct super_block *sb, void *data) +{ + return 0; +} + +static struct super_block *mbt_ext4_alloc_super_block(void) +{ + struct mbt_ext4_super_block *fsb; + struct super_block *sb; + struct ext4_sb_info *sbi; + + fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + if (fsb == NULL) + return NULL; + + sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL); + if (IS_ERR(sb)) + goto out; + + sbi = &fsb->sbi; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) + goto out_deactivate; + + bgl_lock_init(sbi->s_blockgroup_lock); + + sbi->s_es = &fsb->es; + sbi->s_sb = sb; + sb->s_fs_info = sbi; + + up_write(&sb->s_umount); + return sb; + +out_deactivate: + deactivate_locked_super(sb); +out: + kfree(fsb); + return NULL; +} + +static void mbt_ext4_free_super_block(struct super_block *sb) +{ + struct mbt_ext4_super_block *fsb = MBT_SB(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + kfree(sbi->s_blockgroup_lock); + deactivate_super(sb); + kfree(fsb); +} + +struct mbt_ext4_block_layout { + unsigned char blocksize_bits; + unsigned int cluster_bits; + uint32_t blocks_per_group; + ext4_group_t group_count; + uint16_t desc_size; +}; + +static void mbt_init_sb_layout(struct super_block *sb, + struct mbt_ext4_block_layout *layout) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sb->s_blocksize = 1UL << layout->blocksize_bits; + sb->s_blocksize_bits = layout->blocksize_bits; + + sbi->s_groups_count = layout->group_count; + sbi->s_blocks_per_group = layout->blocks_per_group; + sbi->s_cluster_bits = layout->cluster_bits; + sbi->s_cluster_ratio = 1U << layout->cluster_bits; + sbi->s_clusters_per_group = layout->blocks_per_group >> + layout->cluster_bits; + sbi->s_desc_size = layout->desc_size; + sbi->s_desc_per_block_bits = + sb->s_blocksize_bits - (fls(layout->desc_size) - 1); + sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits; + + es->s_first_data_block = cpu_to_le32(0); + es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * + layout->group_count); +} + +static int mbt_grp_ctx_init(struct super_block *sb, + struct mbt_grp_ctx *grp_ctx) +{ + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); + if (grp_ctx->bitmap_bh.b_data == NULL) + return -ENOMEM; + mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max); + ext4_free_group_clusters_set(sb, &grp_ctx->desc, max); + + return 0; +} + +static void mbt_grp_ctx_release(struct mbt_grp_ctx 
*grp_ctx) +{ + kfree(grp_ctx->bitmap_bh.b_data); + grp_ctx->bitmap_bh.b_data = NULL; +} + +static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, + unsigned int start, unsigned int len) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); +} + +static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + return grp_ctx->bitmap_bh.b_data; +} + +/* called after mbt_init_sb_layout */ +static int mbt_ctx_init(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx), + GFP_KERNEL); + if (ctx->grp_ctx == NULL) + return -ENOMEM; + + for (i = 0; i < ngroups; i++) + if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i])) + goto out; + + /* + * The first data block (first cluster in the first group) is used by + * metadata; mark it used to avoid allocating a data block at the first + * block, which would fail the ext4_sb_block_valid check. + */ + mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc, + EXT4_CLUSTERS_PER_GROUP(sb) - 1); + + return 0; +out: + while (i-- > 0) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); + return -ENOMEM; +} + +static void mbt_ctx_release(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); +} + +static struct buffer_head * +ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + /* paired with brelse from caller of ext4_read_block_bitmap_nowait */ + get_bh(&grp_ctx->bitmap_bh); + return &grp_ctx->bitmap_bh; +} + +static int ext4_wait_block_bitmap_stub(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + /* + * real ext4_wait_block_bitmap will set these flags and + * functions like ext4_mb_init_cache will verify the flags.
+ */ + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + set_buffer_verified(bh); + return 0; +} + +static struct ext4_group_desc * +ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group, + struct buffer_head **bh) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + if (bh != NULL) + *bh = &grp_ctx->gd_bh; + + return &grp_ctx->desc; +} + +static int +ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh; + + if (state) + mb_set_bits(bitmap_bh->b_data, blkoff, len); + else + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + + return 0; +} + +#define TEST_GOAL_GROUP 1 +static int mbt_kunit_init(struct kunit *test) +{ + struct mbt_ext4_block_layout *layout = + (struct mbt_ext4_block_layout *)(test->param_value); + struct super_block *sb; + int ret; + + sb = mbt_ext4_alloc_super_block(); + if (sb == NULL) + return -ENOMEM; + + mbt_init_sb_layout(sb, layout); + + ret = mbt_ctx_init(sb); + if (ret != 0) { + mbt_ext4_free_super_block(sb); + return ret; + } + + test->priv = sb; + kunit_activate_static_stub(test, + ext4_read_block_bitmap_nowait, + ext4_read_block_bitmap_nowait_stub); + kunit_activate_static_stub(test, + ext4_wait_block_bitmap, + ext4_wait_block_bitmap_stub); + kunit_activate_static_stub(test, + ext4_get_group_desc, + ext4_get_group_desc_stub); + kunit_activate_static_stub(test, + ext4_mb_mark_context, + ext4_mb_mark_context_stub); + + /* stub function will be called in mbt_mb_init->ext4_mb_init */ + if (mbt_mb_init(sb) != 0) { + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); + return -ENOMEM; + } + + return 0; +} + +static void mbt_kunit_exit(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + + mbt_mb_release(sb); + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); +} + +static void test_new_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_request ar; + ext4_group_t i, goal_group = TEST_GOAL_GROUP; + int err = 0; + ext4_fsblk_t found; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + + inode->i_sb = sb; + ar.inode = inode; + + /* get block at goal */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, + "failed to alloc block at goal, expected %llu found %llu", + ar.goal, found); + + /* get block after goal in goal group */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, + "failed to alloc block after goal in goal group, expected %llu found %llu", + ar.goal + EXT4_C2B(sbi, 1), found); + + /* get block after goal group */ + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, goal_group + 1), found, + "failed to alloc block after goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, goal_group + 1), found); + + /* get block before goal group */ + for (i = goal_group; i < ext4_get_groups_count(sb);
i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, + "failed to alloc block before goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found); + + /* no block available, fail to allocate block */ + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_NE_MSG(test, err, 0, + "unexpectedly get block when no block is available"); +} + +#define TEST_RANGE_COUNT 8 + +struct test_range { + ext4_grpblk_t start; + ext4_grpblk_t len; +}; + +static void +mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges, + int count) +{ + ext4_grpblk_t start, len, max; + int i; + + max = EXT4_CLUSTERS_PER_GROUP(sb) / count; + for (i = 0; i < count; i++) { + start = get_random_u32() % max; + len = get_random_u32() % max; + len = min(len, max - start); + + ranges[i].start = start + i * max; + ranges[i].len = len; + } +} + +static void +validate_free_blocks_simple(struct kunit *test, struct super_block *sb, + ext4_group_t goal_group, ext4_grpblk_t start, + ext4_grpblk_t len) +{ + void *bitmap; + ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + + for (i = 0; i < ext4_get_groups_count(sb); i++) { + if (i == goal_group) + continue; + + bitmap = mbt_ctx_bitmap(sb, i); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ_MSG(test, bit, max, + "free block on unexpected group %d", i); + } + + bitmap = mbt_ctx_bitmap(sb, goal_group); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, bit, start); + + bit = mb_find_next_bit(bitmap, max, bit + 1); + KUNIT_ASSERT_EQ(test, bit, start + len); +} + +static void +test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, + ext4_grpblk_t start, ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode; + ext4_fsblk_t block; + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + if (len == 0) + return; + + block = ext4_group_first_block_no(sb, goal_group) + + EXT4_C2B(sbi, start); + ext4_free_blocks_simple(inode, block, len); + validate_free_blocks_simple(test, sb, goal_group, start, len); + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); +} + +static void test_free_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + struct test_range ranges[TEST_RANGE_COUNT]; + + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, max); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_free_blocks_simple_range(test, TEST_GOAL_GROUP, + ranges[i].start, ranges[i].len); +} + +static void +test_mark_diskspace_used_range(struct kunit *test, + struct ext4_allocation_context *ac, + ext4_grpblk_t start, + ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + int ret; + void *bitmap; + ext4_grpblk_t i, max; + + /* ext4_mb_mark_diskspace_used will BUG if len is 0 */ + if (len == 0) + return; + + ac->ac_b_ex.fe_group
= TEST_GOAL_GROUP; + ac->ac_b_ex.fe_start = start; + ac->ac_b_ex.fe_len = len; + + bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); + memset(bitmap, 0, sb->s_blocksize); + ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + KUNIT_ASSERT_EQ(test, ret, 0); + + max = EXT4_CLUSTERS_PER_GROUP(sb); + i = mb_find_next_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, i, start); + i = mb_find_next_zero_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, i, start + len); + i = mb_find_next_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, max, i); +} + +static void test_mark_diskspace_used(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_context ac; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + ac.ac_status = AC_STATUS_FOUND; + ac.ac_sb = sb; + ac.ac_inode = inode; + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mark_diskspace_used_range(test, &ac, ranges[i].start, + ranges[i].len); +} + +static void mbt_generate_buddy(struct super_block *sb, void *buddy, + void *bitmap, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + uint32_t order, off; + void *bb, *bb_h; + int max; + + memset(buddy, 0xff, sb->s_blocksize); + memset(grp, 0, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)])); + + bb = bitmap; + max = EXT4_CLUSTERS_PER_GROUP(sb); + bb_h = buddy + sbi->s_mb_offsets[1]; + + off = mb_find_next_zero_bit(bb, max, 0); + grp->bb_first_free = off; + while (off < max) { + grp->bb_counters[0]++; + grp->bb_free++; + + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + grp->bb_free++; + grp->bb_counters[0]--; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[1]++; + grp->bb_largest_free_order = 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + + for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { + bb = buddy + sbi->s_mb_offsets[order]; + bb_h = buddy + sbi->s_mb_offsets[order + 1]; + max = max >> 1; + off = mb_find_next_zero_bit(bb, max, 0); + + while (off < max) { + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + mb_set_bits(bb, off, 2); + grp->bb_counters[order] -= 2; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[order + 1]++; + grp->bb_largest_free_order = order + 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + } + + max = EXT4_CLUSTERS_PER_GROUP(sb); + off = mb_find_next_zero_bit(bitmap, max, 0); + while (off < max) { + grp->bb_fragments++; + + off = mb_find_next_bit(bitmap, max, off + 1); + if (off + 1 >= max) + break; + + off = mb_find_next_zero_bit(bitmap, max, off + 1); + } +} + +static void +mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1, + struct ext4_group_info *grp2) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + KUNIT_ASSERT_EQ(test, grp1->bb_first_free, + grp2->bb_first_free); + KUNIT_ASSERT_EQ(test, grp1->bb_fragments, + grp2->bb_fragments); + KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free); + KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order, + grp2->bb_largest_free_order); + + for (i = 1; i < MB_NUM_ORDERS(sb); i++) { + KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i], + grp2->bb_counters[i], + "bb_counters[%d] diffs, expected %d, generated %d", + i, grp1->bb_counters[i], + grp2->bb_counters[i]); + } +} + +static void +do_test_generate_buddy(struct kunit *test, struct super_block *sb, void 
*bitmap, + void *mbt_buddy, struct ext4_group_info *mbt_grp, + void *ext4_buddy, struct ext4_group_info *ext4_grp) +{ + int i; + + mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp); + + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + ext4_grp->bb_counters[i] = 0; + /* needed by validation in ext4_mb_generate_buddy */ + ext4_grp->bb_free = mbt_grp->bb_free; + memset(ext4_buddy, 0xff, sb->s_blocksize); + ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_grp); + + KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, mbt_grp, ext4_grp); +} + +static void test_mb_generate_buddy(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *expected_bb, *generate_bb; + struct ext4_group_info *expected_grp, *generate_grp; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb); + generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb); + expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp); + generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP); + KUNIT_ASSERT_NOT_NULL(test, generate_grp); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) { + mb_set_bits(bitmap, ranges[i].start, ranges[i].len); + do_test_generate_buddy(test, sb, bitmap, expected_bb, + expected_grp, generate_bb, generate_grp); + } +} + +static void +test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int i; + + /* mb_mark_used only accepts non-zero len */ + if (len == 0) + return; + + ex.fe_start = start; + ex.fe_len = len; + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + mb_set_bits(bitmap, start, len); + /* bypass bb_free validation in ext4_mb_generate_buddy */ + grp->bb_free -= len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); +} + +static void test_mb_mark_used(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); + + ret =
ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_mark_used_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +static void +test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + /* mb_free_blocks will WARN if len is 0 */ + if (len == 0) + return; + + ext4_lock_group(sb, e4b->bd_group); + mb_free_blocks(NULL, e4b, start, len); + ext4_unlock_group(sb, e4b->bd_group); + + mb_clear_bits(bitmap, start, len); + /* bypass bb_free validation in ext4_mb_generate_buddy */ + grp->bb_free += len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); + +} + +static void test_mb_free_blocks(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + struct ext4_free_extent ex; + int ret; + int i; + struct test_range ranges[TEST_RANGE_COUNT]; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_start = 0; + ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb); + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + grp->bb_free = 0; + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + memset(bitmap, 0xff, sb->s_blocksize); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_free_blocks_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +#define COUNT_FOR_ESTIMATE 100000 +static void test_mb_mark_used_cost(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i, j; + unsigned long start, end, all = 0; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_group = TEST_GOAL_GROUP; + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + start = jiffies; + for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0) + continue; + + ex.fe_start = ranges[i].start; + ex.fe_len = ranges[i].len; + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + end = jiffies; + all += (end - start); + + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_free_blocks(NULL, &e4b, ranges[i].start, + ranges[i].len); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + } + + kunit_info(test, "cost %lu jiffies\n", all); + ext4_mb_unload_buddy(&e4b); +} + +static const struct mbt_ext4_block_layout mbt_test_layouts[] = { + { + .blocksize_bits = 10, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 12, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 16, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, +}; + +static void mbt_show_layout(const struct mbt_ext4_block_layout *layout, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d " + "blocks_per_group=%d group_count=%d desc_size=%d\n", + layout->blocksize_bits, layout->cluster_bits, + layout->blocks_per_group, layout->group_count, + layout->desc_size); +} +KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); + +static struct kunit_case mbt_test_cases[] = { + KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, + { .speed = KUNIT_SPEED_SLOW }), + {} +}; + +static struct kunit_suite mbt_test_suite = { + .name = "ext4_mballoc_test", + .init = mbt_kunit_init, + .exit = mbt_kunit_exit, + .test_cases = mbt_test_cases, +}; + +kunit_test_suites(&mbt_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21b903fe546e..56d50fd3310b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -16,7 +16,9 @@ #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> +#include <linux/freezer.h> #include <trace/events/ext4.h> +#include <kunit/static_stub.h> /* * MUSTDO: @@ -96,14 +98,14 @@ * block bitmap and buddy information. The information are stored in the * inode as: * - * { page } + * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we - * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / - * blocksize) blocks. So it can have information regarding groups_per_page - * which is blocks_per_page/2 + * take up 2 blocks. A folio can contain blocks_per_folio (folio_size / + * blocksize) blocks. So it can have information regarding groups_per_folio - * which is blocks_per_folio/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted.
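To make the bitmap/buddy layout described in the comment above concrete: each group occupies two consecutive blocks of the buddy-cache inode (block 2*g for group g's bitmap, block 2*g+1 for its buddy), so a folio holding blocks_per_folio blocks covers blocks_per_folio/2 groups. The following is a minimal standalone userspace sketch of that mapping, not kernel code; all names here are illustrative assumptions.

    #include <stdio.h>

    /*
     * Sketch of the buddy-cache layout: group g's bitmap lives at block
     * 2*g and its buddy at 2*g + 1; a folio of blocks_per_folio blocks
     * therefore covers blocks_per_folio / 2 groups.
     */
    static void locate_group(unsigned int group, unsigned int blocks_per_folio)
    {
        unsigned int bitmap_blk = 2 * group;     /* bitmap block index */
        unsigned int buddy_blk = 2 * group + 1;  /* buddy block index  */

        printf("group %u: bitmap folio %u slot %u, buddy folio %u slot %u\n",
               group,
               bitmap_blk / blocks_per_folio, bitmap_blk % blocks_per_folio,
               buddy_blk / blocks_per_folio, buddy_blk % blocks_per_folio);
    }

    int main(void)
    {
        unsigned int group;

        /* e.g. 4 KiB folios with 1 KiB blocks: 4 blocks, i.e. 2 groups, per folio */
        for (group = 0; group < 4; group++)
            locate_group(group, 4);
        return 0;
    }

With 4 blocks per folio, groups 0 and 1 share folio 0 and groups 2 and 3 share folio 1, matching the groups_per_folio = blocks_per_folio/2 relation stated in the comment.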
@@ -130,25 +132,30 @@ * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * - * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) + * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * - * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) + * Locking: Writers use xa_lock, readers use rcu_read_lock. * - * This is an array of lists where the index in the array represents the + * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of - * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total - * number of buddy bitmap orders possible) number of lists. Group-infos are - * placed in appropriate lists. + * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total + * number of buddy bitmap orders possible) number of xarrays. Group-infos are + * placed in appropriate xarrays. * - * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) + * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * - * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) + * Locking: Writers use xa_lock, readers use rcu_read_lock. * - * This is an array of lists where in the i-th list there are groups with + * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. - * Note that we don't bother with a special list for completely empty groups - * so we only have MB_NUM_ORDERS(sb) lists. + * Note that we don't bother with a special xarray for completely empty + * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed + * in appropriate xarrays. + * + * In each xarray, the index is the block group number and the value is the + * block group information; a non-empty value indicates that the block group + * is present in that xarray. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for @@ -185,7 +192,7 @@ * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req - * /sys/fs/ext4/<partition>/mb_linear_limit + * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -207,7 +214,7 @@ * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for - * rotational devices that may result in higher seek times. "mb_linear_limit" + * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups.
For * non rotational devices, this value defaults to 0 and for rotational devices @@ -416,12 +423,10 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); -static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, enum criteria cr); +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, @@ -564,14 +569,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -677,7 +682,25 @@ do { \ } \ } while (0) -static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, +/* + * Perform buddy integrity check with the following steps: + * + * 1. Top-down validation (from highest order down to order 1, excluding order-0 bitmap): + * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block), + * at most one of the two corresponding lower-order bits may be clear (free). + * + * 2. Order-0 (bitmap) validation, performed on bit pairs: + * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits + * must not be free (0). + * - If both bits in a pair are clear (0, free), then exactly one of the corresponding + * higher-order bits must be free (0). + * + * 3. Preallocation (pa) list validation: + * For each preallocated block (pa) in the group: + * - Verify that pa_pstart falls within the bounds of this block group. + * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1). 
+ */ +static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; @@ -696,7 +719,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) - return 0; + return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); @@ -718,15 +741,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 1 */ - MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); - MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); - - for (j = 0; j < (1 << order); j++) { - k = (i * (1 << order)) + j; - MB_CHECK_ASSERT( - !mb_test_bit(k, e4b->bd_bitmap)); - } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); @@ -742,15 +756,21 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, fragments++; fstart = i; } - continue; + } else { + fstart = -1; } - fstart = -1; - /* check used bits only */ - for (j = 0; j < e4b->bd_blkbits + 1; j++) { - buddy2 = mb_find_buddy(e4b, j, &max2); - k = i >> j; - MB_CHECK_ASSERT(k < max2); - MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + if (!(i & 1)) { + int in_use, zero_bit_count = 0; + + in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy); + for (j = 1; j < e4b->bd_blkbits + 2; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + if (!mb_test_bit(k, buddy2)) + zero_bit_count++; + } + MB_CHECK_ASSERT(zero_bit_count == !in_use); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); @@ -758,17 +778,18 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) - return NULL; + return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + if (!pa->pa_len) + continue; ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } - return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ @@ -832,6 +853,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) return 0; if (order == MB_NUM_ORDERS(sb)) order--; + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) + order = MB_NUM_ORDERS(sb) - 1; return order; } @@ -840,142 +863,175 @@ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int new_order; + int new, old; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; - new_order = mb_avg_fragment_size_order(sb, - grp->bb_free / grp->bb_fragments); - if (new_order == grp->bb_avg_fragment_size_order) + old = grp->bb_avg_fragment_size_order; + new = grp->bb_fragments == 0 ? -1 : + mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); + if (new == old) return; - if (grp->bb_avg_fragment_size_order != -1) { - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); - list_del(&grp->bb_avg_fragment_size_node); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); + if (old >= 0) + xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); + + grp->bb_avg_fragment_size_order = new; + if (new >= 0) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. 
+ * Although allocation for insertion may fail, it's not fatal + * as we have linear traversal to fall back on. + */ + int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", + grp->bb_group, new, err); } - grp->bb_avg_fragment_size_order = new_order; - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); - list_add_tail(&grp->bb_avg_fragment_size_node, - &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); +} + +static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, + struct xarray *xa, + ext4_group_t start, ext4_group_t end) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + enum criteria cr = ac->ac_criteria; + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned long group = start; + struct ext4_group_info *grp; + + if (WARN_ON_ONCE(end > ngroups || start >= end)) + return 0; + + xa_for_each_range(xa, group, grp, start, end - 1) { + int err; + + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); + + err = ext4_mb_scan_group(ac, grp->bb_group); + if (err || ac->ac_status != AC_STATUS_CONTINUE) + return err; + + cond_resched(); + } + + return 0; +} + +/* + * Find a suitable group of given order from the largest free orders xarray. + */ +static inline int +ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) +{ + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; + + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update.
*/ -static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, + ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter, *grp; int i; + int ret = 0; + ext4_group_t start, end; - if (ac->ac_status == AC_STATUS_FOUND) - return; - - if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) - atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - - grp = NULL; + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { - if (list_empty(&sbi->s_mb_largest_free_orders[i])) - continue; - read_lock(&sbi->s_mb_largest_free_orders_locks[i]); - if (list_empty(&sbi->s_mb_largest_free_orders[i])) { - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - continue; - } - grp = NULL; - list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], - bb_largest_free_order_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - grp = iter; - break; - } - } - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - if (grp) - break; + ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; } - - if (!grp) { - /* Increment cr and search again */ - *new_cr = CR_GOAL_LEN_FAST; - } else { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; + if (start) { + end = start; + start = 0; + goto wrap_around; } + + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + + /* Increment cr and search again if no group is found */ + ac->ac_criteria = CR_GOAL_LEN_FAST; + return ret; } /* - * Find a suitable group of given order from the average fragments list. + * Find a suitable group of given order from the average fragments xarray. */ -static struct ext4_group_info * -ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) +static int +ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) { - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; - rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; - struct ext4_group_info *grp = NULL, *iter; - enum criteria cr = ac->ac_criteria; + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; - if (list_empty(frag_list)) - return NULL; - read_lock(frag_list_lock); - if (list_empty(frag_list)) { - read_unlock(frag_list_lock); - return NULL; - } - list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { - grp = iter; - break; - } - } - read_unlock(frag_list_lock); - return grp; + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. 
*/ -static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, + ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL; - int i; - - if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { - if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); + int i, ret = 0; + ext4_group_t start, end; + + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: + i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); + for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; } - - for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); - i < MB_NUM_ORDERS(ac->ac_sb); i++) { - grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) - break; + if (start) { + end = start; + start = 0; + goto wrap_around; } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - } else { - *new_cr = CR_BEST_AVAIL_LEN; - } + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + /* + * CR_BEST_AVAIL_LEN works based on the concept that we have + * a larger normalized goal len request which can be trimmed to + * a smaller goal len such that it can still satisfy original + * request len. However, allocation request for non-regular + * files never gets normalized. + * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). + */ + if (ac->ac_flags & EXT4_MB_HINT_DATA) + ac->ac_criteria = CR_BEST_AVAIL_LEN; + else + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; } /* @@ -987,18 +1043,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ -static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, + ext4_group_t group) { + int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL; int i, order, min_order; unsigned long num_stripe_clusters = 0; - - if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { - if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); - } + ext4_group_t start, end; /* * mb_avg_fragment_size_order() returns order in a way that makes @@ -1007,6 +1059,8 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * goal length. 
*/ order = fls(ac->ac_g_ex.fe_len) - 1; + if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) + order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; @@ -1028,6 +1082,9 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* @@ -1050,19 +1107,24 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); - grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) - break; + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; } - - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - } else { - /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; + if (start) { + end = start; + start = 0; + goto wrap_around; } + + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1077,65 +1139,91 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) } /* - * Return next linear group for allocation. If linear traversal should not be - * performed, this function just returns the same group + * next linear group for allocation. */ -static int -next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) { - if (!should_optimize_scan(ac)) - goto inc_and_return; - - if (ac->ac_groups_linear_remaining) { - ac->ac_groups_linear_remaining--; - goto inc_and_return; - } - - return group; -inc_and_return: /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ - return group + 1 >= ngroups ? 0 : group + 1; + *group = *group + 1 >= ngroups ? 0 : *group + 1; } -/* - * ext4_mb_choose_next_group: choose next group for allocation. - * - * @ac Allocation Context - * @new_cr This is an output parameter. If the there is no good group - * available at current CR level, this field is updated to indicate - * the new cr level that should be used. - * @group This is an input / output parameter. As an input it indicates the - * next group that the allocator intends to use for allocation. As - * output, this field indicates the next group that should be used as - * determined by the optimization functions. 
- * @ngroups Total number of groups - */ -static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, + ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) { - *new_cr = ac->ac_criteria; + int ret, i; + enum criteria cr = ac->ac_criteria; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group = *start; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { - *group = next_linear_group(ac, *group, ngroups); - return; + for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { + ret = ext4_mb_scan_group(ac, group); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + cond_resched(); } - if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); - } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); - } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); - } else { + *start = group; + if (count == ngroups) + ac->ac_criteria++; + + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); + + return 0; +} + +static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) +{ + int ret = 0; + ext4_group_t start; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); + + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + /* start searching for the right group from the goal value specified */ + start = ac->ac_g_ex.fe_group; + ac->ac_prefetch_grp = start; + ac->ac_prefetch_nr = 0; + + if (!should_optimize_scan(ac)) + return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); + + /* + * Optimized scanning can return non-adjacent groups which can cause + * seek overhead for rotational disks. So try a few linear groups + * before trying optimized scan. + */ + if (sbi->s_mb_max_linear_groups) + ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, + sbi->s_mb_max_linear_groups); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + + switch (ac->ac_criteria) { + case CR_POWER2_ALIGNED: + return ext4_mb_scan_groups_p2_aligned(ac, start); + case CR_GOAL_LEN_FAST: + return ext4_mb_scan_groups_goal_fast(ac, start); + case CR_BEST_AVAIL_LEN: + return ext4_mb_scan_groups_best_avail(ac, start); + default: /* - * TODO: For CR=2, we can arrange groups in an rb tree sorted by - * bb_free. But until that happens, we should never come here. + * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an + * rb tree sorted by bb_free. But until that happens, we should + * never come here. */ WARN_ON(1); } + + return 0; } /* @@ -1146,33 +1234,35 @@ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int i; + int new, old = grp->bb_largest_free_order; - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) - if (grp->bb_counters[i] > 0) + for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) + if (grp->bb_counters[new] > 0) break; + /* No need to move between order lists?
*/ - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || - i == grp->bb_largest_free_order) { - grp->bb_largest_free_order = i; + if (new == old) return; - } - if (grp->bb_largest_free_order >= 0) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); - list_del_init(&grp->bb_largest_free_order_node); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + if (old >= 0) { + struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; + + if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) + xa_erase(xa, grp->bb_group); } - grp->bb_largest_free_order = i; - if (grp->bb_largest_free_order >= 0 && grp->bb_free) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); - list_add_tail(&grp->bb_largest_free_order_node, - &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + + grp->bb_largest_free_order = new; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. + * Although allocation for insertion may fail, it's not fatal + * as we have linear traversal to fall back on. + */ + int err = xa_insert(&sbi->s_mb_largest_free_orders[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", + grp->bb_group, new, err); } } @@ -1232,32 +1322,49 @@ void ext4_mb_generate_buddy(struct super_block *sb, atomic64_add(period, &sbi->s_mb_generation_time); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) + mb_set_bits(buddy, 0, count); + + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * - * { page } + * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. - * So for each group we take up 2 blocks. A page can - * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. - * So it can have information regarding groups_per_page which - * is blocks_per_page/2 + * So for each group we take up 2 blocks. A folio can + * contain blocks_per_folio (folio_size / blocksize) blocks. + * So it can have information regarding groups_per_folio which + * is blocks_per_folio/2 * * Locking note: This routine takes the block group lock of all groups - * for this page; do not hold this lock when calling this routine! + * for this folio; do not hold this lock when calling this routine!
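The list-to-xarray migration in the hunk above follows one simple rule: the block-group number is the index, so membership in an order bucket is just presence at that index, and lookups need no separate lock. A condensed kernel-style sketch of the move between buckets (names are illustrative; this mirrors the pattern, it is not the mballoc code itself):

#include <linux/xarray.h>
#include <linux/printk.h>

/* One xarray bucket per order; a group lives in at most one bucket. */
static void move_group_bucket(struct xarray *buckets, int old_order,
                              int new_order, unsigned long group, void *grp)
{
    if (old_order >= 0)
        xa_erase(&buckets[old_order], group);
    /*
     * GFP_ATOMIC because the caller holds the group spinlock; a failed
     * insert only costs an optimization, since the linear scan still
     * finds the group.
     */
    if (new_order >= 0 &&
        xa_insert(&buckets[new_order], group, grp, GFP_ATOMIC))
        pr_debug("bucket insert failed for group %lu\n", group);
}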
*/ - -static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) +static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; - int blocksize; - int blocks_per_page; - int groups_per_page; + unsigned int blocksize; + int blocks_per_folio; + int groups_per_folio; int err = 0; int i; ext4_group_t first_group, group; @@ -1270,31 +1377,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - inode = page->mapping->host; + inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); - blocks_per_page = PAGE_SIZE / blocksize; + blocks_per_folio = folio_size(folio) / blocksize; + WARN_ON_ONCE(!blocks_per_folio); + groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2); - mb_debug(sb, "init page %lu\n", page->index); - - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; + mb_debug(sb, "init folio %lu\n", folio->index); /* allocate buffer_heads to read bitmaps */ - if (groups_per_page > 1) { - i = sizeof(struct buffer_head *) * groups_per_page; + if (groups_per_folio > 1) { + i = sizeof(struct buffer_head *) * groups_per_folio; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; - first_group = page->index * blocks_per_page / 2; - - /* read all groups the page covers into the cache */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + /* read all groups the folio covers into the cache */ + first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2; + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { if (group >= ngroups) break; @@ -1302,12 +1406,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if (!grinfo) continue; /* - * If page is uptodate then we came here after online resize + * If folio is uptodate then we came here after online resize * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, + * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. 
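Since the layout comment above is the whole contract, the arithmetic is worth pinning down: a folio holds folio_size/blocksize cache blocks and every group needs two of them, so groups_per_folio is the rounded-up half. A quick user-space check with illustrative sizes:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned int folio_size = 4096;  /* illustrative; folios may be larger */

    for (unsigned int blocksize = 1024; blocksize <= 4096; blocksize *= 2) {
        unsigned int blocks_per_folio = folio_size / blocksize;
        /* two cache blocks per group: one bitmap + one buddy */
        unsigned int groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2);

        printf("bs=%u: %u block(s), %u group(s) per folio\n",
               blocksize, blocks_per_folio, groups_per_folio);
    }
    return 0;   /* bs=1024: 4,2  bs=2048: 2,1  bs=4096: 1,1 */
}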
*/ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + if (folio_test_uptodate(folio) && + !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } @@ -1321,7 +1426,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } /* wait for I/O completion */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { int err2; if (!bh[i]) @@ -1331,8 +1436,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) err = err2; } - first_block = page->index * blocks_per_page; - for (i = 0; i < blocks_per_page; i++) { + first_block = EXT4_PG_TO_LBLK(inode, folio->index); + for (i = 0; i < blocks_per_folio; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; @@ -1352,24 +1457,24 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * above * */ - data = page_address(page) + (i * blocksize); + data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ + grinfo = ext4_get_group_info(sb, group); + if (!grinfo) { + err = -EFSCORRUPTED; + goto out; + } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(sb, "put buddy for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); - grinfo = ext4_get_group_info(sb, group); - if (!grinfo) { - err = -EFSCORRUPTED; - goto out; - } grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * @@ -1386,8 +1491,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ @@ -1396,7 +1501,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); - ext4_mb_generate_from_freelist(sb, data, group); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -1405,11 +1510,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) incore = data; } } - SetPageUptodate(page); + folio_mark_uptodate(folio); out: if (bh) { - for (i = 0; i < groups_per_page; i++) + for (i = 0; i < groups_per_folio; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -1418,68 +1523,71 @@ out: } /* - * Lock the buddy and bitmap pages. This make sure other parallel init_group - * on the same buddy page doesn't happen whild holding the buddy page lock. - * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * Lock the buddy and bitmap folios. This makes sure other parallel init_group + * on the same buddy folio doesn't happen while holding the buddy folio lock. + * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap + * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0. 
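The group = (first_block + i) >> 1 and (first_block + i) & 1 tests above decode the interleaved cache layout: block 2g is group g's bitmap and block 2g+1 is its buddy. In miniature (user-space, illustrative):

#include <stdio.h>

int main(void)
{
    for (unsigned int block = 0; block < 6; block++)
        printf("cache block %u -> group %u %s\n", block, block >> 1,
               (block & 1) ? "buddy" : "bitmap");
    return 0;   /* 0:bitmap0 1:buddy0 2:bitmap1 3:buddy1 ... */
}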
*/ -static int ext4_mb_get_buddy_page_lock(struct super_block *sb, +static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; - int block, pnum, poff; - int blocks_per_page; - struct page *page; + int block, pnum; + struct folio *folio; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; - blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + pnum = EXT4_LBLK_TO_PG(inode, block); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); - if (blocks_per_page >= 2) { - /* buddy and bitmap are on the same page */ + block++; + pnum = EXT4_LBLK_TO_PG(inode, block); + if (folio_contains(folio, pnum)) { + /* buddy and bitmap are on the same folio */ return 0; } - block++; - pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; + /* we need another folio for the buddy */ + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); + e4b->bd_buddy_folio = folio; return 0; } -static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) { + folio_unlock(e4b->bd_bitmap_folio); + folio_put(e4b->bd_bitmap_folio); } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - put_page(e4b->bd_buddy_page); + if (e4b->bd_buddy_folio) { + folio_unlock(e4b->bd_buddy_folio); + folio_put(e4b->bd_buddy_folio); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the - * block group lock of all groups for this page; do not hold the BG lock when + * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack @@ -1488,7 +1596,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; - struct page *page; + struct folio *folio; int ret = 0; might_sleep(); @@ -1499,14 +1607,14 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) /* * This ensures that we don't reinit the buddy cache - * page which map to the group from which we are already + * folio which maps to the group from which we are already * allocating.
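The page-to-folio conversion here (and in ext4_mb_load_buddy_gfp() further down) rests on one API change: __filemap_get_folio() returns ERR_PTR() on failure, never NULL, and the FGP flags subsume the old find_or_create_page(). The lockless-lookup-then-locked-create shape, reduced to a kernel-style sketch (condensed and renamed; not the actual function):

#include <linux/pagemap.h>

static struct folio *buddy_cache_folio(struct address_space *mapping,
                                       pgoff_t index, gfp_t gfp)
{
    /* fast path: take a reference only, no folio lock */
    struct folio *folio = __filemap_get_folio(mapping, index,
                                              FGP_ACCESSED, 0);

    if (!IS_ERR(folio) && folio_test_uptodate(folio))
        return folio;           /* already initialized */
    if (!IS_ERR(folio))
        folio_put(folio);       /* drop it and retry with the lock */

    /* slow path: create if missing, return it locked */
    folio = __filemap_get_folio(mapping, index,
                                FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
    if (IS_ERR(folio))
        return folio;           /* e.g. ERR_PTR(-ENOMEM), never NULL */
    /* ... initialize the contents, then folio_unlock() ... */
    return folio;
}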
If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have pinned buddy page to page cache. - * The call to ext4_mb_get_buddy_page_lock will mark the - * page accessed. + * would have pinned buddy folio to page cache. + * The call to ext4_mb_get_buddy_folio_lock will mark the + * folio accessed. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); + ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group @@ -1515,52 +1623,50 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL, gfp); + folio = e4b.bd_bitmap_folio; + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - if (e4b.bd_buddy_page == NULL) { + if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in - * the same page we don't need to force + * the same folio we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); + folio = e4b.bd_buddy_folio; + ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } err: - ext4_mb_put_buddy_page_lock(&e4b); + ext4_mb_put_buddy_folio_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the - * block group lock of all groups for this page; do not hold the BG lock when + * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { - int blocks_per_page; int block; int pnum; - int poff; - struct page *page; + struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1569,7 +1675,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, might_sleep(); mb_debug(sb, "load group %u\n", group); - blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; @@ -1578,8 +1683,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* @@ -1597,105 +1702,114 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, * So for each group we need two blocks. */ block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - - /* we could use find_or_create_page(), but it locks page - * what we'd like to avoid in fast path ... */ - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) + pnum = EXT4_LBLK_TO_PG(inode, block); + + /* Avoid locking the folio in the fast path ... */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) /* - * drop the page reference and try - * to get the page with lock. 
If we + * drop the folio reference and try + * to get the folio with lock. If we * are not uptodate that implies - * somebody just created the page but - * is yet to initialize the same. So + * somebody just created the folio but + * is yet to initialize it. So * wait for it to initialize. */ - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL, gfp); + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } - mb_cmp_bitmaps(e4b, page_address(page) + - (poff * sb->s_blocksize)); + mb_cmp_bitmaps(e4b, folio_address(folio) + + offset_in_folio(folio, + EXT4_LBLK_TO_B(inode, block))); } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + pnum = EXT4_LBLK_TO_PG(inode, block); + /* buddy and bitmap are on the same folio? 
*/ + if (folio_contains(folio, pnum)) { + folio_get(folio); + goto update_buddy; + } + + /* we need another folio for the buddy */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_buddy_page = page; - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); +update_buddy: + /* Folios marked accessed already */ + e4b->bd_buddy_folio = folio; + e4b->bd_buddy = folio_address(folio) + + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); return 0; err: - if (page) - put_page(page); - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); + if (!IS_ERR_OR_NULL(folio)) + folio_put(folio); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; @@ -1710,10 +1824,10 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); + if (e4b->bd_buddy_folio) + folio_put(e4b->bd_buddy_folio); } @@ -1891,11 +2005,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); - this_cpu_inc(discard_pa_seq); - e4b->bd_info->bb_free += count; - if (first < e4b->bd_info->bb_first_free) - e4b->bd_info->bb_first_free = first; - /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ @@ -1909,21 +2018,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; + /* + * Fastcommit replay can free already freed blocks which + * corrupts allocation info. Regenerate it. + */ + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + mb_regenerate_buddy(e4b); + goto check; + } + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); - if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block (bit %u); block bitmap corrupt.", - block); - ext4_mark_group_bitmap_corrupted( - sb, e4b->bd_group, + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); - } - goto done; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, blocknr, + "freeing already freed block (bit %u); block bitmap corrupt.", + block); + return; } + this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; @@ -1948,17 +2067,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); -done: mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); +check: mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { - int next = block; - int max, order; + int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -1976,16 +2094,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, /* find actual order */ order = mb_find_order_for_block(e4b, block); - block = block >> order; - ex->fe_len = 1 << order; - ex->fe_start = block << order; + ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); + ex->fe_start = block; ex->fe_group = e4b->bd_group; - /* calc difference from given start */ - next = next - ex->fe_start; - ex->fe_len -= next; - ex->fe_start += next; + block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { @@ -2023,13 +2137,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) int ord; int mlen = 0; int max = 0; - int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; - bool split = false; + int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -2054,16 +2167,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - if (!split) - ord = mb_find_order_for_block(e4b, start); + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! 
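The rewritten mb_find_extent() arithmetic above deserves a number check: instead of materializing the whole order-sized chunk and trimming the head, the new expression computes the usable tail directly. For an order-4 chunk containing block 21 both routes give start 21, length 11; a standalone verification (values are arbitrary):

#include <assert.h>
#include <stdio.h>

int main(void)
{
    int order = 4, block = 21;

    /* old: whole chunk first, then trim to the requested start */
    int old_start = (block >> order) << order;            /* 16 */
    int old_len = (1 << order) - (block - old_start);     /* 16 - 5 */

    /* new: tail length in one step */
    int new_len = (1 << order) - (block & ((1 << order) - 1));

    assert(old_len == new_len);
    printf("start %d, len %d\n", block, new_len);   /* start 21, len 11 */
    return 0;
}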
*/ mlen = 1 << ord; - if (!split) - buddy = mb_find_buddy(e4b, ord, &max); - else - split = false; + buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -2077,20 +2186,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) if (ret == 0) ret = len | (ord << 16); - /* we have to split large buddy */ BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; - ord--; - cur = (start >> ord) & ~1U; - buddy = mb_find_buddy(e4b, ord, &max); - mb_clear_bit(cur, buddy); - mb_clear_bit(cur + 1, buddy); - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - split = true; + ord_start = (start >> ord) << ord; + ord_end = ord_start + (1 << ord); + /* first chunk */ + if (start > ord_start) + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + ord_start, start - ord_start, + e4b->bd_info); + + /* last chunk */ + if (start + len < ord_end) { + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + start + len, + ord_end - (start + len), + e4b->bd_info); + break; + } + len = start + len - ord_end; + start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); @@ -2126,23 +2244,23 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_buddy = ret >> 16; /* - * take the page reference. We want the page to be pinned + * take the folio reference. We want the folio to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ - ac->ac_bitmap_page = e4b->bd_bitmap_page; - get_page(ac->ac_bitmap_page); - ac->ac_buddy_page = e4b->bd_buddy_page; - get_page(ac->ac_buddy_page); + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; + folio_get(ac->ac_bitmap_folio); + ac->ac_buddy_folio = e4b->bd_buddy_folio; + folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - spin_lock(&sbi->s_md_lock); - sbi->s_mb_last_group = ac->ac_f_ex.fe_group; - sbi->s_mb_last_start = ac->ac_f_ex.fe_start; - spin_unlock(&sbi->s_md_lock); + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } + /* * As we've just preallocated more space than * user requested originally, we store allocated @@ -2281,6 +2399,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, return; ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { @@ -2288,6 +2409,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } @@ -2314,18 +2436,16 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { - ext4_mb_unload_buddy(e4b); - return 0; - } - ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && - ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { + ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, 
sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); @@ -2352,6 +2472,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); @@ -2385,12 +2506,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); - ext4_mark_group_bitmap_corrupted(ac->ac_sb, - e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } ac->ac_found++; @@ -2441,16 +2562,16 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * have free blocks */ + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } - if (ac->ac_criteria < CR_FAST) { + if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough @@ -2472,12 +2593,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit @@ -2521,7 +2642,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - stripe = EXT4_B2C(sbi, sbi->s_stripe); + stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { @@ -2539,6 +2660,30 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) +{ + bool is_stripe_aligned; + struct ext4_sb_info *sbi; + enum criteria cr = ac->ac_criteria; + + ac->ac_groups_scanned++; + if (cr == CR_POWER2_ALIGNED) + return ext4_mb_simple_scan_group(ac, ac->ac_e4b); + + sbi = EXT4_SB(ac->ac_sb); + is_stripe_aligned = false; + if ((sbi->s_stripe >= sbi->s_cluster_ratio) && + !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) + is_stripe_aligned = true; + + if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && + is_stripe_aligned) + ext4_mb_scan_aligned(ac, ac->ac_e4b); + + if (ac->ac_status == AC_STATUS_CONTINUE) + ext4_mb_complex_scan_group(ac, ac->ac_e4b); +} + /* * This is also called BEFORE we load the buddy bitmap. 
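The stream-allocation change above splits the single s_mb_last_group hot spot (previously serialized by s_md_lock) into an array of goals indexed by a hash of the inode number; READ_ONCE/WRITE_ONCE suffice because a stale goal is merely a weaker hint. The shape of the idea as a user-space model (sizing and names are illustrative; the kernel derives the array length from CPU and group counts):

#include <stdint.h>
#include <stdio.h>

#define NR_GOALS 16

static uint32_t last_groups[NR_GOALS];

/* deliberately lock-free: a stale or racing value only weakens the hint */
static void store_goal(uint64_t ino, uint32_t group)
{
    last_groups[ino % NR_GOALS] = group;
}

static uint32_t load_goal(uint64_t ino)
{
    return last_groups[ino % NR_GOALS];
}

int main(void)
{
    store_goal(12345, 7);   /* allocation for inode 12345 ended in group 7 */
    printf("next goal for inode 12345: group %u\n", load_goal(12345));
    return 0;
}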
* Returns either 1 or 0 indicating that the group is either suitable @@ -2553,7 +2698,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; @@ -2634,7 +2779,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) + /* + * In all criteria except CR_ANY_FREE we try to avoid groups that + * can't possibly satisfy the full goal request due to insufficient + * free blocks. + */ + if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2650,7 +2800,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, int ret; /* - * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this @@ -2658,7 +2808,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ - if (cr < CR_FAST && + if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2724,6 +2874,37 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, } /* + * Batch reads of the block allocation bitmaps to get + * multiple READs in flight; limit prefetching at inexpensive + * CR, otherwise mballoc can spend a lot of time loading + * imperfect groups. + */ +static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + struct ext4_sb_info *sbi; + + if (ac->ac_prefetch_grp != group) + return; + + sbi = EXT4_SB(ac->ac_sb); + if (ext4_mb_cr_expensive(ac->ac_criteria) || + ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { + unsigned int nr = sbi->s_mb_prefetch; + + if (ext4_has_feature_flex_bg(ac->ac_sb)) { + nr = 1 << sbi->s_log_groups_per_flex; + nr -= group & (nr - 1); + nr = umin(nr, sbi->s_mb_prefetch); + } + + ac->ac_prefetch_nr = nr; + ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, + &ac->ac_prefetch_ios); + } +} + +/* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O * is not yet completed. @@ -2756,24 +2937,58 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, } } +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + int ret; + struct super_block *sb = ac->ac_sb; + enum criteria cr = ac->ac_criteria; + + ext4_mb_might_prefetch(ac, group); + + /* prevent unnecessary buddy loading.
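ext4_mb_might_prefetch() sizes each batch so it never crosses a flex_bg boundary: nr starts at the flex-group width, is reduced by the current group's offset within its flex group, then capped at s_mb_prefetch. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
    unsigned int log_groups_per_flex = 4;   /* 16 groups per flex_bg */
    unsigned int s_mb_prefetch = 32;
    unsigned int group = 21;                /* 5 groups into its flex_bg */

    unsigned int nr = 1u << log_groups_per_flex;   /* 16 */
    nr -= group & (nr - 1);     /* 16 - 5 = 11 groups left in this flex_bg */
    if (nr > s_mb_prefetch)
        nr = s_mb_prefetch;     /* the kernel uses umin() here */

    printf("prefetch %u group(s) starting at group %u\n", nr, group);
    return 0;
}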
*/ + if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) + return 0; + + /* This now checks without needing the buddy folio */ + ret = ext4_mb_good_group_nolock(ac, group, cr); + if (ret <= 0) { + if (!ac->ac_first_err) + ac->ac_first_err = ret; + return 0; + } + + ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); + if (ret) + return ret; + + /* skip busy group */ + if (cr >= CR_ANY_FREE) + ext4_lock_group(sb, group); + else if (!ext4_try_lock_group(sb, group)) + goto out_unload; + + /* We need to check again after locking the block group. */ + if (unlikely(!ext4_mb_good_group(ac, group, cr))) + goto out_unlock; + + __ext4_mb_scan_group(ac); + +out_unlock: + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(ac->ac_e4b); + return ret; +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t prefetch_grp = 0, ngroups, group, i; - enum criteria new_cr, cr = CR_GOAL_LEN_FAST; - int err = 0, first_err = 0; - unsigned int nr = 0, prefetch_ios = 0; - struct ext4_sb_info *sbi; - struct super_block *sb; + ext4_group_t i; + int err = 0; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int lost; - - sb = ac->ac_sb; - sbi = EXT4_SB(sb); - ngroups = ext4_get_groups_count(sb); - /* non-extent files are limited to low blocks/groups */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) - ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); @@ -2787,8 +3002,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * ac->ac_2order is set only if the fe_len is a power of 2 - * if ac->ac_2order is set we also set criteria to 0 so that we - * try exact allocation using buddy. + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED + * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; @@ -2800,21 +3015,18 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { - /* - * This should tell if fe_len is exactly power of 2 - */ - if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - /* TBD: may be hot point */ - spin_lock(&sbi->s_md_lock); - ac->ac_g_ex.fe_group = sbi->s_mb_last_group; - ac->ac_g_ex.fe_start = sbi->s_mb_last_start; - spin_unlock(&sbi->s_md_lock); + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); + ac->ac_g_ex.fe_start = -1; + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; } /* @@ -2822,99 +3034,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. 
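The busy-group handling above is the interesting part of ext4_mb_scan_group(): under the cheap criteria a group whose lock is contended is simply skipped, and only the last-resort CR_ANY_FREE pass is willing to block. Reduced to its skeleton (a kernel-style sketch condensed from the hunk; error paths and the buddy load omitted):

/* sketch, not a standalone program */
static int scan_one_group(struct super_block *sb, ext4_group_t group,
                          bool last_resort)
{
    if (last_resort)
        ext4_lock_group(sb, group);     /* CR_ANY_FREE: must not skip */
    else if (!ext4_try_lock_group(sb, group))
        return 0;                       /* busy group: move on */

    /* ... re-check suitability, then scan the buddy data ... */

    ext4_unlock_group(sb, group);
    return 0;
}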
*/ + ac->ac_criteria = CR_GOAL_LEN_FAST; if (ac->ac_2order) - cr = CR_POWER2_ALIGNED; -repeat: - for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { - ac->ac_criteria = cr; - /* - * searching for the right group start - * from the goal value specified - */ - group = ac->ac_g_ex.fe_group; - ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; - prefetch_grp = group; - - for (i = 0, new_cr = cr; i < ngroups; i++, - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { - int ret = 0; + ac->ac_criteria = CR_POWER2_ALIGNED; - cond_resched(); - if (new_cr != cr) { - cr = new_cr; - goto repeat; - } - - /* - * Batch reads of the block allocation bitmaps - * to get multiple READs in flight; limit - * prefetching at cr=0/1, otherwise mballoc can - * spend a lot of time loading imperfect groups - */ - if ((prefetch_grp == group) && - (cr >= CR_FAST || - prefetch_ios < sbi->s_mb_prefetch_limit)) { - nr = sbi->s_mb_prefetch; - if (ext4_has_feature_flex_bg(sb)) { - nr = 1 << sbi->s_log_groups_per_flex; - nr -= group & (nr - 1); - nr = min(nr, sbi->s_mb_prefetch); - } - prefetch_grp = ext4_mb_prefetch(sb, group, - nr, &prefetch_ios); - } - - /* This now checks without needing the buddy page */ - ret = ext4_mb_good_group_nolock(ac, group, cr); - if (ret <= 0) { - if (!first_err) - first_err = ret; - continue; - } - - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) - goto out; - - ext4_lock_group(sb, group); - - /* - * We need to check again after locking the - * block group - */ - ret = ext4_mb_good_group(ac, group, cr); - if (ret == 0) { - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - continue; - } - - ac->ac_groups_scanned++; - if (cr == CR_POWER2_ALIGNED) - ext4_mb_simple_scan_group(ac, &e4b); - else if ((cr == CR_GOAL_LEN_FAST || - cr == CR_BEST_AVAIL_LEN) && - sbi->s_stripe && - !(ac->ac_g_ex.fe_len % - EXT4_B2C(sbi, sbi->s_stripe))) - ext4_mb_scan_aligned(ac, &e4b); - else - ext4_mb_complex_scan_group(ac, &e4b); - - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - - if (ac->ac_status != AC_STATUS_CONTINUE) - break; - } - /* Processed all groups and haven't found blocks */ - if (sbi->s_mb_stats && i == ngroups) - atomic64_inc(&sbi->s_bal_cX_failed[cr]); + ac->ac_e4b = &e4b; + ac->ac_prefetch_ios = 0; + ac->ac_first_err = 0; +repeat: + while (ac->ac_criteria < EXT4_MB_NUM_CRS) { + err = ext4_mb_scan_groups(ac); + if (err) + goto out; - if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) - /* Reset goal length to original goal length before - * falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + if (ac->ac_status != AC_STATUS_CONTINUE) + break; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2925,6 +3059,8 @@ repeat: */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { + int lost; + /* * Someone more lucky has already allocated it. 
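Control flow after this rewrite is worth restating: the old nested loop with goto repeat is gone, and each ext4_mb_scan_groups() pass advances ac_criteria itself when a criterion exhausts all groups, so the caller just loops until success, a hard error, or the criteria run out. A toy model of that escalation (user-space, illustrative):

#include <stdio.h>

enum criteria { CR_POWER2_ALIGNED, CR_GOAL_LEN_FAST, CR_BEST_AVAIL_LEN,
                CR_GOAL_LEN_SLOW, CR_ANY_FREE, EXT4_MB_NUM_CRS };

static int found;

static int scan_groups(enum criteria *cr)
{
    printf("scanning all groups at criterion %d\n", *cr);
    if (*cr == CR_GOAL_LEN_SLOW)
        found = 1;      /* pretend the slow pass succeeds */
    else
        (*cr)++;        /* the scanner escalates on failure */
    return 0;           /* nonzero would be a hard error */
}

int main(void)
{
    enum criteria cr = CR_GOAL_LEN_FAST;

    while (cr < EXT4_MB_NUM_CRS && !found)
        if (scan_groups(&cr))
            break;
    return 0;
}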
* The only thing we can do is just take first @@ -2940,23 +3076,27 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = CR_ANY_FREE; + ac->ac_criteria = CR_ANY_FREE; goto repeat; } } - if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) + atomic_inc(&sbi->s_bal_stream_goals); + } out: - if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) - err = first_err; + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) + err = ac->ac_first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, - ac->ac_flags, cr, err); + ac->ac_flags, ac->ac_criteria, err); - if (nr) - ext4_mb_prefetch_fini(sb, prefetch_grp, nr); + if (ac->ac_prefetch_nr) + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); return err; } @@ -2988,17 +3128,15 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); - int i; - int err, buddy_loaded = 0; + int i, err; + char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); - struct sg { - struct ext4_group_info info; - ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; - } sg; + DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, + EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) @@ -3006,7 +3144,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); @@ -3016,24 +3154,26 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); + seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } - buddy_loaded = 1; - } - - memcpy(&sg, grinfo, i); - - if (buddy_loaded) ext4_mb_unload_buddy(&e4b); + } - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, - sg.info.bb_fragments, sg.info.bb_first_free); + /* + * We care only about free space counters in the group info and + * these are safe to access even after the buddy has been unloaded + */ + memcpy(sg, grinfo, i); + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, + sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
- sg.info.bb_counters[i] : 0); - seq_puts(seq, " ]\n"); - + sg->bb_counters[i] : 0); + seq_puts(seq, " ]"); + if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) + seq_puts(seq, " Block bitmap corrupted!"); + seq_putc(seq, '\n'); return 0; } @@ -3079,8 +3219,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); @@ -3093,8 +3231,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); @@ -3108,8 +3244,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); @@ -3139,6 +3273,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\tstream_goal_hits: %u\n", + atomic_read(&sbi->s_bal_stream_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); @@ -3156,7 +3292,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) -__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; @@ -3186,6 +3321,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; + unsigned long idx; position--; if (position >= MB_NUM_ORDERS(sb)) { @@ -3194,11 +3330,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; - read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); - list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], - bb_avg_fragment_size_node) + xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp) count++; - read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; @@ -3210,11 +3343,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "max_free_order_lists:\n"); } count = 0; - read_lock(&sbi->s_mb_largest_free_orders_locks[position]); - list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], - bb_largest_free_order_node) + xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) count++; - read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: 
%u\n", (unsigned int)position, count); @@ -3334,8 +3464,6 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; - INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); - INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; @@ -3382,6 +3510,8 @@ static int ext4_mb_init_backend(struct super_block *sb) * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + ext4_set_inode_mapping_order(sbi->s_buddy_cache); + for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); @@ -3410,10 +3540,11 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); - /* now many real IOs to prefetch within a single allocation at cr=0 - * given cr=0 is an CPU-related optimization we shouldn't try to - * load too many groups, at some point we should start to use what - * we've got in memory. + /* + * now many real IOs to prefetch within a single allocation at + * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related + * optimization we shouldn't try to load too many groups, at some point + * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ @@ -3501,11 +3632,10 @@ static void ext4_discard_work(struct work_struct *work) struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; - INIT_LIST_HEAD(&discard_list); spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); @@ -3545,6 +3675,30 @@ static void ext4_discard_work(struct work_struct *work) ext4_mb_unload_buddy(&e4b); } +static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) +{ + if (!sbi->s_mb_avg_fragment_size) + return; + + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_avg_fragment_size[i]); + + kfree(sbi->s_mb_avg_fragment_size); + sbi->s_mb_avg_fragment_size = NULL; +} + +static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) +{ + if (!sbi->s_mb_largest_free_orders) + return; + + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_largest_free_orders[i]); + + kfree(sbi->s_mb_largest_free_orders); + sbi->s_mb_largest_free_orders = NULL; +} + int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3590,45 +3744,29 @@ int ext4_mb_init(struct super_block *sb) } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } - sbi->s_mb_avg_fragment_size_locks = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), - GFP_KERNEL); - if (!sbi->s_mb_avg_fragment_size_locks) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { - 
INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); - rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); - } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_avg_fragment_size[i]); + sbi->s_mb_largest_free_orders = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } - sbi->s_mb_largest_free_orders_locks = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), - GFP_KERNEL); - if (!sbi->s_mb_largest_free_orders_locks) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { - INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); - rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); - } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_largest_free_orders[i]); spin_lock_init(&sbi->s_md_lock); - sbi->s_mb_free_pending = 0; - INIT_LIST_HEAD(&sbi->s_freed_data_list); + atomic_set(&sbi->s_mb_free_pending, 0); + INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); + INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); @@ -3664,13 +3802,22 @@ int ext4_mb_init(struct super_block *sb) */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); + sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); + } + + sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), + DIV_ROUND_UP(sbi->s_groups_count, 4)); + sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals, + sizeof(ext4_group_t), GFP_KERNEL); + if (sbi->s_mb_last_groups == NULL) { + ret = -ENOMEM; + goto out; } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out; + goto out_free_last_groups; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -3695,11 +3842,12 @@ int ext4_mb_init(struct super_block *sb) out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; +out_free_last_groups: + kfree(sbi->s_mb_last_groups); + sbi->s_mb_last_groups = NULL; out: - kfree(sbi->s_mb_avg_fragment_size); - kfree(sbi->s_mb_avg_fragment_size_locks); - kfree(sbi->s_mb_largest_free_orders); - kfree(sbi->s_mb_largest_free_orders_locks); + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); @@ -3723,7 +3871,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) return count; } -int ext4_mb_release(struct super_block *sb) +void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; @@ -3766,10 +3914,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } - kfree(sbi->s_mb_avg_fragment_size); - kfree(sbi->s_mb_avg_fragment_size_locks); - kfree(sbi->s_mb_largest_free_orders); - kfree(sbi->s_mb_largest_free_orders_locks); + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); @@ -3799,13 +3945,11 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - - return 0; + kfree(sbi->s_mb_last_groups); } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count, - struct bio **biop) + 
ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; @@ -3814,13 +3958,8 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - if (biop) { - return __blkdev_issue_discard(sb->s_bdev, - (sector_t)discard_block << (sb->s_blocksize_bits - 9), - (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, biop); - } else - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, @@ -3837,10 +3976,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); - spin_lock(&EXT4_SB(sb)->s_md_lock); - EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; - spin_unlock(&EXT4_SB(sb)->s_md_lock); - + atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; @@ -3852,18 +3988,15 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - put_page(e4b.bd_buddy_page); - put_page(e4b.bd_bitmap_page); + folio_put(e4b.bd_buddy_folio); + folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); @@ -3879,22 +4012,11 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct list_head freed_data_list; - struct list_head *cut_pos = NULL; + LIST_HEAD(freed_data_list); + struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; - INIT_LIST_HEAD(&freed_data_list); - - spin_lock(&sbi->s_md_lock); - list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { - if (entry->efd_tid != commit_tid) - break; - cut_pos = &entry->efd_list; - } - if (cut_pos) - list_cut_position(&freed_data_list, &sbi->s_freed_data_list, - cut_pos); - spin_unlock(&sbi->s_md_lock); + list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); @@ -3905,7 +4027,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) - queue_work(system_unbound_wq, &sbi->s_discard_work); + queue_work(system_dfl_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); @@ -3952,6 +4074,111 @@ void ext4_exit_mballoc(void) ext4_groupinfo_destroy_slabs(); } +#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 +#define EXT4_MB_SYNC_UPDATE 0x0002 +static int +ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head 
*bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + int err; + unsigned int i, already, changed = len; + + KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, + handle, sb, state, group, blkoff, len, + flags, ret_changed); + + if (ret_changed) + *ret_changed = 0; + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) + return PTR_ERR(bitmap_bh); + + if (handle) { + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + err = -EIO; + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + goto out_err; + + if (handle) { + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + } + + if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { + already = 0; + for (i = 0; i < len; i++) + if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == + state) + already++; + changed = len - already; + } + + if (state) { + mb_set_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) - changed); + } else { + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) + changed); + } + + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + if (ret_changed) + *ret_changed = changed; + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group); + struct flex_groups *fg = sbi_array_rcu_deref(sbi, + s_flex_groups, flex_group); + + if (state) + atomic64_sub(changed, &fg->free_clusters); + else + atomic64_add(changed, &fg->free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + if (err) + goto out_err; + + if (flags & EXT4_MB_SYNC_UPDATE) { + sync_dirty_buffer(bitmap_bh); + sync_dirty_buffer(gdp_bh); + } + +out_err: + brelse(bitmap_bh); + return err; +} /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps @@ -3961,13 +4188,13 @@ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { - struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; + int flags = 0; + ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); @@ -3975,32 +4202,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); - if (IS_ERR(bitmap_bh)) { - return PTR_ERR(bitmap_bh); - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto out_err; - - err = -EIO; - gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) - goto out_err; - + return -EIO; ext4_debug("using 
block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); - BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); - if (err) - goto out_err; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " @@ -4009,41 +4217,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + err = ext4_mb_mark_context(handle, sb, true, + ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, + 0, NULL); if (!err) err = -EFSCORRUPTED; - goto out_err; + return err; } - ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < ac->ac_b_ex.fe_len; i++) { - BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, - bitmap_bh->b_data)); - } - } + flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); - } - len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_group_clusters_set(sb, gdp, len); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); + err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, + flags, &changed); + + if (err && changed == 0) + return err; - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != ac->ac_b_ex.fe_len); +#endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -4053,21 +4249,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, - ac->ac_b_ex.fe_group); - atomic64_sub(ac->ac_b_ex.fe_len, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (err) - goto out_err; - err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); - -out_err: - brelse(bitmap_bh); return err; } @@ -4076,17 +4257,13 @@ out_err: * blocks in bitmaps and update counters. 
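 *
 * For illustration, the simplest caller (ext4_mb_new_blocks_simple()
 * below, updated by this patch) marks a single block in use during
 * fast commit replay as:
 *
 *	ext4_mb_mark_bb(sb, block, 1, true);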
*/ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state) + int len, bool state) { - struct buffer_head *bitmap_bh = NULL; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err; - int already; - unsigned int clen, clen_changed, thisgrp_len; + int err = 0; + unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); @@ -4107,80 +4284,21 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); - bitmap_bh = NULL; - break; - } - - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; break; } - err = -EIO; - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - break; - - ext4_lock_group(sb, group); - already = 0; - for (i = 0; i < clen; i++) - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == - !state) - already++; - - clen_changed = clen - already; - if (state) - mb_set_bits(bitmap_bh->b_data, blkoff, clen); - else - mb_clear_bits(bitmap_bh->b_data, blkoff, clen); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, group, gdp)); - } - if (state) - clen = ext4_free_group_clusters(sb, gdp) - clen_changed; - else - clen = ext4_free_group_clusters(sb, gdp) + clen_changed; - - ext4_free_group_clusters_set(sb, gdp, clen); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - - ext4_unlock_group(sb, group); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, group); - struct flex_groups *fg = sbi_array_rcu_deref(sbi, - s_flex_groups, flex_group); - - if (state) - atomic64_sub(clen_changed, &fg->free_clusters); - else - atomic64_add(clen_changed, &fg->free_clusters); - - } - - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - break; - sync_dirty_buffer(bitmap_bh); - err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(gdp_bh); + err = ext4_mb_mark_context(NULL, sb, state, + group, blkoff, clen, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; - brelse(bitmap_bh); BUG_ON(len < 0); } - - if (err) - brelse(bitmap_bh); } /* @@ -4222,12 +4340,13 @@ ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_ static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t start, ext4_lblk_t end) + ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start; + loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); @@ -4236,7 +4355,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) @@ -4258,14 +4377,14 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context 
*ac, */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; - ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; + loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; @@ -4284,7 +4403,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); @@ -4364,8 +4483,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, } if (left_pa) { - left_pa_end = - left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } @@ -4404,8 +4522,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; - ext4_lblk_t end; - loff_t size, start_off; + loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; @@ -4432,7 +4549,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -4491,6 +4608,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); + /* avoid unnecessary preallocation that may trigger assertions */ + if (start + size > EXT_MAX_BLOCKS) + size = EXT_MAX_BLOCKS - start; + /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; @@ -4621,7 +4742,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the - * pages in the ext4_allocation_context so + * folios in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ return; @@ -4766,7 +4887,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; - loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4862,9 +4982,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. 
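 *
 * A sketch of that check, using the pa_logical_end() helper this
 * series adds to mballoc.h; the end is computed in loff_t, so the
 * comparison cannot wrap ext4_lblk_t, and the search moves on once
 * the pa ends at or below the request:
 *
 *	if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa))
 *		goto try_group_pa;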
*/ - tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { + if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } @@ -4962,32 +5080,6 @@ try_group_pa: } /* - * the function goes through all block freed in the group - * but not yet committed and marks them used in in-core bitmap. - * buddy must be generated from this bitmap - * Need to be called with the ext4 group lock held - */ -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group) -{ - struct rb_node *n; - struct ext4_group_info *grp; - struct ext4_free_data *entry; - - grp = ext4_get_group_info(sb, group); - if (!grp) - return; - n = rb_first(&(grp->bb_free_root)); - - while (n) { - entry = rb_entry(n, struct ext4_free_data, efd_node); - mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); - n = rb_next(n); - } - return; -} - -/* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held @@ -5180,12 +5272,21 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { - int new_bex_start; - int new_bex_end; + struct ext4_free_extent ex = { + .fe_logical = ac->ac_g_ex.fe_logical, + .fe_len = ac->ac_orig_goal_len, + }; + loff_t orig_goal_end = extent_logical_end(sbi, &ex); + loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); - /* we can't allocate as much as normalizer wants. - * so, found space must get proper lstart - * to cover original request */ + /* + * We can't allocate as much as normalizer wants, so we try + * to get proper lstart to cover the original request, except + * when the goal doesn't cover the original request as below: + * + * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 + * best_ex:0/200(200) -> adjusted: 1848/2048(200) + */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); @@ -5197,32 +5298,25 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and - * still cover original start + * still cover original end * 3. Else, keep the best ex at start of original request. 
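 *
 * Working that example through check 1: orig_goal_end is
 * 0 + 2048 = 2048, so keeping the 200-block best extent at the end
 * of the goal places it at 2048 - 200 = 1848; since 2045 >= 1848
 * the original start is still covered, giving 1848/2048(200).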
*/ - new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len); - new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical >= new_bex_start) - goto adjust_bex; + ex.fe_len = ac->ac_b_ex.fe_len; - new_bex_start = ac->ac_g_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical < new_bex_end) + ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); + if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; - new_bex_start = ac->ac_o_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + ex.fe_logical = ac->ac_g_ex.fe_logical; + if (o_ex_end <= extent_logical_end(sbi, &ex)) + goto adjust_bex; + ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: - ac->ac_b_ex.fe_logical = new_bex_start; + ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); - BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); - BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len))); + BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; @@ -5326,7 +5420,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { @@ -5376,11 +5470,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, */ } atomic_add(free, &sbi->s_mb_discarded); - - return 0; } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { @@ -5394,13 +5486,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); - return 0; + return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); - - return 0; } /* @@ -5419,7 +5509,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; @@ -5448,7 +5538,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - INIT_LIST_HEAD(&list); ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { @@ -5522,21 +5611,20 @@ out_dbg: * * FIXME!! 
Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode, unsigned int needed) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; - if (!S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) return; - } if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -5544,17 +5632,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, - atomic_read(&ei->i_prealloc_active), needed); - - INIT_LIST_HEAD(&list); - - if (needed == 0) - needed = UINT_MAX; + atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); - for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); @@ -5578,7 +5661,6 @@ repeat: spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); - needed--; continue; } @@ -5671,7 +5753,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5705,7 +5787,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" @@ -5729,7 +5811,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); - mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? 
"group pa" : "inode pa"); @@ -5738,12 +5820,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) #else static inline void ext4_mb_show_pa(struct super_block *sb) { - return; } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); - return; } #endif @@ -5769,7 +5849,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -5865,13 +5945,11 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, { ext4_group_t group = 0; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); - INIT_LIST_HEAD(&discard_list); - spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, @@ -5984,18 +6062,15 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ - if (lg_prealloc_count > 8) { + if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); - return; - } - return ; } /* * release all resource we used in allocation */ -static int ext4_mb_release_context(struct ext4_allocation_context *ac) +static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; @@ -6025,14 +6100,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_put_pa(ac, ac->ac_sb, pa); } - if (ac->ac_bitmap_page) - put_page(ac->ac_bitmap_page); - if (ac->ac_buddy_page) - put_page(ac->ac_buddy_page); + if (ac->ac_bitmap_folio) + folio_put(ac->ac_bitmap_folio); + if (ac->ac_buddy_folio) + folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - return 0; } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) @@ -6082,7 +6156,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, } out_dbg: - mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } @@ -6102,7 +6176,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || @@ -6147,9 +6221,10 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); - ext4_mb_mark_bb(sb, block, 1, 1); + ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; + *errp = 0; return block; } @@ -6305,28 +6380,63 @@ out: * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. 
*/ -static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, - struct ext4_free_data *entry, - struct ext4_free_data *new_entry, - struct rb_root *entry_rb_root) +static inline bool +ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) { - if ((entry->efd_tid != new_entry->efd_tid) || - (entry->efd_group != new_entry->efd_group)) - return; - if (entry->efd_start_cluster + entry->efd_count == - new_entry->efd_start_cluster) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - } else if (new_entry->efd_start_cluster + new_entry->efd_count == - entry->efd_start_cluster) { - new_entry->efd_count += entry->efd_count; - } else - return; + if (entry1->efd_tid != entry2->efd_tid) + return false; + if (entry1->efd_start_cluster + entry1->efd_count != + entry2->efd_start_cluster) + return false; + if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group)) + return false; + return true; +} + +static inline void +ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + entry1->efd_count += entry2->efd_count; spin_lock(&sbi->s_md_lock); - list_del(&entry->efd_list); + list_del(&entry2->efd_list); spin_unlock(&sbi->s_md_lock); - rb_erase(&entry->efd_node, entry_rb_root); - kmem_cache_free(ext4_free_data_cachep, entry); + rb_erase(&entry2->efd_node, root); + kmem_cache_free(ext4_free_data_cachep, entry2); +} + +static inline void +ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry) +{ + struct ext4_free_data *prev; + struct rb_node *node; + + node = rb_prev(&entry->efd_node); + if (!node) + return; + + prev = rb_entry(node, struct ext4_free_data, efd_node); + if (ext4_freed_extents_can_be_merged(prev, entry)) + ext4_merge_freed_extents(sbi, root, prev, entry); +} + +static inline void +ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry) +{ + struct ext4_free_data *next; + struct rb_node *node; + + node = rb_next(&entry->efd_node); + if (!node) + return; + + next = rb_entry(node, struct ext4_free_data, efd_node); + if (ext4_freed_extents_can_be_merged(entry, next)) + ext4_merge_freed_extents(sbi, root, entry, next); } static noinline_for_stack void @@ -6336,16 +6446,17 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; - struct ext4_free_data *entry; + struct ext4_free_data *entry = NULL; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_root *root = &db->bb_free_root; + struct rb_node **n = &root->rb_node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); + BUG_ON(e4b->bd_bitmap_folio == NULL); + BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; @@ -6356,8 +6467,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ - get_page(e4b->bd_buddy_page); - get_page(e4b->bd_bitmap_page); + folio_get(e4b->bd_buddy_folio); + folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; 
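	/*
	 * (Illustrative, abridged: each step of this walk compares the
	 * new extent's start cluster against the node's range:)
	 *
	 *	entry = rb_entry(parent, struct ext4_free_data, efd_node);
	 *	if (cluster < entry->efd_start_cluster)
	 *		n = &(*n)->rb_left;
	 *	else if (cluster >= entry->efd_start_cluster + entry->efd_count)
	 *		n = &(*n)->rb_right;
	 */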
@@ -6376,70 +6487,45 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } } - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &db->bb_free_root); + atomic_add(clusters, &sbi->s_mb_free_pending); + if (!entry) + goto insert; - /* Now try to see the extent can be merged to left and right */ - node = rb_prev(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - ext4_try_merge_freed_extent(sbi, entry, new_entry, - &(db->bb_free_root)); + /* Now try to see the extent can be merged to prev and next */ + if (ext4_freed_extents_can_be_merged(new_entry, entry)) { + entry->efd_start_cluster = cluster; + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_prev(sbi, root, entry); + return; } - - node = rb_next(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - ext4_try_merge_freed_extent(sbi, entry, new_entry, - &(db->bb_free_root)); + if (ext4_freed_extents_can_be_merged(entry, new_entry)) { + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_next(sbi, root, entry); + return; } +insert: + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, root); spin_lock(&sbi->s_md_lock); - list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); - sbi->s_mb_free_pending += clusters; + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); spin_unlock(&sbi->s_md_lock); } static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; ext4_group_t group; ext4_grpblk_t blkoff; - int already_freed = 0, err, i; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - pr_warn("Failed to read block bitmap\n"); - return; - } - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - goto err_out; - - for (i = 0; i < count; i++) { - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) - already_freed++; - } - mb_clear_bits(bitmap_bh->b_data, blkoff, count); - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - goto err_out; - ext4_free_group_clusters_set( - sb, gdp, ext4_free_group_clusters(sb, gdp) + - count - already_freed); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(bitmap_bh); - sync_dirty_buffer(gdp_bh); - -err_out: - brelse(bitmap_bh); + ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); } /** @@ -6455,19 +6541,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { - struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; - struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; - int ret; + int mark_flags = 0; + ext4_grpblk_t changed; sbi = EXT4_SB(sb); @@ -6476,7 +6560,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_error(sb, "Freeing blocks in system zone - " "Block = 
%llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; @@ -6500,55 +6584,35 @@ do_more: flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_return; - } - gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) { - err = -EIO; - goto error_return; - } + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) + goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_clean; } - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) - goto error_return; #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < count_clusters; i++) - BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); - } + mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + count_clusters, mark_flags, &changed); - /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ - err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, - GFP_NOFS|__GFP_NOFAIL); - if (err) - goto error_return; + + if (err && changed == 0) + goto error_clean; + +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != count_clusters); +#endif /* * We need to make sure we don't reuse the freed block until after the @@ -6572,42 +6636,32 @@ do_more: new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { - /* need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, - count_clusters, NULL); - if (err && err != -EOPNOTSUPP) + count_clusters); + /* + * Ignore EOPNOTSUPP error. This is consistent with + * what happens when using journal. 
+ */ + if (err == -EOPNOTSUPP) + err = 0; + if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); - } else - EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + } + + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_group_clusters(sb, gdp) + count_clusters; - ext4_free_group_clusters_set(sb, gdp, ret); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(count_clusters, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they @@ -6620,30 +6674,19 @@ do_more: count_clusters); } - ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - if (overflow && !err) { block += count; count = overflow; - put_bh(bitmap_bh); + ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } -error_return: - brelse(bitmap_bh); + +error_clean: + ext4_mb_unload_buddy(&e4b); +error_out: ext4_std_error(sb, err); - return; } /** @@ -6740,13 +6783,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) - bh = sb_find_get_block(inode->i_sb, block + i); + bh = sb_find_get_block_nonatomic(inode->i_sb, + block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } ext4_mb_clear_bb(handle, inode, block, count, flags); - return; } /** @@ -6761,23 +6804,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int err = 0, ret, free_clusters_count; - ext4_grpblk_t clusters_freed; + int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; + ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - if (count == 0) + if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -6789,99 +6828,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; - goto error_return; + goto error_out; } - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_return; - } - - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) { - err = -EIO; - goto error_return; - } + err = 
ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; - goto error_return; + goto error_clean; } - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_return; + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, + &changed); + if (err && changed == 0) + goto error_clean; - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) - goto error_return; - - for (i = 0, clusters_freed = 0; i < cluster_count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - clusters_freed++; - } - } - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; + if (changed != cluster_count) + ext4_error(sb, "bit already cleared in group %u", block_group); - /* - * need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); mb_free_blocks(NULL, &e4b, bit, cluster_count); - free_clusters_count = clusters_freed + - ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, free_clusters_count); - ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - clusters_freed); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(clusters_freed, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } + changed); +error_clean: ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); +error_out: ext4_std_error(sb, err); return err; } @@ -6920,24 +6899,51 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count, NULL); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; } +static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, + ext4_group_t grp) +{ + unsigned long nr_clusters_in_group; + + if (grp < (ext4_get_groups_count(sb) - 1)) + nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); + else + nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp)) + >> EXT4_CLUSTER_BITS(sb); + + return nr_clusters_in_group - 1; +} + +static bool ext4_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + static 
int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { - ext4_grpblk_t next, count, free_count; + ext4_grpblk_t next, count, free_count, last, origin_start; + bool set_trimmed = false; void *bitmap; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return 0; + + last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; + if (start == 0 && max >= last) + set_trimmed = true; + origin_start = start; + start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6945,22 +6951,23 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; - next = mb_find_next_bit(bitmap, max + 1, start); + + next = mb_find_next_bit(bitmap, last + 1, start); + if (origin_start == 0 && next >= last) + set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) - break; + return count; count += next - start; } free_count += next - start; start = next + 1; - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } + if (ext4_trim_interrupted()) + return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); @@ -6972,6 +6979,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) break; } + if (set_trimmed) + EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); + return count; } @@ -6982,7 +6992,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count - * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. 
When the free extent is found, mark it as used in group buddy @@ -6992,7 +7001,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks, bool set_trimmed) + ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; @@ -7009,13 +7018,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < EXT4_SB(sb)->s_last_trim_minblks) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0 && set_trimmed) - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } else { + else ret = 0; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -7048,7 +7054,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); - bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -7067,10 +7072,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks - 1) { + if (end >= max_blks - 1) end = max_blks - 1; - eof = true; - } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -7084,9 +7087,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - whole_group = true; for (group = first_group; group <= last_group; group++) { + if (ext4_trim_interrupted()) + break; grp = ext4_get_group_info(sb, group); if (!grp) continue; @@ -7103,13 +7107,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) { + if (group == last_group) end = last_cluster; - whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; - } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen, whole_group); + end, minlen); if (cnt < 0) { ret = cnt; break; @@ -7137,13 +7139,14 @@ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, + ext4_grpblk_t first, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; - ext4_grpblk_t next; + ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; @@ -7154,11 +7157,19 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? 
- e4b.bd_info->bb_first_free : start; + start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - + if (meta_formatter && start != first) { + if (start > end) + start = end; + ext4_unlock_group(sb, group); + error = meta_formatter(sb, group, first, start - first, + priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) @@ -7180,3 +7191,7 @@ out_unload: return error; } + +#ifdef CONFIG_EXT4_KUNIT_TESTS +#include "mballoc-test.c" +#endif diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index df6b5e7c2274..15a049f05d04 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,15 +187,19 @@ struct ext4_allocation_context { struct ext4_free_extent ac_f_ex; /* - * goal len can change in CR1.5, so save the original len. This is - * used while adjusting the PA window and for accounting. + * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. + * This is used while adjusting the PA window and for accounting. */ ext4_grpblk_t ac_orig_goal_len; - __u32 ac_groups_considered; + ext4_group_t ac_prefetch_grp; + unsigned int ac_prefetch_ios; + unsigned int ac_prefetch_nr; + + int ac_first_err; + __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; - __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; @@ -205,8 +209,10 @@ struct ext4_allocation_context { __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; + + struct ext4_buddy *ac_e4b; + struct folio *ac_bitmap_folio; + struct folio *ac_buddy_folio; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -216,9 +222,9 @@ struct ext4_allocation_context { #define AC_STATUS_BREAK 3 struct ext4_buddy { - struct page *bd_buddy_page; + struct folio *bd_buddy_folio; void *bd_buddy; - struct page *bd_bitmap_page; + struct folio *bd_bitmap_folio; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; @@ -233,6 +239,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. 
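+ * (e.g. pa_lstart == 0xfffff000 with EXT4_C2B(sbi, pa_len) == 0x2000
+ * would wrap a 32-bit ext4_lblk_t, while the loff_t result
+ * 0x100001000 does not)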
*/ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, @@ -246,6 +266,7 @@ ext4_mballoc_query_range( ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d98ac2af8199..1b0dfd963d3f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -37,7 +37,6 @@ static int finish_range(handle_t *handle, struct inode *inode, path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); - path = NULL; goto err_out; } @@ -53,7 +52,9 @@ static int finish_range(handle_t *handle, struct inode *inode, retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); if (retval < 0) goto err_out; - retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); + path = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + if (IS_ERR(path)) + retval = PTR_ERR(path); err_out: up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); @@ -663,8 +664,8 @@ int ext4_ind_migrate(struct inode *inode) if (unlikely(ret2 && !ret)) ret = ret2; errout: - ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0aaf38ffcb6e..6f57c181ff77 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -14,14 +14,14 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); @@ -57,16 +57,12 @@ static int write_mmp_block_thawed(struct super_block *sb, static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { - int err; - /* * We protect against freezing so that we don't create dirty buffers * on frozen filesystem. 
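 * The scoped_guard() below is roughly equivalent to the removed
 * pairing (illustrative expansion):
 *
 *	sb_start_write(sb);
 *	err = write_mmp_block_thawed(sb, bh);
 *	sb_end_write(sb);
 *	return err;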
*/ - sb_start_write(sb); - err = write_mmp_block_thawed(sb, bh); - sb_end_write(sb); - return err; + scoped_guard(super_write, sb) + return write_mmp_block_thawed(sb, bh); } /* @@ -94,7 +90,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } lock_buffer(*bh); - ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL); + ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false); if (ret) goto warn_exit; @@ -162,7 +158,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !sb_rdonly(sb)) { + while (!kthread_should_stop() && !ext4_emergency_state(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); @@ -231,9 +227,9 @@ static int kmmpd(void *data) * Adjust the mmp_check_interval depending on how much time * it took for the MMP block to be written. */ - mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, - EXT4_MMP_MAX_CHECK_INTERVAL), - EXT4_MMP_MIN_CHECK_INTERVAL); + mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MIN_CHECK_INTERVAL, + EXT4_MMP_MAX_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 18a9e7c47975..0550fd30fd10 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -13,32 +13,14 @@ #include "ext4.h" #include "ext4_extents.h" -/** - * get_ext_path() - Find an extent path for designated logical block number. - * @inode: inode to be searched - * @lblock: logical block number to find an extent path - * @ppath: pointer to an extent path pointer (for output) - * - * ext4_find_extent wrapper. Return 0 on success, or a negative error value - * on failure. - */ -static inline int -get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **ppath) -{ - struct ext4_ext_path *path; - - path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); - if (IS_ERR(path)) - return PTR_ERR(path); - if (path[ext_depth(inode)].p_ext == NULL) { - ext4_free_ext_path(path); - *ppath = NULL; - return -ENODATA; - } - *ppath = path; - return 0; -} +#include <trace/events/ext4.h> + +struct mext_data { + struct inode *orig_inode; /* Origin file inode */ + struct inode *donor_inode; /* Donor file inode */ + struct ext4_map_blocks orig_map;/* Origin file's move mapping */ + ext4_lblk_t donor_lblk; /* Start block of the donor file */ +}; /** * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem @@ -56,7 +38,6 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second) } else { down_write(&EXT4_I(second)->i_data_sem); down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); - } } @@ -75,57 +56,14 @@ ext4_double_up_write_data_sem(struct inode *orig_inode, up_write(&EXT4_I(donor_inode)->i_data_sem); } -/** - * mext_check_coverage - Check that all extents in range has the same type - * - * @inode: inode in question - * @from: block offset of inode - * @count: block count to be checked - * @unwritten: extents expected to be unwritten - * @err: pointer to save error value - * - * Return 1 if all extents in range has expected type, and zero otherwise. 
- */ -static int -mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, - int unwritten, int *err) -{ - struct ext4_ext_path *path = NULL; - struct ext4_extent *ext; - int ret = 0; - ext4_lblk_t last = from + count; - while (from < last) { - *err = get_ext_path(inode, from, &path); - if (*err) - goto out; - ext = path[ext_depth(inode)].p_ext; - if (unwritten != ext4_ext_is_unwritten(ext)) - goto out; - from += ext4_ext_get_actual_len(ext); - } - ret = 1; -out: - ext4_free_ext_path(path); - return ret; -} - -/** - * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 - * - * @inode1: the inode structure - * @inode2: the inode structure - * @index1: folio index - * @index2: folio index - * @folio: result folio vector - * - * Grab two locked folio for inode's by inode order - */ -static int -mext_folio_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index1, pgoff_t index2, struct folio *folio[2]) +/* Grab and lock folio on both @inode1 and @inode2 by inode order. */ +static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index1, pgoff_t index2, size_t len, + struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; + fgf_t fgp_flags = FGP_WRITEBEGIN; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -138,14 +76,15 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, + fgp_flags |= fgf_set_order(len); + folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, mapping_gfp_mask(mapping[0])); if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); return PTR_ERR(folio[0]); } - folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, + folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); if (IS_ERR(folio[1])) { @@ -166,15 +105,24 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, return 0; } -/* Force page buffers uptodate w/o dropping page's lock */ -static int -mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) +static void mext_folio_double_unlock(struct folio *folio[2]) +{ + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); +} + +/* Force folio buffers uptodate w/o dropping folio's lock */ +static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to) { struct inode *inode = folio->mapping->host; sector_t block; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + struct buffer_head *bh, *head; unsigned int blocksize, block_start, block_end; - int i, err, nr = 0, partial = 0; + int nr = 0; + bool partial = false; + BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -183,317 +131,369 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) blocksize = i_blocksize(inode); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } - - block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); - for (bh = head, block_start = 0; bh != head || !block_start; - block++, block_start = block_end, bh = bh->b_this_page) { + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + + block = folio_pos(folio) >> inode->i_blkbits; + block_end = 0; + bh = head; + do { + block_start = block_end; block_end = block_start + blocksize; if (block_end <= from || 
block_start >= to) { if (!buffer_uptodate(bh)) - partial = 1; + partial = true; continue; } if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { - err = ext4_get_block(inode, block, bh, 0); - if (err) { - folio_set_error(folio); + int err = ext4_get_block(inode, block, bh, 0); + if (err) return err; - } if (!buffer_mapped(bh)) { folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); continue; } } - BUG_ON(nr >= MAX_BUF_PER_PAGE); - arr[nr++] = bh; - } + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + ext4_read_bh_nowait(bh, 0, NULL, false); + nr++; + } while (block++, (bh = bh->b_this_page) != head); + /* No io required */ if (!nr) goto out; - for (i = 0; i < nr; i++) { - bh = arr[i]; - if (!bh_uptodate_or_lock(bh)) { - err = ext4_read_bh(bh, 0, NULL); - if (err) - return err; - } - } + bh = head; + do { + if (bh_offset(bh) + blocksize <= from) + continue; + if (bh_offset(bh) >= to) + break; + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + continue; + return -EIO; + } while ((bh = bh->b_this_page) != head); out: if (!partial) folio_mark_uptodate(folio); return 0; } -/** - * move_extent_per_page - Move extent data per page - * - * @o_filp: file structure of original file - * @donor_inode: donor inode - * @orig_page_offset: page index on original file - * @donor_page_offset: page index on donor file - * @data_offset_in_page: block index where data swapping starts - * @block_len_in_page: the number of blocks to be swapped - * @unwritten: orig extent is unwritten or not - * @err: pointer to save return value - * - * Save the data in original inode blocks and replace original inode extents - * with donor inode extents by calling ext4_swap_extents(). - * Finally, write out the saved data in new original inode blocks. Return - * replaced block count. +enum mext_move_type {MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, MEXT_COPY_DATA}; + +/* + * Start to move extent between the origin inode and the donor inode, + * hold one folio for each inode and check the candidate moving extent + * mapping status again. 
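+ * On success, *move_type reflects the re-checked mapping state (see
+ * the classification at the end of this function):
+ *	MEXT_SKIP_EXTENT - donor range is a hole or delalloc extent
+ *	MEXT_MOVE_EXTENT - both ranges unwritten, swap without copying
+ *	MEXT_COPY_DATA   - data must be copied through the page cache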
*/ -static int -move_extent_per_page(struct file *o_filp, struct inode *donor_inode, - pgoff_t orig_page_offset, pgoff_t donor_page_offset, - int data_offset_in_page, - int block_len_in_page, int unwritten, int *err) +static int mext_move_begin(struct mext_data *mext, struct folio *folio[2], + enum mext_move_type *move_type) { - struct inode *orig_inode = file_inode(o_filp); - struct folio *folio[2] = {NULL, NULL}; - handle_t *handle; - ext4_lblk_t orig_blk_offset, donor_blk_offset; - unsigned long blocksize = orig_inode->i_sb->s_blocksize; - unsigned int tmp_data_size, data_size, replaced_size; - int i, err2, jblocks, retries = 0; - int replaced_count = 0; - int from = data_offset_in_page << orig_inode->i_blkbits; - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; - struct super_block *sb = orig_inode->i_sb; - struct buffer_head *bh = NULL; + struct inode *orig_inode = mext->orig_inode; + struct inode *donor_inode = mext->donor_inode; + unsigned int blkbits = orig_inode->i_blkbits; + struct ext4_map_blocks donor_map = {0}; + loff_t orig_pos, donor_pos; + size_t move_len; + int ret; + + orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits; + donor_pos = ((loff_t)mext->donor_lblk) << blkbits; + ret = mext_folio_double_lock(orig_inode, donor_inode, + orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT, + ((size_t)mext->orig_map.m_len) << blkbits, folio); + if (ret) + return ret; /* - * It needs twice the amount of ordinary journal buffers because - * inode and donor_inode may change each different metadata blocks. + * Check the origin inode's mapping information again under the + * folio lock, as we do not hold the i_data_sem at all times, and + * it may change during the concurrent write-back operation. */ -again: - *err = 0; - jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; - handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); + if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) { + ret = -ESTALE; + goto error; + } + + /* Adjust the moving length according to the length of shorter folio. */ + move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos, + folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos); + move_len >>= blkbits; + if (move_len < mext->orig_map.m_len) + mext->orig_map.m_len = move_len; + + donor_map.m_lblk = mext->donor_lblk; + donor_map.m_len = mext->orig_map.m_len; + donor_map.m_flags = 0; + ret = ext4_map_blocks(NULL, donor_inode, &donor_map, 0); + if (ret < 0) + goto error; + + /* Adjust the moving length according to the donor mapping length. */ + mext->orig_map.m_len = donor_map.m_len; + + /* Skip moving if the donor range is a hole or a delalloc extent. */ + if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) + *move_type = MEXT_SKIP_EXTENT; + /* If both mapping ranges are unwritten, no need to copy data. */ + else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) && + (donor_map.m_flags & EXT4_MAP_UNWRITTEN)) + *move_type = MEXT_MOVE_EXTENT; + else + *move_type = MEXT_COPY_DATA; + + return 0; +error: + mext_folio_double_unlock(folio); + return ret; +} + +/* + * Re-create the new moved mapping buffers of the original inode and commit + * the entire written range. 
+ */ +static int mext_folio_mkwrite(struct inode *inode, struct folio *folio, + size_t from, size_t to) +{ + unsigned int blocksize = i_blocksize(inode); + struct buffer_head *bh, *head; + size_t block_start, block_end; + sector_t block; + int ret; + + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + + block = folio_pos(folio) >> inode->i_blkbits; + block_end = 0; + bh = head; + do { + block_start = block_end; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) + continue; + + ret = ext4_get_block(inode, block, bh, 0); + if (ret) + return ret; + } while (block++, (bh = bh->b_this_page) != head); + + block_commit_write(folio, from, to); + return 0; +} + +/* + * Save the data in original inode extent blocks and replace one folio size + * aligned original inode extent with one whole or one partial donor inode extent, + * and then write out the saved data in new original inode blocks. Pass out + * the replaced block count through m_len. Return 0 on success, and an error + * code otherwise. + */ +static int mext_move_extent(struct mext_data *mext, u64 *m_len) +{ + struct inode *orig_inode = mext->orig_inode; + struct inode *donor_inode = mext->donor_inode; + struct ext4_map_blocks *orig_map = &mext->orig_map; + unsigned int blkbits = orig_inode->i_blkbits; + struct folio *folio[2] = {NULL, NULL}; + loff_t from, length; + enum mext_move_type move_type = 0; + handle_t *handle; + u64 r_len = 0; + unsigned int credits; + int ret, ret2; + + *m_len = 0; + trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode, + mext->donor_lblk); + credits = ext4_chunk_trans_extent(orig_inode, 0) * 2; + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits); if (IS_ERR(handle)) { - *err = PTR_ERR(handle); - return 0; + ret = PTR_ERR(handle); + goto out; } - orig_blk_offset = orig_page_offset * blocks_per_page + - data_offset_in_page; - - donor_blk_offset = donor_page_offset * blocks_per_page + - data_offset_in_page; - - /* Calculate data_size */ - if ((orig_blk_offset + block_len_in_page - 1) == - ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { - /* Replace the last block */ - tmp_data_size = orig_inode->i_size & (blocksize - 1); - /* - * If data_size equal zero, it shows data_size is multiples of - * blocksize. So we set appropriate value. - */ - if (tmp_data_size == 0) - tmp_data_size = blocksize; - - data_size = tmp_data_size + - ((block_len_in_page - 1) << orig_inode->i_blkbits); - } else - data_size = block_len_in_page << orig_inode->i_blkbits; - - replaced_size = data_size; - - *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, - donor_page_offset, folio); - if (unlikely(*err < 0)) - goto stop_journal; + ret = mext_move_begin(mext, folio, &move_type); + if (ret) + goto stop_handle; + + if (move_type == MEXT_SKIP_EXTENT) + goto unlock; + /* - * If orig extent was unwritten it can become initialized - * at any time after i_data_sem was dropped, in order to - * serialize with delalloc we have recheck extent while we - * hold page's lock, if it is still the case data copy is not - * necessary, just swap data blocks between orig and donor. + * Copy the data. First, read the original inode data into the page + * cache. Then, release the existing mapping relationships and swap + * the extent. Finally, re-establish the new mapping relationships + * and dirty the page cache. 
*/ + if (move_type == MEXT_COPY_DATA) { + from = offset_in_folio(folio[0], + ((loff_t)orig_map->m_lblk) << blkbits); + length = ((loff_t)orig_map->m_len) << blkbits; - VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); - VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); - VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); - - if (unwritten) { - ext4_double_down_write_data_sem(orig_inode, donor_inode); - /* If any of extents in range became initialized we have to - * fallback to data copying */ - unwritten = mext_check_coverage(orig_inode, orig_blk_offset, - block_len_in_page, 1, err); - if (*err) - goto drop_data_sem; - - unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, - block_len_in_page, 1, err); - if (*err) - goto drop_data_sem; - - if (!unwritten) { - ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto data_copy; - } - if (!filemap_release_folio(folio[0], 0) || - !filemap_release_folio(folio[1], 0)) { - *err = -EBUSY; - goto drop_data_sem; - } - replaced_count = ext4_swap_extents(handle, orig_inode, - donor_inode, orig_blk_offset, - donor_blk_offset, - block_len_in_page, 1, err); - drop_data_sem: - ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_folios; + ret = mext_folio_mkuptodate(folio[0], from, from + length); + if (ret) + goto unlock; } -data_copy: - *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); - if (*err) - goto unlock_folios; - /* At this point all buffers in range are uptodate, old mapping layout - * is no longer required, try to drop it now. */ if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { - *err = -EBUSY; - goto unlock_folios; + ret = -EBUSY; + goto unlock; } + + /* Move extent */ ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, - orig_blk_offset, donor_blk_offset, - block_len_in_page, 1, err); + *m_len = ext4_swap_extents(handle, orig_inode, donor_inode, + orig_map->m_lblk, mext->donor_lblk, + orig_map->m_len, 1, &ret); ext4_double_up_write_data_sem(orig_inode, donor_inode); - if (*err) { - if (replaced_count) { - block_len_in_page = replaced_count; - replaced_size = - block_len_in_page << orig_inode->i_blkbits; - } else - goto unlock_folios; - } - /* Perform all necessary steps similar write_begin()/write_end() - * but keeping in mind that i_size will not change */ - if (!folio_buffers(folio[0])) - create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); - bh = folio_buffers(folio[0]); - for (i = 0; i < data_offset_in_page; i++) - bh = bh->b_this_page; - for (i = 0; i < block_len_in_page; i++) { - *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); - if (*err < 0) - goto repair_branches; - bh = bh->b_this_page; - } - block_commit_write(&folio[0]->page, from, from + replaced_size); + /* A short-length swap cannot occur after a successful swap extent. 
*/ + if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len))) + ret = -EIO; - /* Even in case of data=writeback it is reasonable to pin - * inode to transaction, to prevent unexpected data loss */ - *err = ext4_jbd2_inode_add_write(handle, orig_inode, - (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); + if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT)) + goto unlock; -unlock_folios: - folio_unlock(folio[0]); - folio_put(folio[0]); - folio_unlock(folio[1]); - folio_put(folio[1]); -stop_journal: + /* Copy data */ + length = (*m_len) << blkbits; + ret2 = mext_folio_mkwrite(orig_inode, folio[0], from, from + length); + if (ret2) { + if (!ret) + ret = ret2; + goto repair_branches; + } + /* + * Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss. + */ + ret2 = ext4_jbd2_inode_add_write(handle, orig_inode, + ((loff_t)orig_map->m_lblk) << blkbits, length); + if (!ret) + ret = ret2; +unlock: + mext_folio_double_unlock(folio); +stop_handle: ext4_journal_stop(handle); - if (*err == -ENOSPC && - ext4_should_retry_alloc(sb, &retries)) - goto again; - /* Buffer was busy because probably is pinned to journal transaction, - * force transaction commit may help to free it. */ - if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) - goto again; - return replaced_count; +out: + trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode, + mext->donor_lblk, orig_map->m_len, *m_len, + move_type, ret); + return ret; repair_branches: - /* - * This should never ever happen! - * Extents are swapped already, but we are not able to copy data. - * Try to swap extents to it's original places - */ - ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, - orig_blk_offset, donor_blk_offset, - block_len_in_page, 0, &err2); - ext4_double_up_write_data_sem(orig_inode, donor_inode); - if (replaced_count != block_len_in_page) { - ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), - EIO, "Unable to copy data block," - " data will be lost."); - *err = -EIO; + ret2 = 0; + r_len = ext4_swap_extents(handle, donor_inode, orig_inode, + mext->donor_lblk, orig_map->m_lblk, + *m_len, 0, &ret2); + if (ret2 || r_len != *m_len) { + ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk), + EIO, "Unable to copy data block, data will be lost!"); + ret = -EIO; } - replaced_count = 0; - goto unlock_folios; + *m_len = 0; + goto unlock; } -/** - * mext_check_arguments - Check whether move extent can be done - * - * @orig_inode: original inode - * @donor_inode: donor inode - * @orig_start: logical start offset in block for orig - * @donor_start: logical start offset in block for donor - * @len: the number of blocks to be moved - * - * Check the arguments of ext4_move_extents() whether the files can be - * exchanged with each other. - * Return 0 on success, or a negative error value on failure. +/* + * Check the validity of the basic filesystem environment and the + * inodes' support status. 
 */ -static int -mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len) +static int mext_check_validity(struct inode *orig_inode, + struct inode *donor_inode) { - __u64 orig_eof, donor_eof; - unsigned int blkbits = orig_inode->i_blkbits; - unsigned int blocksize = 1 << blkbits; + struct super_block *sb = orig_inode->i_sb; + + /* origin and donor should be different inodes */ + if (orig_inode == donor_inode) { + ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* origin and donor should belong to the same filesystem */ + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (IS_DAX(orig_inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + return -EOPNOTSUPP; + } + + /* + * TODO: it's not obvious how to swap blocks for inodes with full + * journaling enabled. + */ + if (ext4_should_journal_data(orig_inode) || + ext4_should_journal_data(donor_inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with data journaling"); + return -EOPNOTSUPP; + } - orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; - donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; + if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) || + !(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported for non-extent files"); + return -EOPNOTSUPP; + } if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { - ext4_debug("ext4 move extent: suid or sgid is set" - " to donor file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) { + ext4_debug("ext4 move extent: donor should not be an immutable or append-only file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); return -EPERM; + } /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); + orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } - if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { + if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, 
donor_inode->i_ino); - return -EOPNOTSUPP; - } - - /* Ext4 move extent supports only extent based file */ - if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { - ext4_debug("ext4 move extent: orig file is not extents " - "based file [ino:orig %lu]\n", orig_inode->i_ino); - return -EOPNOTSUPP; - } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { - ext4_debug("ext4 move extent: donor file is not extents " - "based file [ino:donor %lu]\n", donor_inode->i_ino); + orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } @@ -502,12 +502,25 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + return 0; +} + +/* + * Check the moving range of ext4_move_extents() whether the files can be + * exchanged with each other, and adjust the length to fit within the file + * size. Return 0 on success, or a negative error value on failure. + */ +static int mext_check_adjust_range(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) +{ + __u64 orig_eof, donor_eof; + /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { - ext4_debug("ext4 move extent: orig and donor's start " - "offsets are not aligned [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); + ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -516,11 +529,14 @@ mext_check_arguments(struct inode *orig_inode, (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { - ext4_debug("ext4 move extent: Can't handle over [%u] blocks " - "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, - orig_inode->i_ino, donor_inode->i_ino); + ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n", + EXT_MAX_BLOCKS, + orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } + + orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode)); + donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode)); if (orig_eof <= orig_start) *len = 0; else if (orig_eof < orig_start + *len - 1) @@ -530,9 +546,8 @@ mext_check_arguments(struct inode *orig_inode, else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { - ext4_debug("ext4 move extent: len should not be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); + ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -551,151 +566,89 @@ mext_check_arguments(struct inode *orig_inode, * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. 
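For orientation, this path is normally driven from userspace through the EXT4_IOC_MOVE_EXT ioctl; e4defrag preallocates a (hopefully contiguous) donor file and asks the kernel to swap its blocks into the fragmented file. Below is a rough, self-contained sketch of such a caller: the struct layout and ioctl number mirror struct move_extent and EXT4_IOC_MOVE_EXT in fs/ext4/ext4.h, while the paths and block counts are placeholders.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/types.h>

/* Mirrors struct move_extent in fs/ext4/ext4.h. */
struct move_extent {
	__u32 reserved;		/* must be zero */
	__u32 donor_fd;		/* fd of the donor file */
	__u64 orig_start;	/* logical start block in the original file */
	__u64 donor_start;	/* logical start block in the donor file */
	__u64 len;		/* number of blocks to move */
	__u64 moved_len;	/* out: blocks actually moved */
};
#define EXT4_IOC_MOVE_EXT	_IOWR('f', 15, struct move_extent)

int main(void)
{
	/* Placeholder paths; the donor would normally be created and
	 * preallocated (e.g. via fallocate()) on the same filesystem. */
	int orig = open("/mnt/ext4/fragmented", O_RDWR);
	int donor = open("/mnt/ext4/donor", O_RDWR);
	struct move_extent me = {
		.donor_fd = donor,
		.orig_start = 0,
		.donor_start = 0,
		.len = 1024,
	};

	if (orig < 0 || donor < 0)
		return 1;
	if (ioctl(orig, EXT4_IOC_MOVE_EXT, &me) < 0) {
		perror("EXT4_IOC_MOVE_EXT");
		return 1;
	}
	printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
	return 0;
}

Note that moved_len is filled in even when the ioctl fails partway through, matching the moved_len contract documented above.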
- * */ -int -ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, - __u64 donor_blk, __u64 len, __u64 *moved_len) +int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, + __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); - struct ext4_ext_path *path = NULL; - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; - ext4_lblk_t o_end, o_start = orig_blk; - ext4_lblk_t d_start = donor_blk; + struct mext_data mext; + struct super_block *sb = orig_inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int retries = 0; + u64 m_len; int ret; - if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files " - "should be in same FS [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* orig and donor should be different inodes */ - if (orig_inode == donor_inode) { - ext4_debug("ext4 move extent: The argument files should not " - "be same inode [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* Regular file check */ - if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { - ext4_debug("ext4 move extent: The argument files should be " - "regular file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* TODO: it's not obvious how to swap blocks for inodes with full - journaling enabled */ - if (ext4_should_journal_data(orig_inode) || - ext4_should_journal_data(donor_inode)) { - ext4_msg(orig_inode->i_sb, KERN_ERR, - "Online defrag not supported with data journaling"); - return -EOPNOTSUPP; - } - - if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { - ext4_msg(orig_inode->i_sb, KERN_ERR, - "Online defrag not supported for encrypted files"); - return -EOPNOTSUPP; - } + *moved_len = 0; /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); + ret = mext_check_validity(orig_inode, donor_inode); + if (ret) + goto out; + /* Wait for all existing dio workers */ inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); - /* Protect extent tree against block allocations via delalloc */ - ext4_double_down_write_data_sem(orig_inode, donor_inode); - /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, - donor_blk, &len); + /* Check and adjust the specified move_extent range. 
*/ + ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk, + donor_blk, &len); if (ret) goto out; - o_end = o_start + len; - while (o_start < o_end) { - struct ext4_extent *ex; - ext4_lblk_t cur_blk, next_blk; - pgoff_t orig_page_index, donor_page_index; - int offset_in_page; - int unwritten, cur_len; + mext.orig_inode = orig_inode; + mext.donor_inode = donor_inode; + while (len) { + mext.orig_map.m_lblk = orig_blk; + mext.orig_map.m_len = len; + mext.orig_map.m_flags = 0; + mext.donor_lblk = donor_blk; - ret = get_ext_path(orig_inode, o_start, &path); - if (ret) + ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0); + if (ret < 0) goto out; - ex = path[path->p_depth].p_ext; - cur_blk = le32_to_cpu(ex->ee_block); - cur_len = ext4_ext_get_actual_len(ex); - /* Check hole before the start pos */ - if (cur_blk + cur_len - 1 < o_start) { - next_blk = ext4_ext_next_allocated_block(path); - if (next_blk == EXT_MAX_BLOCKS) { - ret = -ENODATA; - goto out; + + /* Skip moving if it is a hole or a delalloc extent. */ + if (mext.orig_map.m_flags & + (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) { + ret = mext_move_extent(&mext, &m_len); + *moved_len += m_len; + if (!ret) + goto next; + + /* Move failed or partially failed. */ + if (m_len) { + orig_blk += m_len; + donor_blk += m_len; + len -= m_len; } - d_start += next_blk - o_start; - o_start = next_blk; - continue; - /* Check hole after the start pos */ - } else if (cur_blk > o_start) { - /* Skip hole */ - d_start += cur_blk - o_start; - o_start = cur_blk; - /* Extent inside requested range ?*/ - if (cur_blk >= o_end) - goto out; - } else { /* in_range(o_start, o_blk, o_len) */ - cur_len += cur_blk - o_start; + if (ret == -ESTALE) + continue; + if (ret == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + continue; + if (ret == -EBUSY && + sbi->s_journal && retries++ < 4 && + jbd2_journal_force_commit_nested(sbi->s_journal)) + continue; + + goto out; } - unwritten = ext4_ext_is_unwritten(ex); - if (o_end - o_start < cur_len) - cur_len = o_end - o_start; - - orig_page_index = o_start >> (PAGE_SHIFT - - orig_inode->i_blkbits); - donor_page_index = d_start >> (PAGE_SHIFT - - donor_inode->i_blkbits); - offset_in_page = o_start % blocks_per_page; - if (cur_len > blocks_per_page - offset_in_page) - cur_len = blocks_per_page - offset_in_page; - /* - * Up semaphore to avoid following problems: - * a. transaction deadlock among ext4_journal_start, - * ->write_begin via pagefault, and jbd2_journal_commit - * b. 
racing with ->read_folio, ->write_begin, and - * ext4_get_block in move_extent_per_page - */ - ext4_double_up_write_data_sem(orig_inode, donor_inode); - /* Swap original branches with new branches */ - move_extent_per_page(o_filp, donor_inode, - orig_page_index, donor_page_index, - offset_in_page, cur_len, - unwritten, &ret); - ext4_double_down_write_data_sem(orig_inode, donor_inode); - if (ret < 0) - break; - o_start += cur_len; - d_start += cur_len; +next: + orig_blk += mext.orig_map.m_len; + donor_blk += mext.orig_map.m_len; + len -= mext.orig_map.m_len; + retries = 0; } - *moved_len = o_start - orig_blk; - if (*moved_len > len) - *moved_len = len; out: if (*moved_len) { - ext4_discard_preallocations(orig_inode, 0); - ext4_discard_preallocations(donor_inode, 0); + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); } - ext4_free_ext_path(path); - ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); - return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 933ad03f4f58..c4b5e252af0e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -151,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, return bh; } - if (!bh && (type == INDEX || type == DIRENT_HTREE)) { + /* The first directory block must not be a hole. */ + if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) { ext4_error_inode(inode, func, line, block, - "Directory hole found for htree %s block", - (type == INDEX) ? "index" : "leaf"); + "Directory hole found for htree %s block %u", + (type == INDEX) ? "index" : "leaf", block); return ERR_PTR(-EFSCORRUPTED); } if (!bh) @@ -175,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, brelse(bh); return ERR_PTR(-EFSCORRUPTED); } - if (!ext4_has_metadata_csum(inode->i_sb) || + if (!ext4_has_feature_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -290,36 +291,6 @@ struct dx_tail { __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; -static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); -static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash(struct dx_entry *entry); -static void dx_set_hash(struct dx_entry *entry, unsigned value); -static unsigned dx_get_count(struct dx_entry *entries); -static unsigned dx_get_limit(struct dx_entry *entries); -static void dx_set_count(struct dx_entry *entries, unsigned value); -static void dx_set_limit(struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit(struct inode *dir, unsigned infosize); -static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(struct ext4_filename *fname, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame); -static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct inode *dir, struct buffer_head *bh, - struct dx_hash_info *hinfo, - struct dx_map_entry *map_tail); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, - char *to, struct dx_map_entry *offsets, - int count, unsigned int blocksize); -static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, - unsigned int blocksize); -static void dx_insert_block(struct dx_frame *frame, - u32 hash, ext4_lblk_t block); -static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - 
__u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); @@ -343,17 +314,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + - (EXT4_BLOCK_SIZE(inode->i_sb) - - sizeof(struct ext4_dir_entry_tail))); - while (d < top && d->rec_len) + (blocksize - sizeof(struct ext4_dir_entry_tail))); + while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + - le16_to_cpu(d->rec_len)); + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; @@ -364,7 +335,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, #endif if (t->det_reserved_zero1 || - le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != + sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; @@ -374,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } @@ -396,7 +367,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); @@ -417,7 +388,7 @@ static void ext4_dirblock_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); @@ -445,13 +416,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); - if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + if (rlen == blocksize) count_offset = 8; - else if (le16_to_cpu(dirent->rec_len) == 12) { + else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); - if (le16_to_cpu(dp->rec_len) != - EXT4_BLOCK_SIZE(inode->i_sb) - 12) + if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || @@ -469,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; @@ -477,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = 
ext4_chksum(sbi, csum, (__u8 *)t, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } @@ -491,7 +462,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -520,7 +491,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -609,7 +580,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -619,7 +590,7 @@ static inline unsigned dx_node_limit(struct inode *dir) unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -1073,7 +1044,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; - int csum = ext4_has_metadata_csum(dir->i_sb); + int csum = ext4_has_feature_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1105,7 +1076,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + EXT4_LBLK_TO_B(dir, block) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; @@ -1315,8 +1286,9 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; + int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { @@ -1335,11 +1307,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; - map_tail->size = le16_to_cpu(de->rec_len); + map_tail->size = ext4_rec_len_from_disk(de->rec_len, + blocksize); count++; cond_resched(); } - de = ext4_next_entry(de, dir->i_sb->s_blocksize); + de = ext4_next_entry(de, blocksize); } return count; } @@ -1386,83 +1359,32 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) } #if IS_ENABLED(CONFIG_UNICODE) -/* - * Test whether a case-insensitive directory entry matches the filename - * being searched for. If quick is set, assume the name being looked up - * is already in the casefolded form. 
- * - * Returns: 0 if the directory entry matches, more than 0 if it - * doesn't match or less than zero on error. - */ -static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, - u8 *de_name, size_t de_name_len, bool quick) -{ - const struct super_block *sb = parent->i_sb; - const struct unicode_map *um = sb->s_encoding; - struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); - struct qstr entry = QSTR_INIT(de_name, de_name_len); - int ret; - - if (IS_ENCRYPTED(parent)) { - const struct fscrypt_str encrypted_name = - FSTR_INIT(de_name, de_name_len); - - decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); - if (!decrypted_name.name) - return -ENOMEM; - ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, - &decrypted_name); - if (ret < 0) - goto out; - entry.name = decrypted_name.name; - entry.len = decrypted_name.len; - } - - if (quick) - ret = utf8_strncasecmp_folded(um, name, &entry); - else - ret = utf8_strncasecmp(um, name, &entry); - if (ret < 0) { - /* Handle invalid character sequence as either an error - * or as an opaque byte sequence. - */ - if (sb_has_strict_encoding(sb)) - ret = -EINVAL; - else if (name->len != entry.len) - ret = 1; - else - ret = !!memcmp(name->name, entry.name, entry.len); - } -out: - kfree(decrypted_name.name); - return ret; -} - int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { - struct fscrypt_str *cf_name = &name->cf_name; + struct qstr *cf_name = &name->cf_name; + unsigned char *buf; struct dx_hash_info *hinfo = &name->hinfo; int len; - if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || + if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; } - cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); - if (!cf_name->name) + buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); + if (!buf) return -ENOMEM; - len = utf8_casefold(dir->i_sb->s_encoding, - iname, cf_name->name, - EXT4_NAME_LEN); + len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); if (len <= 0) { - kfree(cf_name->name); - cf_name->name = NULL; + kfree(buf); + buf = NULL; } + cf_name->name = buf; cf_name->len = (unsigned) len; + if (!IS_ENCRYPTED(dir)) return 0; @@ -1496,24 +1418,32 @@ static bool ext4_match(struct inode *parent, #endif #if IS_ENABLED(CONFIG_UNICODE) - if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && + if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { - if (fname->cf_name.name) { - struct qstr cf = {.name = fname->cf_name.name, - .len = fname->cf_name.len}; - if (IS_ENCRYPTED(parent)) { - if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || - fname->hinfo.minor_hash != - EXT4_DIRENT_MINOR_HASH(de)) { - - return false; - } - } - return !ext4_ci_compare(parent, &cf, de->name, - de->name_len, true); - } - return !ext4_ci_compare(parent, fname->usr_fname, de->name, - de->name_len, false); + /* + * Just checking IS_ENCRYPTED(parent) below is not + * sufficient to decide whether one can use the hash for + * skipping the string comparison, because the key might + * have been added right after + * ext4_fname_setup_ci_filename(). In this case, a hash + * mismatch will be a false negative. Therefore, make + * sure cf_name was properly initialized before + * considering the calculated hash. 
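The pitfall described here reduces to a simple rule: a stored hash can only short-circuit a case-insensitive comparison when both sides were hashed over the same casefolded form. A toy, self-contained illustration using plain ASCII folding in place of the kernel's UTF-8 casefold (the names and helpers here are invented):

#include <ctype.h>
#include <stddef.h>

/* Toy stand-in for the dirent name hash: djb2 over the folded name. */
static unsigned int fold_hash(const char *s, size_t len)
{
	unsigned int h = 5381;

	while (len--)
		h = h * 33 + tolower((unsigned char)*s++);
	return h;
}

/*
 * Return 1 on a case-insensitive match. The per-entry hash rejects most
 * non-matching entries without a byte-wise comparison, but only because
 * both hashes cover the folded form; hashing a raw, un-folded lookup
 * name instead would reproduce the false-negative problem described
 * above.
 */
static int ci_match(const char *name, size_t nlen,
		    const char *entry, size_t elen, unsigned int entry_hash)
{
	size_t i;

	if (fold_hash(name, nlen) != entry_hash)
		return 0;	/* cheap rejection */
	if (nlen != elen)
		return 0;
	for (i = 0; i < nlen; i++)
		if (tolower((unsigned char)name[i]) !=
		    tolower((unsigned char)entry[i]))
			return 0;
	return 1;
}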
+ */ + if (sb_no_casefold_compat_fallback(parent->i_sb) && + IS_ENCRYPTED(parent) && fname->cf_name.name && + (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || + fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) + return false; + /* + * Treat comparison errors as not a match. The + * only case where it happens is on a disk + * corruption or ENOMEM. + */ + + return generic_ci_match(parent, fname->usr_fname, + &fname->cf_name, de->name, + de->name_len) > 0; } #endif @@ -1521,7 +1451,7 @@ static bool ext4_match(struct inode *parent, } /* - * Returns 0 if not found, -1 on failure, and 1 on success + * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, @@ -1542,7 +1472,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, buf_size, offset)) - return -1; + return -EFSCORRUPTED; *res_dir = de; return 1; } @@ -1550,7 +1480,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) - return -1; + return -EFSCORRUPTED; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } @@ -1613,7 +1543,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, &has_inline_data); if (inlined) *inlined = has_inline_data; - if (has_inline_data) + if (has_inline_data || IS_ERR(ret)) goto cleanup_and_exit; } @@ -1634,10 +1564,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + else if (!sb_no_casefold_compat_fallback(dir->i_sb) && + *res_dir == NULL && IS_CASEFOLDED(dir)) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " + "failed, falling back\n")); + else goto cleanup_and_exit; - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " - "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); @@ -1695,15 +1630,17 @@ restart: } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, - block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + EXT4_LBLK_TO_B(dir, block), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; goto cleanup_and_exit; } else { brelse(bh); - if (i < 0) + if (i < 0) { + ret = ERR_PTR(i); goto cleanup_and_exit; + } } next: if (++block >= nblocks) @@ -1758,7 +1695,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) @@ -1774,7 +1710,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { - struct super_block * sb = dir->i_sb; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; @@ -1785,7 +1720,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) - return (struct buffer_head *) frame; + return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); @@ 
-1793,12 +1728,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, goto errout; retval = search_dirblock(bh, dir, fname, - block << EXT4_BLOCK_SIZE_BITS(sb), - res_dir); + EXT4_LBLK_TO_B(dir, block), res_dir); if (retval == 1) goto success; brelse(bh); - if (retval == -1) { + if (retval < 0) { bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } @@ -1826,7 +1760,7 @@ success: static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) @@ -1866,8 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } } -#if IS_ENABLED(CONFIG_UNICODE) - if (!inode && IS_CASEFOLDED(dir)) { + if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry @@ -1875,7 +1808,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi */ return NULL; } -#endif + return d_splice_alias(inode, dentry); } @@ -1883,7 +1816,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - struct ext4_dir_entry_2 * de; + struct ext4_dir_entry_2 * de = NULL; struct buffer_head *bh; bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); @@ -1984,14 +1917,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - return (struct ext4_dir_entry_2 *) bh2; + return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); @@ -2034,11 +1967,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, * split it in half by count; each resulting block will have at least * half the space free. */ - if (i > 0) + if (i >= 0) split = count - move; else split = count/2; + if (WARN_ON_ONCE(split == 0)) { + /* Should never happen, but avoid out-of-bounds access below */ + ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, + "bad indexed directory? 
hash=%08x:%08x count=%d move=%u", + hinfo->hash, hinfo->minor_hash, count, move); + err = -EFSCORRUPTED; + goto out; + } + hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", @@ -2082,15 +2024,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, return de; journal_error: + ext4_std_error(dir->i_sb, err); +out: brelse(*bh); brelse(bh2); *bh = NULL; - ext4_std_error(dir->i_sb, err); return ERR_PTR(err); } -int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) @@ -2172,11 +2114,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, int csum_size = 0; int err, err2; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + err = ext4_find_dest_de(dir, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; @@ -2203,7 +2145,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); @@ -2214,6 +2156,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err ? err : err2; } +static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root) +{ + struct fake_dirent *fde; + const char *error_msg; + unsigned int rlen; + unsigned int blocksize = dir->i_sb->s_blocksize; + char *blockend = (char *)root + dir->i_sb->s_blocksize; + + fde = &root->dot; + if (unlikely(fde->name_len != 1)) { + error_msg = "invalid name_len for '.'"; + goto corrupted; + } + if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) { + error_msg = "invalid name for '.'"; + goto corrupted; + } + rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); + if (unlikely((char *)fde + rlen >= blockend)) { + error_msg = "invalid rec_len for '.'"; + goto corrupted; + } + + fde = &root->dotdot; + if (unlikely(fde->name_len != 2)) { + error_msg = "invalid name_len for '..'"; + goto corrupted; + } + if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) { + error_msg = "invalid name for '..'"; + goto corrupted; + } + rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); + if (unlikely((char *)fde + rlen >= blockend)) { + error_msg = "invalid rec_len for '..'"; + goto corrupted; + } + + return true; + +corrupted: + EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended", + error_msg); + return false; +} + /* * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. 
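Several of the sanity checks in this file (get_dirent_tail(), ext4_check_dx_root(), and ext4_check_dir_entry() as used above) come down to one invariant: a directory block is a chain of variable-length records, and each rec_len must advance the cursor without overshooting the block. A simplified, self-contained version of that walk follows; it assumes a little-endian host and skips the large-blocksize rec_len encoding that ext4_rec_len_from_disk() exists to handle.

#include <stddef.h>
#include <stdint.h>

/* On-disk layout of struct ext4_dir_entry_2 (fields little-endian). */
struct dirent2 {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

/* Return 0 if every record chains cleanly to the end of the block. */
static int walk_dirblock(const unsigned char *block, size_t blocksize)
{
	size_t offset = 0;

	while (offset < blocksize) {
		const struct dirent2 *de =
			(const struct dirent2 *)(block + offset);
		size_t rlen = de->rec_len;

		/* A record must cover its 8-byte header plus the name,
		 * stay 4-byte aligned, and never run past the block. */
		if (rlen < 8 || rlen < 8 + de->name_len ||
		    (rlen & 3) || offset + rlen > blocksize)
			return -1;
		offset += rlen;
	}
	return 0;
}

On a block carrying a checksum tail, the regular records must chain exactly to the tail's offset, which is how get_dirent_tail() lands on the ext4_dir_entry_tail.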
@@ -2235,7 +2223,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct fake_dirent *fde; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -2248,17 +2236,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, brelse(bh); return retval; } + root = (struct dx_root *) bh->b_data; + if (!ext4_check_dx_root(dir, root)) { + brelse(bh); + return -EFSCORRUPTED; + } /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); - if ((char *) de >= (((char *) root) + blocksize)) { - EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); - brelse(bh); - return -EFSCORRUPTED; - } len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ @@ -2276,8 +2264,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, - (data2 + (blocksize - csum_size) - - (char *) de))) { + (char *)de - data2)) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; @@ -2380,22 +2367,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#if IS_ENABLED(CONFIG_UNICODE) - if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; -#endif retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) @@ -2416,7 +2398,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? 
*/ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; @@ -2566,8 +2548,10 @@ again: BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); @@ -2578,8 +2562,10 @@ again: err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); @@ -2598,8 +2584,10 @@ again: dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); @@ -2624,8 +2612,10 @@ again: "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; @@ -2722,7 +2712,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); @@ -2799,6 +2789,7 @@ static int ext4_add_nondir(handle_t *handle, return err; } drop_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; @@ -2896,7 +2887,7 @@ retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 4 + EXT4_XATTR_TRANS_BLOCKS); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); @@ -2922,48 +2913,59 @@ err_unlock_inode: return err; } -struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len) +int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh, unsigned int parent_ino, + void *inline_buf, int inline_size) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data; + size_t blocksize = bh->b_size; + int csum_size = 0, header_size; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); - strcpy(de->name, "."); + memcpy(de->name, ".", 2); ext4_set_de_type(inode->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(parent_ino); de->name_len = 2; - if (!dotdot_real_len) - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + ext4_dir_rec_len(1, NULL)), - blocksize); - else + memcpy(de->name, "..", 3); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + if (inline_buf) { de->rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(de->name_len, NULL), blocksize); - strcpy(de->name, ".."); - ext4_set_de_type(inode->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); + header_size = (char *)de - bh->b_data; + 
memcpy((void *)de, inline_buf, inline_size); + ext4_update_final_de(bh->b_data, inline_size + header_size, + blocksize - csum_size); + } else { + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + ext4_dir_rec_len(1, NULL)), + blocksize); + } - return ext4_next_entry(de, blocksize); + if (csum_size) + ext4_initialize_dirent_tail(bh, blocksize); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + return ext4_handle_dirty_dirblock(handle, inode, bh); } int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; - struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; - unsigned int blocksize = dir->i_sb->s_blocksize; - int csum_size = 0; int err; - if (ext4_has_metadata_csum(dir->i_sb)) - csum_size = sizeof(struct ext4_dir_entry_tail); - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { err = ext4_try_create_inline_dir(handle, dir, inode); if (err < 0 && err != -ENOSPC) @@ -2972,39 +2974,30 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, goto out; } + set_nlink(inode, 2); inode->i_size = 0; dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); - de = (struct ext4_dir_entry_2 *)dir_block->b_data; - ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); - set_nlink(inode, 2); - if (csum_size) - ext4_initialize_dirent_tail(dir_block, blocksize); - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); - if (err) - goto out; - set_buffer_verified(dir_block); + err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); out: brelse(dir_block); return err; } -static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; + return ERR_PTR(-EMLINK); err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -3054,7 +3047,7 @@ out_stop: out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return ERR_PTR(err); } /* @@ -3082,17 +3075,15 @@ bool ext4_empty_dir(struct inode *inode) EXT4_ERROR_INODE(inode, "invalid size"); return false; } - /* The first directory block must not be a hole, - * so treat it as DIRENT_HTREE - */ - bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); + bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) return false; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || - le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { + le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 || + de->name[0] != '.') { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); return false; @@ -3101,7 +3092,8 @@ bool ext4_empty_dir(struct inode *inode) de = ext4_next_entry(de, sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' 
|| de->name[1] != '.') { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return false; @@ -3139,11 +3131,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) int retval; struct inode *inode; struct buffer_head *bh; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; /* Initialize quotas before so that eventual writes go in * separate transaction */ @@ -3197,7 +3190,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) @@ -3207,16 +3200,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); -#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ - if (IS_CASEFOLDED(dir)) + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); -#endif end_rmdir: brelse(bh); @@ -3231,7 +3222,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, { int retval = -ENOENT; struct buffer_head *bh; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; handle_t *handle; int skip_remove_dentry = 0; @@ -3272,7 +3263,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) @@ -3302,8 +3293,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; trace_ext4_unlink_enter(dir, dentry); /* @@ -3318,16 +3310,15 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto out_trace; retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); -#if IS_ENABLED(CONFIG_UNICODE) + /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. 
*/ - if (IS_CASEFOLDED(dir)) + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); -#endif out_trace: trace_ext4_unlink_exit(dentry, retval); @@ -3370,8 +3361,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) - return -EIO; + err = ext4_emergency_state(dir->i_sb); + if (unlikely(err)) + return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -3412,7 +3404,6 @@ retry: inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; } } @@ -3428,6 +3419,9 @@ retry: disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; + if (!IS_ENCRYPTED(inode)) + inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, + inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) @@ -3437,6 +3431,7 @@ retry: err_drop_inode: clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) @@ -3529,10 +3524,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct ext4_dir_entry_2 *de; unsigned int offset; - /* The first directory block must not be a hole, so - * treat it as DIRENT_HTREE - */ - bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); + bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) { *retval = PTR_ERR(bh); return NULL; @@ -3542,7 +3534,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || - strcmp(".", de->name)) { + de->name_len != 1 || de->name[0] != '.') { EXT4_ERROR_INODE(inode, "directory missing '.'"); brelse(bh); *retval = -EFSCORRUPTED; @@ -3553,7 +3545,8 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, de = ext4_next_entry(de, inode->i_sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' 
|| de->name[1] != '.') { EXT4_ERROR_INODE(inode, "directory missing '..'"); brelse(bh); *retval = -EFSCORRUPTED; @@ -3586,10 +3579,14 @@ struct ext4_renament { int dir_inlined; }; -static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) { int retval; + ent->is_dir = true; + if (!is_cross) + return 0; + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); @@ -3607,6 +3604,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, { int retval; + if (!ent->dir_bh) + return 0; + ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { @@ -3642,7 +3642,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); - ent->dir->i_mtime = inode_set_ctime_current(ent->dir); + inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3686,7 +3686,7 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, { int retval = -ENOENT; struct buffer_head *bh; - struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_2 *de = NULL; bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) @@ -3895,7 +3895,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } @@ -3957,9 +3957,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, ext4_dec_count(new.inode); inode_set_ctime_current(new.inode); } - old.dir->i_mtime = inode_set_ctime_current(old.dir); + inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); - if (old.dir_bh) { + if (old.is_dir) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; @@ -3982,7 +3982,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (unlikely(retval)) goto end_rename; - if (S_ISDIR(old.inode->i_mode)) { + if (old.is_dir) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot @@ -4021,6 +4021,7 @@ end_rename: ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); @@ -4108,14 +4109,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { - old.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { - new.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &new); + retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); if (retval) goto end_rename; } @@ -4187,8 +4186,9 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) - return -EIO; + err = 
ext4_emergency_state(old_dir->i_sb); + if (unlikely(err)) + return err; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index e5b47dda3317..c9b93b670b0f 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -8,6 +8,8 @@ #include "ext4.h" #include "ext4_jbd2.h" +#define EXT4_MAX_ORPHAN_FILE_BLOCKS 512 + static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; @@ -107,13 +109,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!sbi->s_journal || is_bad_inode(inode)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); - /* - * Inode orphaned in orphan file or in orphan list? - */ - if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || - !list_empty(&EXT4_I(inode)->i_orphan)) + if (ext4_inode_orphan_tracked(inode)) return 0; /* @@ -236,7 +234,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); @@ -517,7 +515,7 @@ void ext4_release_orphan_info(struct super_block *sb) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); - kfree(oi->of_binfo); + kvfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( @@ -537,13 +535,13 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); - calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } @@ -560,10 +558,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, - inodes_per_ob * sizeof(__u32)); + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } @@ -588,10 +585,22 @@ int ext4_init_orphan_info(struct super_block *sb) ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } + /* + * This is just an artificial limit to prevent corrupted fs from + * consuming absurd amounts of memory when pinning blocks of orphan + * file in memory. 
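With the default 4 KiB block size, this cap of 512 blocks works out to at most 2 MiB of pinned orphan-file metadata. A small arithmetic sketch (constant renamed for illustration):

#include <stdio.h>

#define DEMO_MAX_ORPHAN_FILE_BLOCKS 512	/* mirrors the cap added above */

int main(void)
{
	unsigned int blkbits = 12;	/* 4 KiB filesystem blocks */
	unsigned long long cap =
		(unsigned long long)DEMO_MAX_ORPHAN_FILE_BLOCKS << blkbits;

	/* 512 * 4096 = 2097152 bytes = 2 MiB */
	printf("orphan file size cap: %llu bytes (%llu MiB)\n",
	       cap, cap >> 20);
	return 0;
}
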
+ */ + if (inode->i_size > (EXT4_MAX_ORPHAN_FILE_BLOCKS << inode->i_blkbits)) { + ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto out_put; + } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; - oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), - GFP_KERNEL); + oi->of_binfo = kvmalloc_array(oi->of_blocks, + sizeof(struct ext4_orphan_block), + GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; @@ -630,7 +639,7 @@ int ext4_init_orphan_info(struct super_block *sb) out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); - kfree(oi->of_binfo); + kvfree(oi->of_binfo); out_put: iput(inode); return ret; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3621f29ec671..39abfeec5f36 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -117,7 +117,6 @@ static void ext4_finish_bio(struct bio *bio) if (bio->bi_status) { int err = blk_status_to_errno(bio->bi_status); - folio_set_error(folio); mapping_set_error(folio->mapping, err); } bh = head = folio_buffers(folio); @@ -165,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) } /* - * Check a range of space and convert unwritten extents to written. Note that + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. Thus we @@ -176,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { - ext4_msg(inode->i_sb, KERN_EMERG, + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } + if (ret < 0 && !ext4_emergency_state(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -218,6 +234,18 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) #endif } +static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec)) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED && + !ext4_emergency_state(io_end->inode->i_sb)) + return true; + return false; +} + /* Add the io_end to per-inode completed end_io list. 
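The new ext4_io_end_defer_completion() above decides whether an io_end must go through the workqueue: either unwritten extents still need conversion, or a failed write must abort the journal under data_err=abort. A standalone sketch of that predicate, with invented flag values and booleans standing in for the list, mount-option, and state checks:

#include <stdbool.h>

/* Flag bits and fields are illustrative stand-ins, not the kernel layout. */
#define DEMO_IO_END_UNWRITTEN 0x0001
#define DEMO_IO_END_FAILED    0x0002

struct demo_io_end {
	unsigned int flag;
	bool has_pending_conversions;	/* !list_empty(&io_end->list_vec) */
	bool data_err_abort;		/* test_opt(sb, DATA_ERR_ABORT) */
	bool emergency;			/* ext4_emergency_state(sb) */
};

/* Defer completion when conversion work or a journal abort is required. */
static bool demo_defer_completion(const struct demo_io_end *io)
{
	if ((io->flag & DEMO_IO_END_UNWRITTEN) && io->has_pending_conversions)
		return true;
	if (io->data_err_abort && (io->flag & DEMO_IO_END_FAILED) &&
	    !io->emergency)
		return true;
	return false;
}
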
*/ static void ext4_add_complete_io(ext4_io_end_t *io_end) { @@ -226,9 +254,12 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - /* Only reserved conversions from writeback should enter here */ - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle && sbi->s_journal); + /* Only reserved conversions or pending IO errors will enter here. */ + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + WARN_ON(!io_end->bio); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) @@ -253,7 +284,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); @@ -264,7 +295,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, } /* - * work on completed IO, to convert unwritten extents to extents + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. */ void ext4_end_io_rsv_work(struct work_struct *work) { @@ -289,29 +321,22 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || - list_empty(&io_end->list_vec)) { - ext4_release_io_end(io_end); - return; - } - ext4_add_complete_io(io_end); + if (ext4_io_end_defer_completion(io_end)) + return ext4_add_complete_io(io_end); + + ext4_release_io_end(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { - int err = 0; - if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_io_end_vec(io_end->handle, - io_end); - io_end->handle = NULL; - ext4_clear_io_unwritten_flag(io_end); - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_end_io_end(io_end); + ext4_release_io_end(io_end); } - return err; + return 0; } ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) @@ -345,11 +370,12 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (ext4_io_end_defer_completion(io_end)) { /* * Link bio into list hanging from io_end. 
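Both put paths now funnel through the same deferral decision when the last reference drops; the defer variant only queues work (it can run in interrupt context) while the plain variant may finish inline. A condensed model, assuming a precomputed needs_deferral result rather than the kernel's predicate:

#include <stdbool.h>

struct demo_ioend {
	int refs;		/* stands in for refcount_t io_end->count */
	bool needs_deferral;	/* result of the deferral predicate */
};

static void demo_release(struct demo_ioend *io)    { (void)io; /* free bios */ }
static void demo_queue_work(struct demo_ioend *io) { (void)io; /* schedule work */ }
static int  demo_finish(struct demo_ioend *io)     { (void)io; return 0; /* convert or abort */ }

/* IRQ-safe put: never finishes inline, only queues. */
static void demo_put_defer(struct demo_ioend *io)
{
	if (--io->refs == 0) {
		if (io->needs_deferral)
			demo_queue_work(io);
		else
			demo_release(io);
	}
}

/* Process-context put: may finish inline. */
static int demo_put(struct demo_ioend *io)
{
	if (--io->refs == 0) {
		if (io->needs_deferral)
			return demo_finish(io);
		demo_release(io);
	}
	return 0;
}
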
We have to do it * atomically as bio completions can be racing against each @@ -418,11 +444,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io, submit_and_retry: ext4_io_submit(io); } - if (io->io_bio == NULL) + if (io->io_bio == NULL) { io_submit_init_bio(io, bh); + io->io_bio->bi_write_hint = inode->i_write_hint; + } if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } @@ -441,10 +469,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); - folio_clear_error(folio); - /* - * Comments copied from block_write_full_page: + * Comments copied from block_write_full_folio: * * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped @@ -521,9 +547,9 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, * first page of the bio. Otherwise it can deadlock. */ if (io->io_bio) - gfp_flags = GFP_NOWAIT | __GFP_NOWARN; + gfp_flags = GFP_NOWAIT; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3e7d160f543f..e7f2350c725b 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - struct folio *folio = fi.folio; - - if (bio->bi_status) - folio_clear_uptodate(folio); - else - folio_mark_uptodate(folio); - folio_unlock(folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); @@ -220,41 +213,44 @@ int ext4_mpage_readpages(struct inode *inode, { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; - sector_t blocks[MAX_BUF_PER_PAGE]; + sector_t first_block; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; - unsigned int nr_pages = rac ? readahead_count(rac) : 1; + unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - for (; nr_pages; nr_pages--) { + nr_pages = rac ? 
readahead_count(rac) : folio_nr_pages(folio); + for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; - unsigned first_hole = blocks_per_page; + unsigned int first_hole; + unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; - block_in_file = next_block = - (sector_t)folio->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + nr_pages * blocks_per_page; + blocks_per_folio = folio_size(folio) >> blkbits; + first_hole = blocks_per_folio; + block_in_file = next_block = EXT4_PG_TO_LBLK(inode, folio->index); + last_block = EXT4_PG_TO_LBLK(inode, folio->index + nr_pages); last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) @@ -270,16 +266,15 @@ int ext4_mpage_readpages(struct inode *inode, unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; + first_block = map.m_pblk + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; - blocks[page_block] = map.m_pblk + map_offset + - relative_block; page_block++; block_in_file++; } @@ -289,14 +284,13 @@ int ext4_mpage_readpages(struct inode *inode, * Then do more ext4_map_blocks() calls until we are * done with this folio. */ - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: - folio_set_error(folio); folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); @@ -305,39 +299,39 @@ int ext4_mpage_readpages(struct inode *inode, } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ - if (page_block && blocks[page_block-1] != map.m_pblk-1) + if (!page_block) + first_block = map.m_pblk; + else if (first_block + page_block != map.m_pblk) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; - blocks[page_block] = map.m_pblk+relative_block; page_block++; block_in_file++; } } - if (first_hole != blocks_per_page) { + if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, true); continue; } } else if (fully_mapped) { @@ -348,7 +342,7 @@ int ext4_mpage_readpages(struct inode *inode, * This folio will go to BIO. Do we need to send this * BIO off first? 
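Dropping the per-page blocks[] array is safe because a bio is only ever built over one physically contiguous run: remembering first_block and checking that each new mapping lands exactly at first_block + page_block is equivalent. A sketch of that contiguity test with simplified types:

#include <stdbool.h>

typedef unsigned long long demo_sector_t;

/*
 * The old code stored every block number in blocks[]; the folio version
 * keeps only first_block and checks that each new mapping extends the run.
 */
static bool demo_extends_run(demo_sector_t first_block, unsigned int page_block,
			     demo_sector_t new_pblk)
{
	if (page_block == 0)
		return true;	/* the first mapping seeds the run */
	return first_block + page_block == new_pblk;
}
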
*/ - if (bio && (last_block_in_bio != blocks[0] - 1 || + if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); @@ -364,7 +358,7 @@ int ext4_mpage_readpages(struct inode *inode, fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); - bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; @@ -376,11 +370,11 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || - (first_hole != blocks_per_page)) { + (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else - last_block_in_bio = blocks[blocks_per_page - 1]; + last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0361c20910de..050f26168d97 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -10,8 +10,6 @@ */ -#define EXT4FS_DEBUG - #include <linux/errno.h> #include <linux/slab.h> #include <linux/jiffies.h> @@ -57,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb) * If the reserved GDT blocks is non-zero, the resize_inode feature * should always be set. */ - if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && + if (sbi->s_es->s_reserved_gdt_blocks && !ext4_has_feature_resize_inode(sb)) { ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); return -EFSCORRUPTED; @@ -69,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb) * bad time to do it anyways. */ if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + le32_to_cpu(sbi->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", - (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + (unsigned long long)sbi->s_sbh->b_blocknr); return -EPERM; } @@ -79,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb) * We are not allowed to do online-resizing on a filesystem mounted * with error, because it can destroy the filesystem easily. 
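For reference, ext4_resize_begin() (here being converted to the cached sbi pointer) refuses to start in several states. A condensed userspace-style sketch of those guards with invented fields; the real code uses test_and_set_bit_lock(), so the resizing latch is atomic:

#include <errno.h>
#include <stdbool.h>

struct demo_fs {
	unsigned reserved_gdt_blocks;
	bool has_resize_inode;
	bool mounted_on_backup_sb;
	bool error_state;
	bool resizing;		/* stands in for EXT4_FLAGS_RESIZING */
};

static int demo_resize_begin(struct demo_fs *fs)
{
	if (fs->reserved_gdt_blocks && !fs->has_resize_inode)
		return -EUCLEAN;	/* the kernel's EFSCORRUPTED */
	if (fs->mounted_on_backup_sb)
		return -EPERM;		/* never resize from a backup superblock */
	if (fs->error_state)
		return -EPERM;		/* on-disk errors: resizing could make it worse */
	if (fs->resizing)
		return -EBUSY;		/* only one resizer at a time */
	fs->resizing = true;		/* atomic in the real code */
	return 0;
}
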
*/ - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + if (sbi->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " "so online resizing is not allowed"); return -EPERM; } @@ -91,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb) } if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, - &EXT4_SB(sb)->s_ext4_flags)) + &sbi->s_ext4_flags)) ret = -EBUSY; return ret; @@ -106,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups) return 0; } -static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, - ext4_group_t group) { - return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << - EXT4_DESC_PER_BLOCK_BITS(sb); -} - -static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, - ext4_group_t group) { - group = ext4_meta_bg_first_group(sb, group); - return ext4_group_first_block_no(sb, group); -} - static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, ext4_group_t group) { ext4_grpblk_t overhead; @@ -154,8 +140,9 @@ static int verify_group_input(struct super_block *sb, overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; - input->free_clusters_count = free_blocks_count = - input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + free_blocks_count = input->blocks_count - 2 - overhead - + sbi->s_itb_per_group; + input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " @@ -231,35 +218,55 @@ struct ext4_new_flex_group_data { in the flex group */ __u16 *bg_flags; /* block group flags of groups in @groups */ + ext4_group_t resize_bg; /* number of allocated + new_group_data */ ext4_group_t count; /* number of groups in @groups */ }; /* - * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of - * @flexbg_size. + * Avoid memory allocation failures due to too many groups being added at once. + */ +#define MAX_RESIZE_BG 16384 + +/* + * alloc_flex_gd() allocates an ext4_new_flex_group_data that satisfies the + * resizing from @o_group to @n_group; its size is typically @flexbg_size. * * Returns NULL on failure, otherwise the address of the allocated structure. 
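The sizing logic that follows caps each chunk at MAX_RESIZE_BG groups and trims the allocation to a power of two when only a few groups are being added. A sketch of the computation, with fls() reimplemented for illustration:

#include <stdio.h>

#define DEMO_MAX_RESIZE_BG 16384	/* mirrors MAX_RESIZE_BG above */

/* 1-based index of the most significant set bit; demo_fls(0) == 0. */
static unsigned int demo_fls(unsigned int x)
{
	unsigned int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static unsigned int demo_max(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

/* Mirror of the resize_bg computation in alloc_flex_gd(); illustrative. */
static unsigned int demo_resize_bg(unsigned int flexbg_size,
				   unsigned int o_group, unsigned int n_group)
{
	unsigned int cap = flexbg_size < DEMO_MAX_RESIZE_BG ?
			   flexbg_size : DEMO_MAX_RESIZE_BG;
	unsigned int resize_bg = cap;
	unsigned int last_group = o_group | (resize_bg - 1);

	/* Shrink the 'groups' array when only a few groups are added. */
	if (n_group <= last_group)
		resize_bg = 1u << demo_fls(n_group - o_group);
	else if (n_group - last_group < resize_bg)
		resize_bg = 1u << demo_max(demo_fls(last_group - o_group),
					   demo_fls(n_group - last_group));
	return resize_bg > cap ? cap : resize_bg;
}

int main(void)
{
	/* e.g. growing from group 10 to group 12 needs only a 4-entry array */
	printf("%u\n", demo_resize_bg(16384, 10, 12));
	return 0;
}
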
*/ -static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size, + ext4_group_t o_group, ext4_group_t n_group) { + ext4_group_t last_group; + unsigned int max_resize_bg; struct ext4_new_flex_group_data *flex_gd; flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); if (flex_gd == NULL) goto out3; - if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data)) - goto out2; - flex_gd->count = flexbg_size; + max_resize_bg = umin(flexbg_size, MAX_RESIZE_BG); + flex_gd->resize_bg = max_resize_bg; + + /* Avoid allocating large 'groups' array if not needed */ + last_group = o_group | (flex_gd->resize_bg - 1); + if (n_group <= last_group) + flex_gd->resize_bg = 1 << fls(n_group - o_group); + else if (n_group - last_group < flex_gd->resize_bg) + flex_gd->resize_bg = 1 << max(fls(last_group - o_group), + fls(n_group - last_group)); + + if (WARN_ON_ONCE(flex_gd->resize_bg > max_resize_bg)) + flex_gd->resize_bg = max_resize_bg; - flex_gd->groups = kmalloc_array(flexbg_size, + flex_gd->groups = kmalloc_array(flex_gd->resize_bg, sizeof(struct ext4_new_group_data), GFP_NOFS); if (flex_gd->groups == NULL) goto out2; - flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16), + flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16), GFP_NOFS); if (flex_gd->bg_flags == NULL) goto out1; @@ -296,7 +303,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) */ static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, - int flexbg_size) + unsigned int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t start_blk; @@ -397,12 +404,12 @@ next_group: group = group_data[0].group; printk(KERN_DEBUG "EXT4-fs: adding a flex group with " - "%d groups, flexbg size is %d:\n", flex_gd->count, + "%u groups, flexbg size is %u:\n", flex_gd->count, flexbg_size); for (i = 0; i < flex_gd->count; i++) { ext4_debug( - "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n", + "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? 
"normal" : "no-super", group + i, group_data[i].blocks_count, @@ -460,8 +467,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, last_cluster); - for (count2 = count; count > 0; - count -= count2, first_cluster += count2) { + for (; count > 0; count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; @@ -560,13 +566,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) goto handle_itb; - if (meta_bg == 1) { - ext4_group_t first_group; - first_group = ext4_meta_bg_first_group(sb, group); - if (first_group != group + 1 && - first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) - goto handle_itb; - } + if (meta_bg == 1) + goto handle_itb; block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ @@ -614,7 +615,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, } handle_itb: - /* Initialize group tables of the grop @group */ + /* Initialize group tables of the group @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; @@ -704,16 +705,14 @@ handle_ib: block = start; } - if (count) { - err = set_flexbg_block_bitmap(sb, handle, - flex_gd, - EXT4_B2C(sbi, start), - EXT4_B2C(sbi, - start + count - - 1)); - if (err) - goto out; - } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); + if (err) + goto out; } out: @@ -952,7 +951,13 @@ errout: } /* - * add_new_gdb_meta_bg is the sister of add_new_gdb. + * If there is no available space in the existing block group descriptors for + * the new block group and there are no reserved block group descriptors, then + * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set + * to the first block group that is managed using meta_bg and s_first_meta_bg + * must be a multiple of EXT4_DESC_PER_BLOCK(sb). + * This function will be called when first group of meta_bg is added to bring + * new group descriptors block of new added meta_bg. 
*/ static int add_new_gdb_meta_bg(struct super_block *sb, handle_t *handle, ext4_group_t group) { @@ -962,8 +967,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int err; - gdblock = ext4_meta_bg_first_block_no(sb, group) + - ext4_bg_has_super(sb, group); + gdblock = ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); @@ -1087,9 +1092,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; - /* printk("reserving backup %lu[%u] = %lu\n", - primary[i]->b_blocknr, gdbackups, - blk + primary[i]->b_blocknr); */ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) @@ -1116,8 +1118,8 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, struct ext4_super_block *es = (struct ext4_super_block *) data; es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); } /* @@ -1191,8 +1193,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, sb, bh, - EXT4_JTR_NONE))) + EXT4_JTR_NONE))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) @@ -1296,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { brelse(bh); return NULL; } @@ -1311,14 +1315,13 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; - ext4_inode_bitmap_csum_set(sb, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, bh); brelse(bh); bh = ext4_get_bitmap(sb, group_data->block_bitmap); @@ -1600,8 +1603,10 @@ exit_journal: int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); - int meta_bg = ext4_has_feature_meta_bg(sb); - sector_t old_gdb = 0; + int meta_bg = ext4_has_feature_meta_bg(sb) && + gdb_num >= le32_to_cpu(es->s_first_meta_bg); + sector_t padding_blocks = meta_bg ? 
0 : sbi->s_sbh->b_blocknr - + ext4_group_first_block_no(sb, 0); update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); @@ -1610,11 +1615,8 @@ exit_journal: gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); - if (old_gdb == gdb_bh->b_blocknr) - continue; - update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, - gdb_bh->b_size, meta_bg); - old_gdb = gdb_bh->b_blocknr; + update_backups(sb, gdb_bh->b_blocknr - padding_blocks, + gdb_bh->b_data, gdb_bh->b_size, meta_bg); } } exit: @@ -1623,8 +1625,7 @@ exit: static int ext4_setup_next_flex_gd(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, - ext4_fsblk_t n_blocks_count, - unsigned long flexbg_size) + ext4_fsblk_t n_blocks_count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -1648,7 +1649,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, BUG_ON(last); ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); - last_group = group | (flexbg_size - 1); + last_group = group | (flex_gd->resize_bg - 1); if (last_group > n_group) last_group = n_group; @@ -1980,9 +1981,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) errout: ret = ext4_journal_stop(handle); - if (!err) - err = ret; - return ret; + return err ? err : ret; invalid_resize_inode: ext4_error(sb, "corrupted/inconsistent resize inode"); @@ -2010,8 +2009,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_fsblk_t o_blocks_count; ext4_fsblk_t n_blocks_count_retry = 0; unsigned long last_update_time = 0; - int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int err = 0; int meta_bg; + unsigned int flexbg_size = ext4_flex_bg_size(sbi); /* See if the device is actually as big as what was requested */ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0); @@ -2086,7 +2086,7 @@ retry: } } - if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) { err = ext4_convert_meta_bg(sb, resize_inode); if (err) goto out; @@ -2143,7 +2143,7 @@ retry: if (err) goto out; - flex_gd = alloc_flex_gd(flexbg_size); + flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group); if (flex_gd == NULL) { err = -ENOMEM; goto out; @@ -2152,8 +2152,7 @@ retry: /* Add flex groups. Note that a regular group is a * flex group with 1 group. 
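Each pass of ext4_setup_next_flex_gd() now covers a window of flex_gd->resize_bg groups (a power of two), clipped to the final group of the grown filesystem. A short sketch of the window bound, mirroring the 'group | (resize_bg - 1)' computation below:

/* Last group handled in this pass; resize_bg must be a power of two. */
static unsigned int demo_window_end(unsigned int group, unsigned int resize_bg,
				    unsigned int n_group)
{
	unsigned int last = group | (resize_bg - 1);

	return last > n_group ? n_group : last;
}
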
*/ - while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, - flexbg_size)) { + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) { if (time_is_before_jiffies(last_update_time + HZ * 10)) { if (last_update_time) ext4_msg(sb, KERN_INFO, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 73547d2334fd..87205660c5d0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, @@ -161,8 +160,14 @@ MODULE_ALIAS("ext3"); static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { + if (simu_fail) { + clear_buffer_uptodate(bh); + unlock_buffer(bh); + return; + } + /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we @@ -176,7 +181,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -184,10 +189,11 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, unlock_buffer(bh); return; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); } -int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -196,7 +202,7 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io return 0; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) @@ -208,10 +214,10 @@ int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { - ext4_read_bh_nowait(bh, op_flags, NULL); + ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } - return ext4_read_bh(bh, op_flags, NULL); + return ext4_read_bh(bh, op_flags, NULL, false); } /* @@ -244,22 +250,38 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { - return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS) | __GFP_MOVABLE; + + return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { - return __ext4_sb_bread_gfp(sb, block, 0, 0); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS); + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); +} + +struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block) +{ + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL; + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { - struct buffer_head *bh = sb_getblk_gfp(sb, 
block, 0); + struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, + sb->s_blocksize, GFP_NOWAIT); if (likely(bh)) { if (trylock_buffer(bh)) - ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } @@ -273,14 +295,12 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -__le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct ext4_super_block *es) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; - csum = ext4_chksum(sbi, ~0, (char *)es, offset); + csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } @@ -288,20 +308,20 @@ __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; - return es->s_checksum == ext4_superblock_csum(sb, es); + return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -339,9 +359,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb, __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { - return le16_to_cpu(bg->bg_free_inodes_count_lo) | + return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); + (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, @@ -395,9 +415,9 @@ void ext4_free_group_clusters_set(struct super_block *sb, void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { - bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); + WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, @@ -435,59 +455,69 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) /* - * The del_gendisk() function uninitializes the disk-specific data - * structures, including the bdi structure, without telling anyone - * else. Once this happens, any attempt to call mark_buffer_dirty() - * (for example, by ext4_commit_super), will cause a kernel OOPS. - * This is a kludge to prevent these oops until we can put in a proper - * hook in del_gendisk() to inform the VFS and file system layers. + * The ext4_maybe_update_superblock() function checks and updates the + * superblock if needed. + * + * This function is designed to update the on-disk superblock only under + * certain conditions to prevent excessive disk writes and unnecessary + * waking of the disk from sleep. The superblock will be updated if: + * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last + * superblock update + * 2. 
More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the + * last superblock update. + * + * @sb: The superblock */ -static int block_device_ejected(struct super_block *sb) +static void ext4_maybe_update_superblock(struct super_block *sb) { - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = inode_to_bdi(bd_inode); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + journal_t *journal = sbi->s_journal; + time64_t now; + __u64 last_update; + __u64 lifetime_write_kbytes; + __u64 diff_size; + + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !(sb->s_flags & SB_ACTIVE) || !journal || + journal->j_flags & JBD2_UNMOUNT) + return; + + now = ktime_get_real_seconds(); + last_update = ext4_get_tstamp(es, s_wtime); - return bdi->dev == NULL; + if (likely(now - last_update < sbi->s_sb_update_sec)) + return; + + lifetime_write_kbytes = sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); + + /* Get the number of kilobytes not written to disk to account + * for statistics and compare with a multiple of 16 MB. This + * is used to determine when the next superblock commit should + * occur (i.e. not more often than once per 16MB if there was + * less written in an hour). + */ + diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); + + if (diff_size > sbi->s_sb_update_kb) + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); - - spin_lock(&sbi->s_md_lock); - while (!list_empty(&txn->t_private_list)) { - jce = list_entry(txn->t_private_list.next, - struct ext4_journal_cb_entry, jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - jce->jce_func(sb, jce, error); - spin_lock(&sbi->s_md_lock); - } - spin_unlock(&sbi->s_md_lock); + ext4_maybe_update_superblock(sb); } -/* - * This writepage callback for write_cache_pages() - * takes care of a few cases after page cleaning. - * - * write_cache_pages() already checks for dirty pages - * and calls clear_page_dirty_for_io(), which we want, - * to write protect the pages. - * - * However, we may have to redirty a page (see below.) 
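ext4_maybe_update_superblock() applies both thresholds described above: enough elapsed time and enough data written since the superblock last went out. A sketch of the decision, with invented field names and the documented defaults:

#include <stdbool.h>
#include <stdint.h>

/* Field names are invented; thresholds mirror the defaults described above. */
struct demo_sb_state {
	int64_t  now_sec, last_update_sec;
	uint64_t lifetime_write_kb;	/* running total including this boot */
	uint64_t recorded_write_kb;	/* value stored in the superblock */
	int64_t  update_sec;		/* default: 3600 (one hour) */
	uint64_t update_kb;		/* default: 16384 (16 MiB) */
};

static bool demo_should_update_sb(const struct demo_sb_state *s)
{
	if (s->now_sec - s->last_update_sec < s->update_sec)
		return false;		/* written out recently enough */
	return s->lifetime_write_kb - s->recorded_write_kb > s->update_kb;
}
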
- */ -static int ext4_journalled_writepage_callback(struct folio *folio, - struct writeback_control *wbc, - void *data) +static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, + struct folio *folio) { - transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; @@ -508,15 +538,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != transaction || - jh->b_next_transaction))) { - folio_redirty_for_writepage(wbc, folio); - goto out; - } + (jh && (jh->b_transaction != jinode->i_transaction || + jh->b_next_transaction))) + return true; } while ((bh = bh->b_this_page) != head); -out: - return AOP_WRITEPAGE_ACTIVATE; + return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -528,10 +555,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; + struct folio *folio = NULL; + int error; + + /* + * writeback_iter() already checks for dirty pages and calls + * folio_clear_dirty_for_io(), which we want to write protect the + * folios. + * + * However, we may have to redirty a folio sometimes. + */ + while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { + if (ext4_journalled_writepage_needs_redirty(jinode, folio)) + folio_redirty_for_writepage(&wbc, folio); + folio_unlock(folio); + } - return write_cache_pages(mapping, &wbc, - ext4_journalled_writepage_callback, - jinode->i_transaction); + return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -657,11 +697,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (!continue_fs && !sb_rdonly(sb)) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); - if (journal) - jbd2_journal_abort(journal, -EIO); - } + if (!continue_fs && !ext4_emergency_ro(sb) && journal) + jbd2_journal_abort(journal, -error); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); @@ -669,10 +706,14 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we - * defer superblock flushing to a workqueue. + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. */ - if (continue_fs && journal) - schedule_work(&EXT4_SB(sb)->s_error_work); + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } @@ -687,22 +728,23 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_id); } - if (sb_rdonly(sb) || continue_fs) + if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* - * Make sure updated value of ->s_mount_flags will be visible before - * ->s_flags update + * We don't set SB_RDONLY because that requires sb->s_umount + * semaphore and setting it without proper remount procedure is + * confusing code such as freeze_super() leading to deadlocks + * and other problems. 
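Rather than flipping SB_RDONLY without s_umount held, the error path below latches a private flag that the write paths test. A minimal model of such a one-way latch in C11 atomics (names invented):

#include <stdatomic.h>
#include <stdbool.h>

/* Models EXT4_FLAGS_EMERGENCY_RO in s_ext4_flags; illustrative only. */
#define DEMO_FLAG_EMERGENCY_RO (1u << 0)

static atomic_uint demo_ext4_flags;

static void demo_set_emergency_ro(void)
{
	/* One-way latch: safe from any context, no s_umount needed. */
	atomic_fetch_or(&demo_ext4_flags, DEMO_FLAG_EMERGENCY_RO);
}

static bool demo_emergency_ro(void)
{
	return atomic_load(&demo_ext4_flags) & DEMO_FLAG_EMERGENCY_RO;
}
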
*/ - smp_wmb(); - sb->s_flags |= SB_RDONLY; + set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } -static void flush_stashed_error_work(struct work_struct *work) +static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, - s_error_work); + s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; @@ -714,8 +756,11 @@ static void flush_stashed_error_work(struct work_struct *work) * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ - if (!sb_rdonly(sbi->s_sb) && journal) { + if (!ext4_emergency_state(sbi->s_sb) && + !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; + bool call_notify_err = false; + handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; @@ -723,6 +768,10 @@ static void flush_stashed_error_work(struct work_struct *work) jbd2_journal_stop(handle); goto write_directly; } + + if (sbi->s_add_error_count > 0) + call_notify_err = true; + ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " @@ -736,7 +785,10 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); - ext4_notify_error_sysfs(sbi); + + if (call_notify_err) + ext4_notify_error_sysfs(sbi); + return; } write_directly: @@ -759,7 +811,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -784,7 +836,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -819,7 +871,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -899,7 +951,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -993,7 +1045,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -1019,7 +1071,7 @@ __acquires(bitlock) if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } @@ -1097,26 +1149,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -/* - * Open the external journal device - */ -static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &fs_holder_ops); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext4_msg(sb, KERN_ERR, - "failed to open journal device unknown-block(%u,%u) %ld", 
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return NULL; -} - static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } @@ -1251,10 +1283,10 @@ static void ext4_put_super(struct super_block *sb) * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL - * Unregister sysfs before flush sbi->s_error_work. + * Unregister sysfs before flushing sbi->s_sb_upd_work. * Since a user may read /proc/fs/ext4/xx/mb_groups during umount, a * failed metadata verification there can still queue error work. - * flush_stashed_error_work will call start_this_handle may trigger + * update_super_work calls start_this_handle, which may trigger a * BUG_ON. */ ext4_unregister_sysfs(sb); @@ -1266,18 +1298,17 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } - } + } else + flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); @@ -1285,16 +1316,20 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!sb_rdonly(sb) && !aborted) { - ext4_clear_feature_journal_needs_recovery(sb); - ext4_clear_feature_orphan_present(sb); - es->s_state = cpu_to_le16(sbi->s_mount_state); - } - if (!sb_rdonly(sb)) + if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { + if (!aborted) { + ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } ext4_commit_super(sb); + } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); + + WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) && + percpu_counter_sum(&sbi->s_dirtyclusters_counter)); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) @@ -1311,14 +1346,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev) { + if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * be hotswapped, and it breaks the `ro-after' testing code. 
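The read helpers earlier in this file gained a simu_fail argument so fault injection can fail a metadata read deterministically at submission time rather than after wait_on_buffer(). A stand-in model of the short-circuit (types invented):

#include <stdbool.h>

/* Models the simu_fail short-circuit in __ext4_read_bh(). */
struct demo_bh {
	bool uptodate;
	bool locked;
};

static void demo_read_bh(struct demo_bh *bh, bool simu_fail)
{
	if (simu_fail) {
		bh->uptodate = false;	/* caller observes a read error */
		bh->locked = false;	/* release without touching the disk */
		return;
	}
	/* ... otherwise submit the real read and let end_io unlock ... */
}
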
*/ - sync_blockdev(sbi->s_journal_bdev); - invalidate_bdev(sbi->s_journal_bdev); + sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -1337,8 +1372,6 @@ static void ext4_put_super(struct super_block *sb) */ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); - if (sbi->s_chksum_driver) - crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); @@ -1363,6 +1396,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); @@ -1373,6 +1407,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; + ei->i_es_seq = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); @@ -1385,16 +1420,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); - mutex_init(&ei->i_fc_lock); + spin_lock_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); @@ -1415,9 +1449,9 @@ static void ext4_free_in_core_inode(struct inode *inode) static void ext4_destroy_inode(struct inode *inode) { - if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): orphan list check failed!", + "Inode %lu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), @@ -1425,7 +1459,8 @@ static void ext4_destroy_inode(struct inode *inode) dump_stack(); } - if (EXT4_I(inode)->i_reserved_data_blocks) + if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && + WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), @@ -1446,14 +1481,19 @@ static void init_once(void *foo) init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT), + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); @@ -1477,7 +1517,7 @@ void ext4_clear_inode(struct inode *inode) ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { @@ 
-1552,7 +1592,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); -static struct dquot **ext4_get_dquots(struct inode *inode) +static struct dquot __rcu **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } @@ -1606,6 +1646,7 @@ static const struct super_operations ext4_sops = { }; static const struct export_operations ext4_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, @@ -1675,10 +1716,6 @@ static const struct constant_table ext4_param_dax[] = { {} }; -/* String parameter that allows empty argument */ -#define fsparam_string_empty(NAME, OPT) \ - __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) - /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the @@ -1693,8 +1730,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), - fsparam_u32 ("resgid", Opt_resgid), - fsparam_u32 ("resuid", Opt_resuid), + fsparam_gid ("resgid", Opt_resgid), + fsparam_uid ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), @@ -1785,7 +1822,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { {} }; -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 @@ -1875,6 +1911,7 @@ static const struct mount_opts { {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif + {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; @@ -1943,8 +1980,6 @@ struct ext4_fs_context { unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; - unsigned long vals_s_mount_flags; - unsigned long mask_s_mount_flags; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; @@ -1980,6 +2015,9 @@ int ext4_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &ext4_context_ops; + /* i_version is always enabled now */ + fc->sb_flags |= SB_I_VERSION; + return 0; } @@ -2031,8 +2069,7 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; - if (ctx->s_qf_names[qtype]) - kfree(ctx->s_qf_names[qtype]); + kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; @@ -2065,16 +2102,16 @@ static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, } #define EXT4_SET_CTX(name) \ -static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ -static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ @@ -2095,20 +2132,12 @@ EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); -static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit) -{ - 
set_bit(bit, &ctx->mask_s_mount_flags); - set_bit(bit, &ctx->vals_s_mount_flags); -} - static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; - kuid_t uid; - kgid_t gid; int token; token = fs_parse(fc, ext4_param_specs, param, &result); @@ -2164,9 +2193,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; - case Opt_abort: - ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2253,23 +2279,11 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) { - ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", - result.uint_32); - return -EINVAL; - } - ctx->s_resuid = uid; + ctx->s_resuid = result.uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) { - ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", - result.uint_32); - return -EINVAL; - } - ctx->s_resgid = gid; + ctx->s_resgid = result.gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: @@ -2446,8 +2460,7 @@ static int parse_options(struct fs_context *fc, char *options) param.size = v_len; ret = ext4_parse_param(fc, ¶m); - if (param.string) - kfree(param.string); + kfree(param.string); if (ret < 0) return ret; } @@ -2464,7 +2477,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *s_mount_opts = NULL; + char s_mount_opts[64]; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; @@ -2472,15 +2485,12 @@ static int parse_apply_sb_mount_options(struct super_block *sb, if (!sbi->s_es->s_mount_opts[0]) return 0; - s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, - sizeof(sbi->s_es->s_mount_opts), - GFP_KERNEL); - if (!s_mount_opts) - return ret; + if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0) + return -E2BIG; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) - goto out_free; + return -ENOMEM; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) @@ -2512,11 +2522,8 @@ parse_failed: ret = 0; out_free: - if (fc) { - ext4_fc_free(fc); - kfree(fc); - } - kfree(s_mount_opts); + ext4_fc_free(fc); + kfree(fc); return ret; } @@ -2754,15 +2761,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc, return -EINVAL; } - if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (blocksize < PAGE_SIZE) - ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an " - "experimental mount option 'dioread_nolock' " - "for blocksize < PAGE_SIZE"); - } - err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; @@ -2782,6 +2780,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, 
EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -2820,8 +2825,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; - sbi->s_mount_flags &= ~ctx->mask_s_mount_flags; - sbi->s_mount_flags |= ctx->vals_s_mount_flags; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; @@ -2966,11 +2969,11 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + ext4_get_resuid(es) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + ext4_get_resgid(es) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); @@ -2986,6 +2989,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (nodefs && sb->s_flags & SB_I_VERSION) + SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & @@ -3034,6 +3039,15 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PUTS("mb_optimize_scan=1"); } + if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) + SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + + if (ext4_emergency_ro(sb)) + SEQ_OPTS_PUTS("emergency_ro"); + + if (ext4_forced_shutdown(sb)) + SEQ_OPTS_PUTS("shutdown"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3050,7 +3064,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset) seq_puts(seq, sb_rdonly(sb) ? 
"ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); - seq_puts(seq, "\n"); + seq_putc(seq, '\n'); return rc; } @@ -3201,19 +3215,19 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sbi->s_sb)) { + if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; - csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; @@ -3581,14 +3595,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#if !IS_ENABLED(CONFIG_UNICODE) - if (ext4_has_feature_casefold(sb)) { + if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } -#endif if (readonly) return 1; @@ -3631,7 +3643,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) */ static void print_daily_error_info(struct timer_list *t) { - struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); + struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; @@ -3691,7 +3703,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && - !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + !ext4_emergency_state(sb) && !sb_rdonly(sb) && + test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; @@ -3715,12 +3728,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = 1; if (!ret) { - start_time = ktime_get_real_ns(); + start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 
0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { - elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * + elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -3780,8 +3793,9 @@ static int ext4_lazyinit_thread(void *arg) cont_thread: while (true) { - next_wakeup = MAX_JIFFY_OFFSET; + bool next_wakeup_initialized = false; + next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); @@ -3794,8 +3808,11 @@ cont_thread: lr_request); if (time_before(jiffies, elr->lr_next_sched)) { - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { @@ -3823,16 +3840,18 @@ cont_thread: elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; - if ((time_after_eq(cur, next_wakeup)) || - (MAX_JIFFY_OFFSET == next_wakeup)) { + if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { cond_resched(); continue; } @@ -3990,7 +4009,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (sb_rdonly(sb) || + if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; @@ -4053,7 +4072,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -4172,7 +4191,7 @@ int ext4_calculate_overhead(struct super_block *sb) unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - char *buf = (char *) get_zeroed_page(GFP_NOFS); + char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO); if (!buf) return -ENOMEM; @@ -4197,7 +4216,7 @@ int ext4_calculate_overhead(struct super_block *sb) blks = count_overhead(sb, i, buf); overhead += blks; if (blks) - memset(buf, 0, PAGE_SIZE); + memset(buf, 0, sb->s_blocksize); cond_resched(); } @@ -4205,12 +4224,12 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev) + if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); - if (j_inode) { + if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); @@ -4220,7 +4239,7 @@ int ext4_calculate_overhead(struct super_block *sb) } sbi->s_overhead = overhead; smp_wmb(); - free_page((unsigned long) buf); + kvfree(buf); return 0; } @@ -4341,7 +4360,7 @@ static void 
ext4_set_def_opts(struct super_block *sb, if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) @@ -4373,8 +4392,7 @@ static void ext4_set_def_opts(struct super_block *sb, ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - if (sb->s_blocksize == PAGE_SIZE) - set_opt(sb, DIOREAD_NOLOCK); + set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) @@ -4394,22 +4412,6 @@ static int ext4_handle_clustersize(struct super_block *sb) } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); - sbi->s_clusters_per_group = - le32_to_cpu(es->s_clusters_per_group); - if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#clusters per group too big: %lu", - sbi->s_clusters_per_group); - return -EINVAL; - } - if (sbi->s_blocks_per_group != - (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { - ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " - "clusters per group (%lu) inconsistent", - sbi->s_blocks_per_group, - sbi->s_clusters_per_group); - return -EINVAL; - } } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, @@ -4423,9 +4425,21 @@ static int ext4_handle_clustersize(struct super_block *sb) sbi->s_blocks_per_group); return -EINVAL; } - sbi->s_clusters_per_group = sbi->s_blocks_per_group; sbi->s_cluster_bits = 0; } + sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + return -EINVAL; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { + ext4_msg(sb, KERN_ERR, + "blocks per group (%lu) and clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, sbi->s_clusters_per_group); + return -EINVAL; + } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ @@ -4435,6 +4449,39 @@ static int ext4_handle_clustersize(struct super_block *sb) return 0; } +/* + * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * With non-bigalloc filesystem awu will be based upon filesystem blocksize + * & bdev awu units. + * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. 
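The relocated checks in the ext4_handle_clustersize() hunk above validate the cluster geometry once for both the bigalloc and non-bigalloc cases. The arithmetic as a standalone sketch; the 4k block size and 64k cluster size are illustrative values, not anything from the patch:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 4096;		/* assumed: from s_log_block_size */
	unsigned long clustersize = 65536;	/* assumed: from s_log_cluster_size */
	unsigned long blocks_per_group = 32768;
	unsigned long cluster_ratio = clustersize / blocksize;		/* 16 */
	unsigned long clusters_per_group = blocks_per_group / cluster_ratio;

	/* The two consistency checks the hunk enforces: */
	if (clusters_per_group > blocksize * 8)
		printf("#clusters per group too big: %lu\n", clusters_per_group);
	if (blocks_per_group != clusters_per_group * cluster_ratio)
		printf("blocks per group and clusters per group inconsistent\n");

	printf("cluster ratio %lu, %lu clusters per group\n",
	       cluster_ratio, clusters_per_group);
	return 0;
}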
+ * @sb: super block + */ +static void ext4_atomic_write_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *bdev = sb->s_bdev; + unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); + + if (!bdev_can_atomic_write(bdev)) + return; + + if (!ext4_has_feature_extents(sb)) + return; + + sbi->s_awu_min = max(sb->s_blocksize, + bdev_atomic_write_unit_min_bytes(bdev)); + sbi->s_awu_max = min(clustersize, + bdev_atomic_write_unit_max_bytes(bdev)); + if (sbi->s_awu_min && sbi->s_awu_max && + sbi->s_awu_min <= sbi->s_awu_max) { + ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", + sbi->s_awu_min, sbi->s_awu_max); + } else { + sbi->s_awu_min = 0; + sbi->s_awu_max = 0; + } +} + static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -4448,7 +4495,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); + mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; @@ -4598,15 +4645,6 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); - /* Load the checksum driver */ - sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(sbi->s_chksum_driver)) { - int ret = PTR_ERR(sbi->s_chksum_driver); - ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); - sbi->s_chksum_driver = NULL; - return ret; - } - /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " @@ -4617,8 +4655,9 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + else if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } @@ -4948,10 +4987,7 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } @@ -4988,6 +5024,59 @@ static int ext4_check_journal_data_mode(struct super_block *sb) return 0; } +static const char *ext4_has_journal_option(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + return "journal_async_commit"; + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) + return "journal_checksum"; + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + return "commit="; + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) + return "data="; + if (test_opt(sb, DATA_ERR_ABORT)) + return "data_err=abort"; + return NULL; +} + +/* + * Limit the maximum folio order to 2048 blocks to prevent overestimation + * of reserve handle credits during the folio writeback in environments + * where the PAGE_SIZE exceeds 4KB. 
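The new ext4_atomic_write_init() above advertises atomic write support only when the block device's limits, clamped to the filesystem granularity, still form a valid range. A standalone sketch of the clamping; the device limits below are assumptions for illustration:

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }
static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

int main(void)
{
	unsigned int blocksize = 4096, clustersize = 4096;	/* non-bigalloc */
	unsigned int bdev_awu_min = 512, bdev_awu_max = 65536;	/* assumed bdev limits */

	unsigned int awu_min = max_u(blocksize, bdev_awu_min);	/* -> 4096 */
	unsigned int awu_max = min_u(clustersize, bdev_awu_max);	/* -> 4096 */

	if (awu_min && awu_max && awu_min <= awu_max)
		printf("atomic writes: awu_min %u, awu_max %u\n", awu_min, awu_max);
	else
		printf("atomic writes disabled: limits do not intersect\n");
	return 0;
}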
+ */ +#define EXT4_MAX_PAGECACHE_ORDER(sb) \ + umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT)) +static void ext4_set_max_mapping_order(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + sbi->s_max_folio_order = sbi->s_min_folio_order; + else + sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb); +} + +static int ext4_check_large_folio(struct super_block *sb) +{ + const char *err_str = NULL; + + if (ext4_has_feature_encrypt(sb)) + err_str = "encrypt"; + + if (!err_str) { + ext4_set_max_mapping_order(sb); + } else if (sb->s_blocksize > PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s", + sb->s_blocksize, PAGE_SIZE, err_str); + return -EINVAL; + } + + return 0; +} + static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { @@ -5055,11 +5144,8 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, * If the default block size is not the same as the real block size, * we need to reload it. */ - if (sb->s_blocksize == blocksize) { - *lsb = logical_sb_block; - sbi->s_sbh = bh; - return 0; - } + if (sb->s_blocksize == blocksize) + goto success; /* * bh must be released before kill_bdev(), otherwise @@ -5090,6 +5176,9 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } + +success: + sbi->s_min_folio_order = get_order(blocksize); *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; @@ -5098,16 +5187,27 @@ out: return ret; } -static void ext4_hash_info_init(struct super_block *sb) +static int ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; + sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_def_hash_version > DX_HASH_LAST) { + ext4_msg(sb, KERN_ERR, + "Invalid default hash set in the superblock"); + return -EINVAL; + } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { + ext4_msg(sb, KERN_ERR, + "SIPHASH is not a valid default hash value"); + return -EINVAL; + } + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) @@ -5125,6 +5225,7 @@ static void ext4_hash_info_init(struct super_block *sb) #endif } } + return 0; } static int ext4_block_group_meta_init(struct super_block *sb, int silent) @@ -5176,6 +5277,18 @@ static int ext4_block_group_meta_init(struct super_block *sb, int silent) return 0; } +/* + * It's hard to get stripe aligned blocks if stripe is not aligned with + * cluster, just disable stripe and alert user to simplify code and avoid + * stripe aligned allocation which will rarely succeed. 
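EXT4_MAX_PAGECACHE_ORDER() above caps the folio order so one folio never spans more than 2^11 = 2048 blocks: an order-o folio covers 2^(o + PAGE_SHIFT - blocksize_bits) blocks, so o = 11 + blocksize_bits - PAGE_SHIFT lands exactly on 2048. A standalone check of that arithmetic; the MAX_PAGECACHE_ORDER value is an assumed placeholder:

#include <stdio.h>

#define MAX_PAGECACHE_ORDER 13	/* assumption for illustration only */

int main(void)
{
	unsigned int page_shift = 12;	/* 4k pages */

	for (unsigned int bs_bits = 10; bs_bits <= 16; bs_bits++) {
		unsigned int cap = 11 + bs_bits - page_shift;
		unsigned int order = cap < MAX_PAGECACHE_ORDER ?
					cap : MAX_PAGECACHE_ORDER;
		unsigned long blocks = 1UL << (order + page_shift - bs_bits);

		/* blocks never exceeds 2048, whichever limit wins */
		printf("bs=%6u: max order %2u -> %4lu blocks per folio\n",
		       1U << bs_bits, order, blocks);
	}
	return 0;
}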
+ */ +static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + return (stripe > 0 && sbi->s_cluster_ratio > 1 && + stripe % sbi->s_cluster_ratio != 0); +} + static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; @@ -5190,7 +5303,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -5209,11 +5322,13 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_set_def_opts(sb, es); - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es)); + sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resgid(es)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; + sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is @@ -5238,6 +5353,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_apply_options(fc, sb); + err = ext4_check_large_folio(sb); + if (err < 0) + goto failed_mount; + err = ext4_encoding_init(sb, es); if (err) goto failed_mount; @@ -5249,8 +5368,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); - /* i_version is always enabled now */ - sb->s_flags |= SB_I_VERSION; + /* HSM events are allowed by default. */ + sb->s_iflags |= SB_I_ALLOW_HSM; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; @@ -5260,7 +5379,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount; - ext4_hash_info_init(sb); + err = ext4_hash_info_init(sb); + if (err) + goto failed_mount; err = ext4_handle_clustersize(sb); if (err) goto failed_mount; @@ -5272,7 +5393,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; @@ -5283,13 +5404,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); - /* - * It's hard to get stripe aligned blocks if stripe is not aligned with - * cluster, just disable stripe and alert user to simpfy code and avoid - * stripe aligned allocation which will rarely successes.
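ext4_is_stripe_incompatible() above rejects any stripe that is not a whole number of bigalloc clusters, since stripe-aligned allocations could then almost never be satisfied. A standalone version of the predicate with sample values (a cluster ratio of 16 is an assumed example):

#include <stdbool.h>
#include <stdio.h>

static bool stripe_incompatible(unsigned long stripe, unsigned int cluster_ratio)
{
	return stripe > 0 && cluster_ratio > 1 && stripe % cluster_ratio != 0;
}

int main(void)
{
	unsigned int ratio = 16;	/* 16 blocks per bigalloc cluster */

	/* 24 % 16 != 0 -> stripe disabled; 32 % 16 == 0 -> stripe kept */
	printf("stripe 24: %s\n", stripe_incompatible(24, ratio) ? "disabled" : "kept");
	printf("stripe 32: %s\n", stripe_incompatible(32, ratio) ? "disabled" : "kept");
	return 0;
}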
- */ - if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && - sbi->s_stripe % sbi->s_cluster_ratio != 0) { + if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", @@ -5318,11 +5433,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); + super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + spin_lock_init(&sbi->s_bdev_wb_lock); + + ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -5346,36 +5465,25 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; + if (bdev_read_only(sb->s_bdev)) + needs_recovery = 0; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { + const char *journal_option; + /* Nojournal mode, all journal mount options are illegal */ - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + journal_option = ext4_has_journal_option(sb); + if (journal_option != NULL) { + ext4_msg(sb, KERN_ERR, + "can't mount with %s, fs mounted w/o journal", + journal_option); goto failed_mount3a; } - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); - goto failed_mount3a; - } - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "commit=%lu, fs mounted w/o journal", - sbi->s_commit_interval / HZ); - goto failed_mount3a; - } - if (EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "data=, fs mounted w/o journal"); - goto failed_mount3a; - } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -5456,6 +5564,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount4; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); @@ -5527,19 +5636,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount6; - err = ext4_register_sysfs(sb); - if (err) - goto failed_mount7; - err = ext4_init_orphan_info(sb); if (err) - goto failed_mount8; + goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount9; + goto failed_mount8; } #endif /* CONFIG_QUOTA */ @@ -5547,8 +5652,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. 
*/ - spin_lock_init(&sbi->s_bdev_wb_lock); - errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -5565,12 +5669,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount10; + goto failed_mount9; } - if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); + clear_opt(sb, DISCARD); + } if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5582,15 +5688,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); + /* Register sysfs after all initializations are complete. */ + err = ext4_register_sysfs(sb); + if (err) + goto failed_mount9; + return 0; -failed_mount10: +failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); -failed_mount9: __maybe_unused +failed_mount8: __maybe_unused ext4_release_orphan_info(sb); -failed_mount8: - ext4_unregister_sysfs(sb); - kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -5615,23 +5723,17 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: - /* flush s_error_work before sbi destroy */ - flush_work(&sbi->s_error_work); - del_timer_sync(&sbi->s_err_report); + /* flush s_sb_upd_work before sbi destroy */ + flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); + timer_delete_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: - if (sbi->s_chksum_driver) - crypto_free_shash(sbi->s_chksum_driver); - #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif @@ -5642,9 +5744,9 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); - if (sbi->s_journal_bdev) { - invalidate_bdev(sbi->s_journal_bdev); - blkdev_put(sbi->s_journal_bdev, sb); + if (sbi->s_journal_bdev_file) { + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5726,10 +5828,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. 
@@ -5751,22 +5849,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); - return NULL; + return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } - - ext4_debug("Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } + + ext4_debug("Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); return journal_inode; } @@ -5785,31 +5883,28 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block) ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); - jbd2_journal_abort(journal, ret ? ret : -EIO); + jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED); return ret; } *block = map.m_pblk; return 0; } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static journal_t *ext4_open_inode_journal(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - journal_inode = ext4_get_journal_inode(sb, journal_inum); - if (!journal_inode) - return NULL; + if (IS_ERR(journal_inode)) + return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); - return NULL; + return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; @@ -5817,43 +5912,47 @@ static journal_t *ext4_get_journal(struct super_block *sb, return journal; } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static struct file *ext4_get_journal_blkdev(struct super_block *sb, + dev_t j_dev, ext4_fsblk_t *j_start, + ext4_fsblk_t *j_len) { struct buffer_head *bh; - journal_t *journal; - ext4_fsblk_t start; - ext4_fsblk_t len; + struct block_device *bdev; + struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; - struct block_device *bdev; - - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; + int errno; - /* see get_tree_bdev why this is needed and safe */ - up_write(&sb->s_umount); - bdev = ext4_blkdev_get(j_dev, sb); - down_write(&sb->s_umount); - if (bdev == NULL) - return NULL; + bdev_file = bdev_file_open_by_dev(j_dev, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) { + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + return bdev_file; + } + bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); + errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; - set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { + set_blocksize(bdev_file, 
blocksize); + bh = __bread(bdev, sb_block, blocksize); + if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); + errno = -EINVAL; goto out_bdev; } @@ -5861,57 +5960,74 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - es->s_checksum != ext4_superblock_csum(sb, es)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "corrupt superblock"); - brelse(bh); - goto out_bdev; + es->s_checksum != ext4_superblock_csum(es)) { + ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); - brelse(bh); - goto out_bdev; + errno = -EFSCORRUPTED; + goto out_bh; } - len = ext4_blocks_count(es); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ + *j_start = sb_block + 1; + *j_len = ext4_blocks_count(es); + brelse(bh); + return bdev_file; - journal = jbd2_journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); - if (!journal) { +out_bh: + brelse(bh); +out_bdev: + bdev_fput(bdev_file); + return ERR_PTR(errno); +} + +static journal_t *ext4_open_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + journal_t *journal; + ext4_fsblk_t j_start; + ext4_fsblk_t j_len; + struct file *bdev_file; + int errno = 0; + + bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); + + journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, + j_len, sb->s_blocksize); + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); + errno = PTR_ERR(journal); goto out_bdev; } - journal->j_private = sb; - if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); + errno = -EINVAL; goto out_journal; } - EXT4_SB(sb)->s_journal_bdev = bdev; + journal->j_private = sb; + EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: - blkdev_put(bdev, sb); - return NULL; + bdev_fput(bdev_file); + return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, @@ -5943,13 +6059,13 @@ static int ext4_load_journal(struct super_block *sb, } if (journal_inum) { - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; + journal = ext4_open_inode_journal(sb, journal_inum); + if (IS_ERR(journal)) + return PTR_ERR(journal); } else { - journal = ext4_get_dev_journal(sb, journal_dev); - if (!journal) - return -EINVAL; + journal = ext4_open_dev_journal(sb, journal_dev); + if (IS_ERR(journal)) + return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); @@ -6025,8 +6141,7 @@ static int 
ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { - EXT4_SB(sb)->s_journal = NULL; - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6044,7 +6159,7 @@ static int ext4_load_journal(struct super_block *sb, return 0; err_out: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6066,7 +6181,7 @@ static void ext4_update_super(struct super_block *sb) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & SB_RDONLY)) + if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + @@ -6088,8 +6203,8 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); - strncpy(es->s_first_error_func, sbi->s_first_error_func, - sizeof(es->s_first_error_func)); + strtomem_pad(es->s_first_error_func, + sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = @@ -6102,8 +6217,7 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); - strncpy(es->s_last_error_func, sbi->s_last_error_func, - sizeof(es->s_last_error_func)); + strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); @@ -6130,8 +6244,6 @@ static int ext4_commit_super(struct super_block *sb) if (!sbh) return -EINVAL; - if (block_device_ejected(sb)) - return -ENODEV; ext4_update_super(sb); @@ -6264,13 +6376,7 @@ static int ext4_clear_journal_err(struct super_block *sb, */ int ext4_force_commit(struct super_block *sb) { - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; - return ext4_journal_force_commit(journal); + return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -6280,8 +6386,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) - return 0; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); @@ -6329,12 +6436,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) static int ext4_freeze(struct super_block *sb) { int error = 0; - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; + journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. 
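The journal-open paths above (ext4_get_journal_inode(), ext4_open_inode_journal(), ext4_open_dev_journal()) now hand back ERR_PTR() values instead of bare NULL, so ext4_load_journal() can propagate the precise cause. A minimal userspace rendition of that encoding; EUCLEAN stands in here for the kernel's EFSCORRUPTED, which aliases it:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095
static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
	/* errno-encoded pointers live in the top MAX_ERRNO addresses */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *open_journal_inode(int corrupt)
{
	static int inode = 42;	/* stand-in for a real inode */

	if (corrupt)
		return ERR_PTR(-EUCLEAN);
	return &inode;
}

int main(void)
{
	void *ino = open_journal_inode(1);

	if (IS_ERR(ino))	/* the caller forwards the errno, not just "failed" */
		printf("journal open failed: %ld\n", PTR_ERR(ino));
	return 0;
}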
*/ @@ -6368,7 +6470,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6407,6 +6509,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; + int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; @@ -6443,11 +6546,29 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } + if ((ctx->spec & EXT4_SPEC_s_stripe) && + ext4_is_stripe_incompatible(sb, ctx->s_stripe)) { + ext4_msg(sb, KERN_WARNING, + "stripe (%lu) is not aligned with cluster size (%u), " + "stripe is disabled", + ctx->s_stripe, sbi->s_cluster_ratio); + ctx->s_stripe = 0; + } + + /* + * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause + * two calls to ext4_should_dioread_nolock() to return inconsistent + * values, triggering WARN_ON in ext4_add_complete_io(). We grab + * s_writepages_rwsem here to avoid races between writepages ops and + * remount. + */ + alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); + ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { @@ -6484,8 +6605,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) - ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && + !test_opt(sb, DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); + err = -EINVAL; + goto restore_opts; + } sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); @@ -6498,10 +6623,10 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } /* Flush outstanding errors before changing fs state */ - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { + if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } @@ -6655,6 +6780,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); + /* + * Handle aborting the filesystem as the last thing during remount to + * avoid obscure errors during remount when some option changes fail to + * apply due to a shutdown filesystem.
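The remount path above applies the parsed options under ext4_writepages_down_write() so concurrent writeback never observes a half-applied option set. The same exclusion shape in portable form, with a pthread rwlock as an analogy for the kernel rwsem (the predicate is illustrative, not ext4's real one):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t opts_lock = PTHREAD_RWLOCK_INITIALIZER;
static int dioread_nolock, delalloc;

static void apply_options(int dio, int da)
{
	pthread_rwlock_wrlock(&opts_lock);	/* writeback-side readers blocked */
	dioread_nolock = dio;			/* both fields change atomically */
	delalloc = da;
	pthread_rwlock_unlock(&opts_lock);
}

static int should_dioread_nolock(void)
{
	int v;

	pthread_rwlock_rdlock(&opts_lock);	/* a consistent snapshot */
	v = dioread_nolock && !delalloc;
	pthread_rwlock_unlock(&opts_lock);
	return v;
}

int main(void)
{
	apply_options(1, 0);
	printf("dioread_nolock? %d\n", should_dioread_nolock());
	return 0;
}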
+ */ + if (test_opt2(sb, ABORT)) + ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + return 0; restore_opts: @@ -6662,9 +6795,11 @@ restore_opts: * If there was a failing r/w to ro transition, we may need to * re-enable quota */ - if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); + + alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -6673,6 +6808,8 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + ext4_writepages_up_write(sb, alloc_ctx); + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA @@ -6694,6 +6831,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6705,9 +6843,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } @@ -6731,22 +6869,29 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; - if (limit && buf->f_blocks > limit) { + if (limit) { + uint64_t remaining = 0; + curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? - (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -6815,6 +6960,10 @@ static int ext4_write_dquot(struct dquot *dquot) if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to commit dquot type %d", + dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -6831,6 +6980,10 @@ static int ext4_acquire_dquot(struct dquot *dquot) if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to acquire dquot type %d", + dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -6841,18 +6994,39 @@ static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. 
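The comment above continues into ext4_release_dquot(), which takes sb_start_intwrite() freeze protection only when it is not already inside a running transaction, and remembers whether it did so the release stays symmetric. The conditional-protection shape as a simulated standalone sketch (the lock calls just print; nothing here is kernel API):

#include <stdbool.h>
#include <stdio.h>

static bool in_transaction;	/* stands in for ext4_journal_current_handle() */

static void sb_start_intwrite(void) { printf("  freeze protection taken\n"); }
static void sb_end_intwrite(void)   { printf("  freeze protection dropped\n"); }

static int release_dquot(void)
{
	bool freeze_protected = false;

	/* Running transactions are already protected from freezing. */
	if (!in_transaction) {
		sb_start_intwrite();
		freeze_protected = true;
	}

	/* ... the journalled dquot release work would go here ... */

	if (freeze_protected)
		sb_end_intwrite();
	return 0;
}

int main(void)
{
	printf("outside a transaction:\n");
	in_transaction = false;
	release_dquot();	/* takes and drops the protection */

	printf("inside a transaction:\n");
	in_transaction = true;
	release_dquot();	/* skips it: already safe */
	return 0;
}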
Further, running transactions + * are already protected from freezing. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to release dquot type %d", + dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return ret; } @@ -7071,6 +7245,13 @@ static int ext4_quota_off(struct super_block *sb, int type) err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; + /* + * When the filesystem was remounted read-only first, we cannot cleanup + * inode flags here. Bad luck but people should be using QUOTA feature + * these days anyway. + */ + if (sb_rdonly(sb)) + goto out_put; inode_lock(inode); /* @@ -7085,7 +7266,7 @@ static int ext4_quota_off(struct super_block *sb, int type) } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: @@ -7183,7 +7364,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); @@ -7258,12 +7439,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL; + struct file *bdev_file = sbi ? 
sbi->s_journal_bdev_file : NULL; kill_block_super(sb); - if (journal_bdev) - blkdev_put(journal_bdev, sb); + if (bdev_file) + bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { @@ -7272,16 +7453,14 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | + FS_LBS, }; MODULE_ALIAS_FS("ext4"); -/* Shared across all ext4 file systems */ -wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - static int __init ext4_init_fs(void) { - int i, err; + int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; @@ -7289,9 +7468,6 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency */ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) - init_waitqueue_head(&ext4__ioend_wq[i]); - err = ext4_init_es(); if (err) return err; @@ -7378,6 +7554,5 @@ static void __exit ext4_exit_fs(void) MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); -MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75bf1f88843c..645240cc0229 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh)) - return ERR_CAST(bh); - if (!bh || !ext4_buffer_uptodate(bh)) + if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return ERR_PTR(-ECHILD); + } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 6d332dff79dd..0018e09b867e 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,7 +29,10 @@ typedef enum { attr_trigger_test_error, attr_first_error_time, attr_last_error_time, + attr_clusters_in_group, + attr_mb_order, attr_feature, + attr_pointer_pi, attr_pointer_ui, attr_pointer_ul, attr_pointer_u64, @@ -104,7 +107,7 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (ret || val >= clusters) + if (ret || val >= clusters || (s64)val < 0) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); @@ -178,6 +181,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) +#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) @@ -207,23 +213,25 @@ EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); +EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, + ext4_sb_info, s_mb_group_prealloc); +EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, + ext4_sb_info, s_mb_best_avail_max_trim_order); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, 
s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -246,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); +EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); +EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -297,6 +307,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), ATTR_LIST(last_trim_minblks), + ATTR_LIST(sb_update_sec), + ATTR_LIST(sb_update_kb), NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -320,6 +332,9 @@ EXT4_ATTR_FEATURE(fast_commit); #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +EXT4_ATTR_FEATURE(blocksize_gt_pagesize); +#endif static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -340,6 +355,9 @@ static struct attribute *ext4_feat_attrs[] = { #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ATTR_LIST(blocksize_gt_pagesize), +#endif NULL, }; ATTRIBUTE_GROUPS(ext4_feat); @@ -366,13 +384,45 @@ static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) #define print_tstamp(buf, es, tstamp) \ __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) +static ssize_t ext4_generic_attr_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; + + switch (a->attr_id) { + case attr_inode_readahead: + case attr_clusters_in_group: + case attr_mb_order: + case attr_pointer_pi: + case attr_pointer_ui: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); + case 
+	case attr_pointer_u8:
+		return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr));
+	case attr_pointer_u64:
+		if (a->attr_ptr == ptr_ext4_super_block_offset)
+			return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr));
+		return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr));
+	case attr_pointer_string:
+		return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr);
+	case attr_pointer_atomic:
+		return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr));
+	}
+	return 0;
+}
+
 static ssize_t ext4_attr_show(struct kobject *kobj,
 			      struct attribute *attr, char *buf)
 {
 	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 						s_kobj);
 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-	void *ptr = calc_ptr(a, sbi);
 
 	switch (a->attr_id) {
 	case attr_delayed_allocation_blocks:
@@ -391,45 +441,6 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
 		return sysfs_emit(buf, "%llu\n",
 			(unsigned long long)
 			percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
-	case attr_inode_readahead:
-	case attr_pointer_ui:
-		if (!ptr)
-			return 0;
-		if (a->attr_ptr == ptr_ext4_super_block_offset)
-			return sysfs_emit(buf, "%u\n",
-					  le32_to_cpup(ptr));
-		else
-			return sysfs_emit(buf, "%u\n",
-					  *((unsigned int *) ptr));
-	case attr_pointer_ul:
-		if (!ptr)
-			return 0;
-		return sysfs_emit(buf, "%lu\n",
-				  *((unsigned long *) ptr));
-	case attr_pointer_u8:
-		if (!ptr)
-			return 0;
-		return sysfs_emit(buf, "%u\n",
-				  *((unsigned char *) ptr));
-	case attr_pointer_u64:
-		if (!ptr)
-			return 0;
-		if (a->attr_ptr == ptr_ext4_super_block_offset)
-			return sysfs_emit(buf, "%llu\n",
-					  le64_to_cpup(ptr));
-		else
-			return sysfs_emit(buf, "%llu\n",
-					  *((unsigned long long *) ptr));
-	case attr_pointer_string:
-		if (!ptr)
-			return 0;
-		return sysfs_emit(buf, "%.*s\n", a->attr_size,
-				  (char *) ptr);
-	case attr_pointer_atomic:
-		if (!ptr)
-			return 0;
-		return sysfs_emit(buf, "%d\n",
-				  atomic_read((atomic_t *) ptr));
 	case attr_feature:
 		return sysfs_emit(buf, "supported\n");
 	case attr_first_error_time:
@@ -438,29 +449,34 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
 		return print_tstamp(buf, sbi->s_es, s_last_error_time);
 	case attr_journal_task:
 		return journal_task_show(sbi, buf);
+	default:
+		return ext4_generic_attr_show(a, sbi, buf);
 	}
-
-	return 0;
 }
 
-static ssize_t ext4_attr_store(struct kobject *kobj,
-			       struct attribute *attr,
-			       const char *buf, size_t len)
+static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
+				       struct ext4_sb_info *sbi,
+				       const char *buf, size_t len)
 {
-	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
-						s_kobj);
-	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-	void *ptr = calc_ptr(a, sbi);
-	unsigned long t;
 	int ret;
+	unsigned int t;
+	unsigned long lt;
+	void *ptr = calc_ptr(a, sbi);
+
+	if (!ptr)
+		return 0;
 
 	switch (a->attr_id) {
-	case attr_reserved_clusters:
-		return reserved_clusters_store(sbi, buf, len);
+	case attr_pointer_pi:
+		ret = kstrtouint(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		if ((int)t < 0)
+			return -EINVAL;
+		*((unsigned int *) ptr) = t;
+		return len;
 	case attr_pointer_ui:
-		if (!ptr)
-			return 0;
-		ret = kstrtoul(skip_spaces(buf), 0, &t);
+		ret = kstrtouint(skip_spaces(buf), 0, &t);
 		if (ret)
 			return ret;
 		if (a->attr_ptr == ptr_ext4_super_block_offset)
@@ -468,20 +484,50 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 		else
 			*((unsigned int *) ptr) = t;
 		return len;
+	case attr_mb_order:
+		ret = kstrtouint(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		if (t > 64)
+			return -EINVAL;
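The new attr_pointer_pi ("positive int") case above parses an unsigned value but rejects anything whose bit pattern would read back as negative through a signed int, which is what the (int)t < 0 test does. Roughly, in userspace terms (store_pi is an illustrative name, not a kernel function):

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Accept an unsigned value only if it also fits a non-negative int,
     * mirroring the attr_pointer_pi store path. */
    static int store_pi(const char *buf, unsigned int *out)
    {
    	char *end;
    	unsigned long t;

    	errno = 0;
    	t = strtoul(buf, &end, 0);
    	if (errno || end == buf)
    		return -EINVAL;
    	if (t > INT_MAX)	/* kernel: if ((int)t < 0) return -EINVAL; */
    		return -EINVAL;
    	*out = (unsigned int)t;
    	return 0;
    }

    int main(void)
    {
    	unsigned int v;

    	printf("%d\n", store_pi("123", &v));        /* 0: accepted */
    	printf("%d\n", store_pi("4294967295", &v)); /* -EINVAL: negative as int */
    	return 0;
    }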
+		*((unsigned int *) ptr) = t;
+		return len;
+	case attr_clusters_in_group:
+		ret = kstrtouint(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		if (t > sbi->s_clusters_per_group)
+			return -EINVAL;
+		*((unsigned int *) ptr) = t;
+		return len;
 	case attr_pointer_ul:
-		if (!ptr)
-			return 0;
-		ret = kstrtoul(skip_spaces(buf), 0, &t);
+		ret = kstrtoul(skip_spaces(buf), 0, &lt);
 		if (ret)
 			return ret;
-		*((unsigned long *) ptr) = t;
+		*((unsigned long *) ptr) = lt;
 		return len;
+	}
+	return 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	switch (a->attr_id) {
+	case attr_reserved_clusters:
+		return reserved_clusters_store(sbi, buf, len);
 	case attr_inode_readahead:
 		return inode_readahead_blks_store(sbi, buf, len);
 	case attr_trigger_test_error:
 		return trigger_test_error(sbi, buf, len);
+	default:
+		return ext4_generic_attr_store(a, sbi, buf, len);
 	}
-	return 0;
 }
 
 static void ext4_sb_release(struct kobject *kobj)
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 2f37e1ea3955..415d9c4d8a32 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -76,17 +76,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
 	while (count) {
 		size_t n = min_t(size_t, count,
 				 PAGE_SIZE - offset_in_page(pos));
-		struct page *page;
+		struct folio *folio;
 		void *fsdata = NULL;
 		int res;
 
-		res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
+		res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata);
 		if (res)
 			return res;
 
-		memcpy_to_page(page, offset_in_page(pos), buf, n);
+		memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n);
 
-		res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
+		res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata);
 		if (res < 0)
 			return res;
 		if (res != n)
@@ -302,7 +302,7 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
 	end_lblk = le32_to_cpu(last_extent->ee_block) +
 		   ext4_ext_get_actual_len(last_extent);
-	desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+	desc_size_pos = EXT4_LBLK_TO_B(inode, end_lblk);
 	ext4_free_ext_path(path);
 
 	if (desc_size_pos < sizeof(desc_size_disk))
@@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
 }
 
 const struct fsverity_operations ext4_verityops = {
+	.inode_info_offs	= (int)offsetof(struct ext4_inode_info, i_verity_info) -
+				  (int)offsetof(struct ext4_inode_info, vfs_inode),
 	.begin_enable_verity	= ext4_begin_enable_verity,
 	.end_enable_verity	= ext4_end_enable_verity,
 	.get_verity_descriptor	= ext4_get_verity_descriptor,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 281e1bfbbe3e..2e02efbddaac 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_HURD]	     = &ext4_xattr_hurd_handler,
 };
 
-const struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler * const ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_SECURITY
@@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
 	__u32 dummy_csum = 0;
 	int offset = offsetof(struct ext4_xattr_header, h_checksum);
 
-	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+	csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
 			   sizeof(dsk_block_nr));
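ext4_xattr_block_csum() above checksums the header while substituting a zeroed dummy for the checksum field itself, so the stored value never influences the result. A compact, self-contained sketch of that chaining shape; chk() is a toy hash standing in for crc32c, and struct hdr is illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct hdr {
    	uint32_t magic;
    	uint32_t checksum;	/* must be treated as zero while summing */
    	uint32_t payload;
    };

    /* Toy stand-in for ext4_chksum()/crc32c: only the chaining matters. */
    static uint32_t chk(uint32_t seed, const void *p, size_t n)
    {
    	const unsigned char *b = p;

    	while (n--)
    		seed = seed * 31 + *b++;
    	return seed;
    }

    /* Checksum the header without modifying it: hash up to the checksum
     * field, substitute a zeroed dummy, then continue after it -- the same
     * shape as ext4_xattr_block_csum(). */
    static uint32_t hdr_csum(const struct hdr *h)
    {
    	uint32_t dummy = 0, c;
    	size_t off = offsetof(struct hdr, checksum);

    	c = chk(0, h, off);
    	c = chk(c, &dummy, sizeof(dummy));
    	off += sizeof(dummy);
    	c = chk(c, (const char *)h + off, sizeof(*h) - off);
    	return c;
    }

    int main(void)
    {
    	struct hdr h = { .magic = 0xEA02, .checksum = 0xdead, .payload = 42 };
    	uint32_t a = hdr_csum(&h);

    	/* The stored checksum must not influence the result. */
    	h.checksum = 0;
    	printf("%s\n", a == hdr_csum(&h) ? "stable" : "broken");
    	return 0;
    }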
-	csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
-	csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+	csum = ext4_chksum(csum, (__u8 *)hdr, offset);
+	csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
 	offset += sizeof(dummy_csum);
-	csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+	csum = ext4_chksum(csum, (__u8 *)hdr + offset,
 			   EXT4_BLOCK_SIZE(inode->i_sb) - offset);
 
 	return cpu_to_le32(csum);
@@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
 	struct ext4_xattr_header *hdr = BHDR(bh);
 	int ret = 1;
 
-	if (ext4_has_metadata_csum(inode->i_sb)) {
+	if (ext4_has_feature_metadata_csum(inode->i_sb)) {
 		lock_buffer(bh);
 		ret = (hdr->h_checksum == ext4_xattr_block_csum(inode,
 							bh->b_blocknr, hdr));
@@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
 static void ext4_xattr_block_csum_set(struct inode *inode,
 				      struct buffer_head *bh)
 {
-	if (ext4_has_metadata_csum(inode->i_sb))
+	if (ext4_has_feature_metadata_csum(inode->i_sb))
 		BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
 						bh->b_blocknr, BHDR(bh));
 }
@@ -251,6 +251,10 @@ check_xattrs(struct inode *inode, struct buffer_head *bh,
 			err_str = "invalid ea_ino";
 			goto errout;
 		}
+		if (ea_ino && !size) {
+			err_str = "invalid size in ea xattr";
+			goto errout;
+		}
 		if (size > EXT4_XATTR_SIZE_MAX) {
 			err_str = "e_value size too large";
 			goto errout;
@@ -308,7 +312,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
 	__ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
 
-static inline int
+int
 __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
 		    void *end, const char *function, unsigned int line)
 {
@@ -316,9 +320,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
 			     function, line);
 }
 
-#define xattr_check_inode(inode, header, end) \
-	__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
-
 static int
 xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
 		 void *end, int name_index, const char *name, int sorted)
@@ -341,7 +342,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
 		cmp = name_len - entry->e_name_len;
 		if (!cmp)
 			cmp = memcmp(name, entry->e_name, name_len);
-		if (cmp <= 0 && (sorted || cmp == 0))
+		if (!cmp || (cmp < 0 && sorted))
 			break;
 	}
 	*pentry = entry;
@@ -351,12 +352,12 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
 static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi,
 				 const void *buffer, size_t size)
 {
-	return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+	return ext4_chksum(sbi->s_csum_seed, buffer, size);
 }
 
 static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
 {
-	return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
+	return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
 		(u32) inode_peek_iversion_raw(ea_inode);
 }
 
@@ -368,12 +369,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
 
 static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
 {
-	return (u32)ea_inode->i_atime.tv_sec;
+	return (u32) inode_get_atime_sec(ea_inode);
 }
 
 static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
 {
-	ea_inode->i_atime.tv_sec = hash;
+	inode_set_atime(ea_inode, hash, 0);
 }
 
 /*
@@ -418,7 +419,7 @@ free_bhs:
 	return ret;
 }
 
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))
 
 static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
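The get_ref/set_ref and get_hash/set_hash pairs above rely on EA inodes having no real use for ctime, i_version or atime, so those fields can carry a 64-bit reference count and a 32-bit content hash instead. The packing, reduced to a toy (struct fake_ea_inode is an assumption for illustration):

    #include <stdint.h>
    #include <stdio.h>

    struct fake_ea_inode {
    	uint32_t ctime_sec;	/* stands in for inode_{get,set}_ctime_sec */
    	uint32_t iversion;	/* stands in for inode_*_iversion_raw */
    };

    /* High 32 bits of the refcount live in ctime seconds, low 32 bits in
     * the raw iversion -- same split as ext4_xattr_inode_{get,set}_ref. */
    static void set_ref(struct fake_ea_inode *i, uint64_t ref)
    {
    	i->ctime_sec = (uint32_t)(ref >> 32);
    	i->iversion = (uint32_t)ref;
    }

    static uint64_t get_ref(const struct fake_ea_inode *i)
    {
    	return ((uint64_t)i->ctime_sec << 32) | i->iversion;
    }

    int main(void)
    {
    	struct fake_ea_inode ino;

    	set_ref(&ino, 0x100000002ULL);
    	printf("0x%llx\n", (unsigned long long)get_ref(&ino)); /* 0x100000002 */
    	return 0;
    }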
 				 u32 ea_inode_hash, struct inode **ea_inode)
@@ -458,7 +459,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 		ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
 		ext4_xattr_inode_set_ref(inode, 1);
 	} else {
-		inode_lock(inode);
+		inode_lock_nested(inode, I_MUTEX_XATTR);
 		inode->i_flags |= S_NOQUOTA;
 		inode_unlock(inode);
 	}
@@ -649,10 +650,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 		return error;
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
-	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-	error = xattr_check_inode(inode, header, end);
-	if (error)
-		goto cleanup;
+	end = ITAIL(inode, raw_inode);
 	entry = IFIRST(header);
 	error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
 	if (error)
@@ -701,7 +699,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
 {
 	int error;
 
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
 		return -EIO;
 
 	if (strlen(name) > 255)
@@ -783,7 +781,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc;
-	void *end;
 	int error;
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
@@ -793,14 +790,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 		return error;
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
-	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-	error = xattr_check_inode(inode, header, end);
-	if (error)
-		goto cleanup;
 	error = ext4_xattr_list_entries(dentry, IFIRST(header),
 					buffer, buffer_size);
-cleanup:
 	brelse(iloc.bh);
 	return error;
 }
@@ -868,7 +860,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_xattr_entry *entry;
 	qsize_t ea_inode_refs = 0;
-	void *end;
 	int ret;
 
 	lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
@@ -879,10 +870,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
 			goto out;
 		raw_inode = ext4_raw_inode(&iloc);
 		header = IHDR(inode, raw_inode);
-		end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-		ret = xattr_check_inode(inode, header, end);
-		if (ret)
-			goto out;
 
 		for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
 		     entry = EXT4_XATTR_NEXT(entry))
@@ -979,7 +966,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
 	 * so we need to reserve credits for this eventuality
 	 */
 	if (inode && ext4_has_inline_data(inode))
-		credits += ext4_writepage_trans_blocks(inode) + 1;
+		credits += ext4_chunk_trans_extent(inode, 1) + 1;
 
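A note on xattr_find_entry(), whose comparison was reworked earlier in this file: entries are ordered by name length and then memcmp(), so a sorted table lets the search stop at the first larger entry instead of scanning to the end. A sketch of that early-exit convention with illustrative names:

    #include <stdio.h>
    #include <string.h>

    /* Compare by (name length, then memcmp). In a sorted table the loop can
     * stop at the first entry that compares greater -- mirroring
     * xattr_find_entry()'s "if (!cmp || (cmp < 0 && sorted)) break;". */
    static int find_entry(const char *const *tab, int n, const char *name,
    		      int sorted)
    {
    	size_t name_len = strlen(name);

    	for (int i = 0; i < n; i++) {
    		long cmp = (long)name_len - (long)strlen(tab[i]);

    		if (!cmp)
    			cmp = memcmp(name, tab[i], name_len);
    		if (!cmp)
    			return i;	/* exact match */
    		if (cmp < 0 && sorted)
    			break;		/* passed the slot: not present */
    	}
    	return -1;
    }

    int main(void)
    {
    	/* Length-then-lexicographic order. */
    	const char *tab[] = { "acl", "user", "trusted" };

    	printf("%d %d\n", find_entry(tab, 3, "user", 1),
    	       find_entry(tab, 3, "uzzz", 1)); /* 1 -1 */
    	return 0;
    }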
 	/* We are done if ea_inode feature is not enabled. */
 	if (!ext4_has_feature_ea_inode(sb))
@@ -1036,23 +1023,27 @@ static int ext4_xattr_inode_update_ref(handle_t *handle,
 					struct inode *ea_inode, int ref_change)
 {
 	struct ext4_iloc iloc;
-	s64 ref_count;
+	u64 ref_count;
 	int ret;
 
-	inode_lock(ea_inode);
+	inode_lock_nested(ea_inode, I_MUTEX_XATTR);
 
 	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
 	if (ret)
 		goto out;
 
 	ref_count = ext4_xattr_inode_get_ref(ea_inode);
+	if ((ref_count == 0 && ref_change < 0) ||
+	    (ref_count == U64_MAX && ref_change > 0)) {
+		ext4_error_inode(ea_inode, __func__, __LINE__, 0,
+			"EA inode %lu ref wraparound: ref_count=%lld ref_change=%d",
+			ea_inode->i_ino, ref_count, ref_change);
+		ret = -EFSCORRUPTED;
+		goto out;
+	}
 	ref_count += ref_change;
 	ext4_xattr_inode_set_ref(ea_inode, ref_count);
 
 	if (ref_change > 0) {
-		WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
-			  ea_inode->i_ino, ref_count);
-
 		if (ref_count == 1) {
 			WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
 				  ea_inode->i_ino, ea_inode->i_nlink);
@@ -1061,9 +1052,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle,
 			ext4_orphan_del(handle, ea_inode);
 		}
 	} else {
-		WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
-			  ea_inode->i_ino, ref_count);
-
 		if (ref_count == 0) {
 			WARN_ONCE(ea_inode->i_nlink != 1,
 				  "EA inode %lu i_nlink=%u",
@@ -1176,15 +1164,28 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 {
 	struct inode *ea_inode;
 	struct ext4_xattr_entry *entry;
+	struct ext4_iloc iloc;
 	bool dirty = false;
 	unsigned int ea_ino;
 	int err;
 	int credits;
+	void *end;
+
+	if (block_csum)
+		end = (void *)bh->b_data + bh->b_size;
+	else {
+		err = ext4_get_inode_loc(parent, &iloc);
+		if (err) {
+			EXT4_ERROR_INODE(parent, "parent inode loc (error %d)", err);
+			return;
+		}
+		end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
+	}
 
 	/* One credit for dec ref on ea_inode, one for orphan list addition, */
 	credits = 2 + extra_credits;
 
-	for (entry = first; !IS_LAST_ENTRY(entry);
+	for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
 	     entry = EXT4_XATTR_NEXT(entry)) {
 		if (!entry->e_value_inum)
 			continue;
@@ -1433,6 +1434,12 @@ retry:
 			goto out;
 
 		memcpy(bh->b_data, buf, csize);
+		/*
+		 * Zero out block tail to avoid writing uninitialized memory
+		 * to disk.
+		 */
+		if (csize < blocksize)
+			memset(bh->b_data + csize, 0, blocksize - csize);
 		set_buffer_uptodate(bh);
 		ext4_handle_dirty_metadata(handle, ea_inode, bh);
 
@@ -1532,7 +1539,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 		WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
 			     !(current->flags & PF_MEMALLOC_NOFS));
 
-		ea_data = kvmalloc(value_len, GFP_KERNEL);
+		ea_data = kvmalloc(value_len, GFP_NOFS);
 		if (!ea_data) {
 			mb_cache_entry_put(ea_inode_cache, ce);
 			return NULL;
 		}
@@ -1565,46 +1572,49 @@
 /*
  * Add value of the EA in an inode.
  */
-static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
-					  const void *value, size_t value_len,
-					  struct inode **ret_inode)
+static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
+		struct inode *inode, const void *value, size_t value_len)
 {
 	struct inode *ea_inode;
 	u32 hash;
 	int err;
 
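The new wraparound check above refuses to decrement a zero refcount or increment a saturated one, treating either as corruption rather than letting the u64 wrap silently. Stripped to its core (EUCLEAN stands in here for the kernel-only -EFSCORRUPTED):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Guard a u64 reference count against wraparound *before* applying the
     * delta, as ext4_xattr_inode_update_ref() now does. */
    static int update_ref(uint64_t *ref, int change)
    {
    	if ((*ref == 0 && change < 0) ||
    	    (*ref == UINT64_MAX && change > 0))
    		return -EUCLEAN;	/* report corruption, don't wrap */
    	*ref += change;
    	return 0;
    }

    int main(void)
    {
    	uint64_t ref = 0;

    	printf("%d\n", update_ref(&ref, -1));	/* rejected: would wrap */
    	printf("%d\n", update_ref(&ref, +1));	/* ok: ref becomes 1 */
    	return 0;
    }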
+	/* Account inode & space to quota even if sharing... */
+	err = ext4_xattr_inode_alloc_quota(inode, value_len);
+	if (err)
+		return ERR_PTR(err);
+
 	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
 	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
 	if (ea_inode) {
 		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
-		if (err) {
-			iput(ea_inode);
-			return err;
-		}
-
-		*ret_inode = ea_inode;
-		return 0;
+		if (err)
+			goto out_err;
+		return ea_inode;
 	}
 
 	/* Create an inode for the EA value */
 	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
-	if (IS_ERR(ea_inode))
-		return PTR_ERR(ea_inode);
+	if (IS_ERR(ea_inode)) {
+		ext4_xattr_inode_free_quota(inode, NULL, value_len);
+		return ea_inode;
+	}
 
 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
 	if (err) {
 		if (ext4_xattr_inode_dec_ref(handle, ea_inode))
 			ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
-		iput(ea_inode);
-		return err;
+		goto out_err;
 	}
 
 	if (EA_INODE_CACHE(inode))
 		mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
 				      ea_inode->i_ino, true /* reusable */);
-
-	*ret_inode = ea_inode;
-	return 0;
+	return ea_inode;
+out_err:
+	iput(ea_inode);
+	ext4_xattr_inode_free_quota(inode, NULL, value_len);
+	return ERR_PTR(err);
 }
 
 /*
@@ -1616,6 +1626,7 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
 static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 				struct ext4_xattr_search *s,
 				handle_t *handle, struct inode *inode,
+				struct inode *new_ea_inode,
 				bool is_block)
 {
 	struct ext4_xattr_entry *last, *next;
@@ -1623,7 +1634,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	size_t min_offs = s->end - s->base, name_len = strlen(i->name);
 	int in_inode = i->in_inode;
 	struct inode *old_ea_inode = NULL;
-	struct inode *new_ea_inode = NULL;
 	size_t old_size, new_size;
 	int ret;
 
@@ -1708,43 +1718,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 			old_ea_inode = NULL;
 			goto out;
 		}
-	}
-	if (i->value && in_inode) {
-		WARN_ON_ONCE(!i->value_len);
-
-		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
-		if (ret)
-			goto out;
-
-		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
-						     i->value_len,
-						     &new_ea_inode);
-		if (ret) {
-			new_ea_inode = NULL;
-			ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
-			goto out;
-		}
-	}
-
 	if (old_ea_inode) {
 		/* We are ready to release ref count on the old_ea_inode. */
 		ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
-		if (ret) {
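The "(void *)entry < end" guard added to ext4_xattr_inode_dec_ref_all() bounds a walk over variable-length records, so a corrupted length field cannot carry the cursor past the buffer. The same idea in miniature (struct rec and walk() are illustrative, not kernel types):

    #include <stdint.h>
    #include <stdio.h>

    struct rec {
    	uint8_t len;	/* total record size; 0 terminates the list */
    	uint8_t data[];
    };

    /* Walk variable-length records with an explicit end bound. */
    static void walk(const uint8_t *buf, const uint8_t *end)
    {
    	const struct rec *r = (const struct rec *)buf;

    	while ((const uint8_t *)r < end && r->len) {
    		printf("record of %u bytes\n", r->len);
    		r = (const struct rec *)((const uint8_t *)r + r->len);
    	}
    }

    int main(void)
    {
    	/* The second record lies about its size; the end bound still
    	 * stops the walk instead of reading out of bounds. */
    	uint8_t buf[8] = { 2, 'a', 200, 'b', 0, 0, 0, 0 };

    	walk(buf, buf + sizeof(buf));
    	return 0;
    }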
-			/* Release newly required ref count on new_ea_inode. */
-			if (new_ea_inode) {
-				int err;
-
-				err = ext4_xattr_inode_dec_ref(handle,
-							       new_ea_inode);
-				if (err)
-					ext4_warning_inode(new_ea_inode,
-						"dec ref new_ea_inode err=%d",
-						err);
-				ext4_xattr_inode_free_quota(inode, new_ea_inode,
-							    i->value_len);
-			}
+		if (ret)
 			goto out;
-		}
 
 		ext4_xattr_inode_free_quota(inode, old_ea_inode,
 					    le32_to_cpu(here->e_value_size));
@@ -1868,7 +1846,6 @@ update_hash:
 	ret = 0;
 out:
 	iput(old_ea_inode);
-	iput(new_ea_inode);
 	return ret;
 }
 
@@ -1931,9 +1908,21 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	size_t old_ea_inode_quota = 0;
 	unsigned int ea_ino;
-
 #define header(x) ((struct ext4_xattr_header *)(x))
 
+	/* If we need EA inode, prepare it before locking the buffer */
+	if (i->value && i->in_inode) {
+		WARN_ON_ONCE(!i->value_len);
+
+		ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+					i->value, i->value_len);
+		if (IS_ERR(ea_inode)) {
+			error = PTR_ERR(ea_inode);
+			ea_inode = NULL;
+			goto cleanup;
+		}
+	}
+
 	if (s->base) {
 		int offset = (char *)s->here - bs->bh->b_data;
 
@@ -1942,6 +1931,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 						      EXT4_JTR_NONE);
 		if (error)
 			goto cleanup;
+
 		lock_buffer(bs->bh);
 
 		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
@@ -1968,7 +1958,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			}
 			ea_bdebug(bs->bh, "modifying in-place");
 			error = ext4_xattr_set_entry(i, s, handle, inode,
-						     true /* is_block */);
+						ea_inode, true /* is_block */);
 			ext4_xattr_block_csum_set(inode, bs->bh);
 			unlock_buffer(bs->bh);
 			if (error == -EFSCORRUPTED)
@@ -2036,33 +2026,22 @@ clone_block:
 		s->end = s->base + sb->s_blocksize;
 	}
 
-	error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */);
+	error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
+				     true /* is_block */);
 	if (error == -EFSCORRUPTED)
 		goto bad_block;
 	if (error)
 		goto cleanup;
 
-	if (i->value && s->here->e_value_inum) {
-		/*
-		 * A ref count on ea_inode has been taken as part of the call to
-		 * ext4_xattr_set_entry() above. We would like to drop this
-		 * extra ref but we have to wait until the xattr block is
-		 * initialized and has its own ref count on the ea_inode.
-		 */
-		ea_ino = le32_to_cpu(s->here->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino,
-					      le32_to_cpu(s->here->e_hash),
-					      &ea_inode);
-		if (error) {
-			ea_inode = NULL;
+inserted:
+	if (!IS_LAST_ENTRY(s->first)) {
+		new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce);
+		if (IS_ERR(new_bh)) {
+			error = PTR_ERR(new_bh);
+			new_bh = NULL;
 			goto cleanup;
 		}
-	}
-inserted:
-	if (!IS_LAST_ENTRY(s->first)) {
-		new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
-						     &ce);
 		if (new_bh) {
 			/* We found an identical block in the cache. */
 			if (new_bh == bs->bh)
@@ -2211,17 +2190,16 @@ getblk_failed:
 cleanup:
 	if (ea_inode) {
-		int error2;
-
-		error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
-		if (error2)
-			ext4_warning_inode(ea_inode, "dec ref error=%d",
-					   error2);
+		if (error) {
+			int error2;
 
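Hoisting ext4_xattr_inode_lookup_create() out of the locked region follows a general rule: do work that may sleep or fail before taking the lock, and keep only the publication step inside it. A small pthread illustration of that ordering (publish() and the strdup stand-in are assumptions for the sketch, not kernel code):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static char *shared;

    /* Prepare the expensive resource *before* taking the lock, then only
     * publish under the lock -- the same ordering the xattr code adopts by
     * creating the EA inode before lock_buffer(). */
    static int publish(const char *value)
    {
    	char *copy = strdup(value);	/* may fail: do it unlocked */

    	if (!copy)
    		return -1;
    	pthread_mutex_lock(&lock);
    	free(shared);
    	shared = copy;			/* short critical section */
    	pthread_mutex_unlock(&lock);
    	return 0;
    }

    int main(void)
    {
    	publish("hello");
    	printf("%s\n", shared);
    	free(shared);
    	return 0;
    }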
-		/* If there was an error, revert the quota charge. */
-		if (error)
+			error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+			if (error2)
+				ext4_warning_inode(ea_inode, "dec ref error=%d",
+						   error2);
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
+		}
 		iput(ea_inode);
 	}
 	if (ce)
@@ -2258,11 +2236,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 	header = IHDR(inode, raw_inode);
 	is->s.base = is->s.first = IFIRST(header);
 	is->s.here = is->s.first;
-	is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	is->s.end = ITAIL(inode, raw_inode);
 	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
-		error = xattr_check_inode(inode, header, is->s.end);
-		if (error)
-			return error;
 		/* Find the named attribute. */
 		error = xattr_find_entry(inode, &is->s.here, is->s.end,
 					 i->name_index, i->name, 0);
@@ -2279,14 +2254,38 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 {
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_xattr_search *s = &is->s;
+	struct inode *ea_inode = NULL;
 	int error;
 
 	if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
 		return -ENOSPC;
 
-	error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
-	if (error)
+	/* If we need EA inode, prepare it before locking the buffer */
+	if (i->value && i->in_inode) {
+		WARN_ON_ONCE(!i->value_len);
+
+		ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+					i->value, i->value_len);
+		if (IS_ERR(ea_inode))
+			return PTR_ERR(ea_inode);
+	}
+	error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
+				     false /* is_block */);
+	if (error) {
+		if (ea_inode) {
+			int error2;
+
+			error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+			if (error2)
+				ext4_warning_inode(ea_inode, "dec ref error=%d",
+						   error2);
+
+			ext4_xattr_inode_free_quota(inode, ea_inode,
+						    i_size_read(ea_inode));
+			iput(ea_inode);
+		}
 		return error;
+	}
 	header = IHDR(inode, ext4_raw_inode(&is->iloc));
 	if (!IS_LAST_ENTRY(s->first)) {
 		header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
@@ -2295,6 +2294,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 		header->h_magic = cpu_to_le32(0);
 		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
+	iput(ea_inode);
 	return 0;
 }
 
@@ -2557,6 +2557,8 @@ retry:
 		error = ext4_xattr_set_handle(handle, inode, name_index, name,
 					      value, value_len, flags);
+		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
+					handle);
 		error2 = ext4_journal_stop(handle);
 		if (error == -ENOSPC &&
 		    ext4_should_retry_alloc(sb, &retries))
@@ -2564,7 +2566,6 @@ retry:
 		if (error == 0)
 			error = error2;
 	}
-	ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL);
 
 	return error;
 }
@@ -2783,14 +2784,10 @@ retry:
 	 */
 	base = IFIRST(header);
-	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	end = ITAIL(inode, raw_inode);
 	min_offs = end - base;
 	total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
 
-	error = xattr_check_inode(inode, header, end);
-	if (error)
-		goto cleanup;
-
 	ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
 	if (ifree >= isize_diff)
 		goto shift;
@@ -2877,33 +2874,31 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 	if (*ea_inode_array == NULL) {
 		/*
 		 * Start with 15 inodes, so it fits into a power-of-two size.
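The reworked cleanup above drops the EA inode reference and its quota charge only when the operation failed; on success the new xattr block owns them. That ownership-transfer shape, in plain C (set_value() is an illustrative name):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* On success the caller keeps the resource; only the failure path
     * unwinds it -- the shape the cleanup label now has. */
    static int set_value(char **out, int fail)
    {
    	int error = 0;
    	char *res = malloc(16);		/* stands in for the EA inode ref */

    	if (!res)
    		return -ENOMEM;
    	if (fail) {
    		error = -EIO;
    		goto cleanup;
    	}
    	*out = res;			/* success: ownership transferred */
    cleanup:
    	if (error)
    		free(res);		/* unwind only when the op failed */
    	return error;
    }

    int main(void)
    {
    	char *v = NULL;

    	printf("%d %d\n", set_value(&v, 0), set_value(&v, 1)); /* 0 -5 */
    	free(v);
    	return 0;
    }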
-		 * If *ea_inode_array is NULL, this is essentially offsetof()
 		 */
-		(*ea_inode_array) =
-			kmalloc(offsetof(struct ext4_xattr_inode_array,
-					 inodes[EIA_MASK]),
-				GFP_NOFS);
+		(*ea_inode_array) = kmalloc(
+			struct_size(*ea_inode_array, inodes, EIA_MASK),
+			GFP_NOFS);
 		if (*ea_inode_array == NULL)
 			return -ENOMEM;
 		(*ea_inode_array)->count = 0;
 	} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
 		/* expand the array once all 15 + n * 16 slots are full */
 		struct ext4_xattr_inode_array *new_array = NULL;
-		int count = (*ea_inode_array)->count;
 
-		/* if new_array is NULL, this is essentially offsetof() */
 		new_array = kmalloc(
-				offsetof(struct ext4_xattr_inode_array,
-					 inodes[count + EIA_INCR]),
-				GFP_NOFS);
+			struct_size(*ea_inode_array, inodes,
+				    (*ea_inode_array)->count + EIA_INCR),
+			GFP_NOFS);
 		if (new_array == NULL)
 			return -ENOMEM;
 		memcpy(new_array, *ea_inode_array,
-		       offsetof(struct ext4_xattr_inode_array, inodes[count]));
+		       struct_size(*ea_inode_array, inodes,
+				   (*ea_inode_array)->count));
 		kfree(*ea_inode_array);
 		*ea_inode_array = new_array;
 	}
-	(*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode;
+	(*ea_inode_array)->count++;
+	(*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
 	return 0;
 }
 
@@ -3034,8 +3029,6 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 *
 * Create a new entry in the extended attribute block cache, and insert
 * it unless such an entry is already in the cache.
- *
- * Returns 0, or a negative error number on failure.
 */
 static void
 ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
@@ -3063,8 +3056,7 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
 *
 * Compare two extended attribute blocks for equality.
 *
- * Returns 0 if the blocks are equal, 1 if they differ, and
- * a negative error number on errors.
+ * Returns 0 if the blocks are equal, 1 if they differ.
 */
 static int
 ext4_xattr_cmp(struct ext4_xattr_header *header1,
@@ -3103,8 +3095,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
 *
 * Find an identical extended attribute block.
 *
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
+ * Returns a pointer to the block found, or NULL if such a block was not
+ * found, or an error pointer if an error occurred while reading ea block.
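struct_size() used above computes sizeof(header) plus n array elements, with overflow checking in the kernel; __counted_by() (see xattr.h below) makes the compiler treat count as the flexible array's bound, which is why count is now bumped before the new slot is written. An approximate userspace rendering (ARR_SIZE is a simplified, non-overflow-checked stand-in for struct_size()):

    #include <stdio.h>
    #include <stdlib.h>

    struct arr {
    	unsigned int count;
    	int items[];		/* kernel: __counted_by(count) */
    };

    /* Simplified stand-in for struct_size(a, items, n). */
    #define ARR_SIZE(n) (sizeof(struct arr) + (n) * sizeof(int))

    static struct arr *arr_push(struct arr *a, int v)
    {
    	unsigned int n = a ? a->count : 0;
    	struct arr *na = realloc(a, ARR_SIZE(n + 1));

    	if (!na)
    		return NULL;
    	/* With __counted_by, count is the checked bound, so bump it
    	 * before touching the new slot -- mirroring the reordered stores. */
    	na->count = n + 1;
    	na->items[na->count - 1] = v;
    	return na;
    }

    int main(void)
    {
    	struct arr *a = arr_push(NULL, 42);

    	if (a)
    		printf("count=%u last=%d\n", a->count, a->items[0]);
    	free(a);
    	return 0;
    }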
 */
 static struct buffer_head *
 ext4_xattr_block_cache_find(struct inode *inode,
@@ -3126,11 +3118,11 @@ ext4_xattr_block_cache_find(struct inode *inode,
 		bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
 		if (IS_ERR(bh)) {
-			if (PTR_ERR(bh) == -ENOMEM)
-				return NULL;
-			bh = NULL;
-			EXT4_ERROR_INODE(inode, "block %lu read error",
-					 (unsigned long)ce->e_value);
+			if (PTR_ERR(bh) != -ENOMEM)
+				EXT4_ERROR_INODE(inode, "block %lu read error",
+						 (unsigned long)ce->e_value);
+			mb_cache_entry_put(ea_block_cache, ce);
+			return bh;
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
 			*pce = ce;
 			return bh;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 824faf0b15a8..1fedf44d4fb6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -32,8 +32,7 @@ struct ext4_xattr_header {
 	__le32	h_refcount;	/* reference count */
 	__le32	h_blocks;	/* number of disk blocks used */
 	__le32	h_hash;		/* hash value of all attributes */
-	__le32	h_checksum;	/* crc32c(uuid+id+xattrblock) */
-				/* id = inum if refcount=1, blknum otherwise */
+	__le32	h_checksum;	/* crc32c(uuid+blknum+xattrblock) */
 	__u32	h_reserved[3];	/* zero right now */
 };
 
@@ -68,6 +67,9 @@ struct ext4_xattr_entry {
 	((void *)raw_inode + \
 	 EXT4_GOOD_OLD_INODE_SIZE + \
 	 EXT4_I(inode)->i_extra_isize))
+#define ITAIL(inode, raw_inode) \
+	((void *)(raw_inode) + \
+	 EXT4_SB((inode)->i_sb)->s_inode_size)
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
 /*
@@ -130,8 +132,8 @@ struct ext4_xattr_ibody_find {
 };
 
 struct ext4_xattr_inode_array {
-	unsigned int count;		/* # of used items in the array */
-	struct inode *inodes[];
+	unsigned int count;
+	struct inode *inodes[] __counted_by(count);
 };
 
 extern const struct xattr_handler ext4_xattr_user_handler;
@@ -193,7 +195,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 				      struct ext4_inode *raw_inode, handle_t *handle);
 extern void ext4_evict_ea_inode(struct inode *inode);
 
-extern const struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler * const ext4_xattr_handlers[];
 
 extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 				 struct ext4_xattr_ibody_find *is);
@@ -207,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 extern struct mb_cache *ext4_xattr_create_cache(void);
 extern void ext4_xattr_destroy_cache(struct mb_cache *);
 
+extern int
+__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
+		    void *end, const char *function, unsigned int line);
+
+#define xattr_check_inode(inode, header, end) \
+	__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
+
 #ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 			      struct inode *dir, const struct qstr *qstr);
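ext4_xattr_block_cache_find() now distinguishes "not found" (NULL) from "failed to read" by returning an error pointer: the kernel's ERR_PTR()/IS_ERR() convention of encoding a small negative errno in the pointer value itself. A self-contained userspace imitation (cache_find() is illustrative):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Encode a negative errno in a pointer, like the kernel's ERR_PTR(). */
    static inline void *ERR_PTR(long error)
    {
    	return (void *)(intptr_t)error;
    }

    static inline long PTR_ERR(const void *ptr)
    {
    	return (long)(intptr_t)ptr;
    }

    /* The top MAX_ERRNO addresses are reserved for encoded errors. */
    static inline int IS_ERR(const void *ptr)
    {
    	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
    }

    static void *cache_find(int simulate_io_error)
    {
    	if (simulate_io_error)
    		return ERR_PTR(-EIO);	/* read failure: report it */
    	return NULL;			/* plain "no identical block found" */
    }

    int main(void)
    {
    	void *bh = cache_find(1);

    	if (IS_ERR(bh))
    		printf("error %ld\n", PTR_ERR(bh));	/* error -5 */
    	else
    		printf("found: %p\n", bh);
    	return 0;
    }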
