diff options
Diffstat (limited to 'fs/ext4')
40 files changed, 2546 insertions, 768 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index d9beca1653c5..8fdfcd3c3e04 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux ext4-filesystem routines. # diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 3ec0e46de95f..fb50f9aa6ead 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/acl.c * @@ -183,7 +184,7 @@ ext4_get_acl(struct inode *inode, int type) */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, - struct posix_acl *acl) + struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; @@ -193,13 +194,6 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); - } break; case ACL_TYPE_DEFAULT: @@ -218,11 +212,12 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } error = ext4_xattr_set_handle(handle, inode, name_index, "", - value, size, 0); + value, size, xattr_flags); kfree(value); - if (!error) + if (!error) { set_cached_acl(inode, type, acl); + } return error; } @@ -231,18 +226,38 @@ int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; - int error, retries = 0; + int error, credits, retries = 0; + size_t acl_size = acl ? 
ext4_acl_size(acl->a_count) : 0; + umode_t mode = inode->i_mode; + int update_mode = 0; error = dquot_initialize(inode); if (error) return error; retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); + error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, + &credits); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); - error = __ext4_set_acl(handle, inode, type, acl); + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + goto out_stop; + update_mode = 1; + } + + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); + if (!error && update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + ext4_mark_inode_dirty(handle, inode); + } +out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -267,13 +282,13 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, - default_acl); + default_acl, XATTR_CREATE); posix_acl_release(default_acl); } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, - acl); + acl, XATTR_CREATE); posix_acl_release(acl); } return error; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index da2c79577d72..a48fc5ae2701 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* File: fs/ext4/acl.h diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e04ec868e37e..d5ddfb96c83c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 4a606afb171f..f63e028c638c 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: 
GPL-2.0 /* * linux/fs/ext4/bitmap.c * diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fdb19543af1e..bee888e0e2db 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/block_validity.c * diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e8b365000d73..d5babc9f222b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * @@ -411,7 +412,7 @@ static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, { struct dir_private_info *p; - p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; p->curr_hash = pos2maj_hash(filp, pos); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 32191548abed..58a0304566db 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ext4.h * @@ -838,13 +839,11 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) { if (unlikely(sizeof(time->tv_sec) > 4 && (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0) + +#if 1 /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. We assume that by kernel version 4.20, - * everyone will have run fsck over the affected - * filesystems to correct the problem. (This - * backwards compatibility may be removed before this - * time, at the discretion of the ext4 developers.) + * bits 1,1. (This backwards compatibility may be removed + * at the discretion of the ext4 developers.) */ u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) @@ -961,7 +960,7 @@ struct ext4_inode_info { /* * i_block_group is the number of the block group which contains * this file's inode. 
Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to + * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ @@ -1049,10 +1048,8 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ - /* In case of bigalloc, these refer to clusters rather than blocks */ + /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; @@ -1114,6 +1111,7 @@ struct ext4_inode_info { /* * Mount flags set via mount options or defaults */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ @@ -1444,6 +1442,8 @@ struct ext4_sb_info { unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ /* tunables */ unsigned long s_stripe; @@ -1516,7 +1516,8 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb_cache *s_mb_cache; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ @@ -1526,6 +1527,7 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. 
*/ struct percpu_rw_semaphore s_journal_flag_rwsem; + struct dax_device *s_daxdev; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1565,6 +1567,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1797,10 +1800,12 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ - EXT4_FEATURE_INCOMPAT_CSUM_SEED) + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -2016,7 +2021,8 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) -#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ @@ -2098,6 +2104,12 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + /* * This structure is stuffed into the struct file's private_data field * for directories. 
It is where we put information so that we can do @@ -2126,6 +2138,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + /* * Timeout and state flag for lazy initialization inode thread. */ @@ -2389,16 +2411,17 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, - uid_t *owner, int handle_type, - unsigned int line_no, int nblocks); + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); -#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ - 0, 0, 0) + i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ - (type), __LINE__, (nblocks)) + 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); @@ -2433,11 +2456,14 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb, extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); /* inode.c */ int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode 
*inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, @@ -2704,19 +2730,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_group_desc_csum(struct super_block *sb) -{ - return ext4_has_feature_gdt_csum(sb) || - EXT4_SB(sb)->s_chksum_driver != NULL; -} - static inline int ext4_has_metadata_csum(struct super_block *sb) { WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && !EXT4_SB(sb)->s_chksum_driver); - return (EXT4_SB(sb)->s_chksum_driver != NULL); + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); } + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | @@ -2756,13 +2783,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) { - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); - else - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) @@ -3047,7 +3076,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode 
*inode, struct buffer_head *bh); #define S_SHIFT 12 -static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index dd106b1d5d89..2d593201cf7a 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Interface between ext4 and JBD */ @@ -47,7 +48,7 @@ static int ext4_journal_check_start(struct super_block *sb) if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return -EIO; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index f97611171023..48143e32411c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -77,7 +77,14 @@ #define EXT4_RESERVE_TRANS_BLOCKS 12U -#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 +/* + * Number of credits needed if we need to insert an entry into a + * directory. For each new index block, we need 4 blocks (old index + * block, new index block, bitmap block, bg summary). For normal + * htree directories there are 2 levels; if the largedir feature + * enabled it's 3 levels. 
+ */ +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was @@ -104,20 +111,6 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) -static inline int ext4_jbd2_credits_xattr(struct inode *inode) -{ - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); - - /* - * In case of inline data, we may push out the data to a block, - * so we need to reserve credits for this eventuality - */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - return credits; -} - - /* * Ext4 handle operation types -- for logging purposes */ @@ -234,6 +227,9 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD. 
*/ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e36508610b7..97f0fd06728d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) static inline int get_default_free_blocks_flags(struct inode *inode) { - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; @@ -4651,7 +4652,7 @@ retry: static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, - int flags, int mode) + int flags) { struct inode *inode = file_inode(file); handle_t *handle; @@ -4814,7 +4815,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits) >> blkbits, (round_up((offset + len), 1 << blkbits) - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags, mode); + new_size, flags); if (ret) goto out_dio; @@ -4840,7 +4841,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + flags); up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; @@ -4975,8 +4976,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5836,7 +5836,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) - next2 = e1_blk; + next2 = e2_blk; /* Do we have something to swap */ if (next1 == 
EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto finish; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e7f12a204cbc..763ef185dd17 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.c * diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f7aa24f4642d..ca90fc96f47e 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/ext4/extents_status.h * diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 02ce7e7bbdf5..5cb9aa3ad249 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * @@ -37,7 +38,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock_shared(inode); + if (!inode_trylock_shared(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock_shared(inode); + } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore @@ -179,7 +184,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -215,8 +224,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } - inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -235,9 +251,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from) iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ - if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && - ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) - overwrite = 1; + if (o_direct && !unaligned_aio) { + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + if (ext4_should_dioread_nolock(inode)) + overwrite = 1; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + } ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -260,7 +282,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = vmf->flags & FAULT_FLAG_WRITE; + + /* + * We have to distinguish real writes from writes which will result in a + * COW page; COW writes should *not* poke the journal (the file will not + * be changed). Doing so would cause unintended failures when mounted + * read-only. + * + * We check for VM_SHARED rather than vmf->cow_page since the latter is + * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * other sizes, dax_iomap_fault will handle splitting / fallback so that + * we eventually come back with a COW page. + */ + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); if (write) { sb_start_pagefault(sb); @@ -292,41 +327,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } -/* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() - * handler we check for races agaist truncate. Note that since we cycle through - * i_mmap_sem, we are sure that also any hole punching that began before we - * were called is finished by now and so if it included part of the file we - * are working on, our pte will get unmapped and the check for pte_same() in - * wp_pfn_shared() fails. 
Thus fault gets retried and things work out as - * desired. - */ -static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - loff_t size; - int ret; - - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(sb); - - return ret; -} - static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, - .pfn_mkwrite = ext4_dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops @@ -345,13 +350,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (ext4_encrypted_inode(inode)) { - int err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -376,7 +374,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return -EIO; if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && - !(sb->s_flags & MS_RDONLY))) { + !sb_rdonly(sb))) { sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; /* * Sample where the filesystem has been mounted and @@ -435,6 +433,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } + + filp->f_mode |= FMODE_NOWAIT; return dquot_file_open(inode, filp); } @@ -478,12 +478,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, pagevec_init(&pvec, 0); do { - int i, num; + int i; unsigned long nr_pages; - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 
1; - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - (pgoff_t)num); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &index, end); if (nr_pages == 0) break; @@ -502,9 +501,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, goto out; } - if (page->index > end) - goto out; - lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -521,6 +517,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, lastoff = page_offset(page); bh = head = page_buffers(page); do { + if (lastoff + bh->b_size <= startoff) + goto next; if (buffer_uptodate(bh) || buffer_unwritten(bh)) { if (whence == SEEK_DATA) @@ -535,6 +533,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); goto out; } +next: lastoff += bh->b_size; bh = bh->b_this_page; } while (bh != head); @@ -544,14 +543,10 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); } - /* The no. of pages is less than our desired, we are done. */ - if (nr_pages < num) - break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* There are no pages upto endoff - that would be a hole in there. 
*/ if (whence == SEEK_HOLE && lastoff < endoff) { found = 1; *offset = lastoff; @@ -576,7 +571,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } @@ -639,7 +634,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b19436098837..7ec340898598 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -480,6 +480,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start_fsb; ext4_fsblk_t end_fsb; + ext4_fsblk_t bofs; ext4_fsblk_t eofs; ext4_group_t start_ag; ext4_group_t end_ag; @@ -487,9 +488,12 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_grpblk_t last_cluster; int error = 0; + bofs = le32_to_cpu(sbi->s_es->s_first_data_block); eofs = ext4_blocks_count(sbi->s_es); if (keys[0].fmr_physical >= eofs) return 0; + else if (keys[0].fmr_physical < bofs) + keys[0].fmr_physical = bofs; if (keys[1].fmr_physical >= eofs) keys[1].fmr_physical = eofs - 1; start_fsb = keys[0].fmr_physical; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 9d549608fd30..26a7fe5c4fd3 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/fsync.c * @@ -107,7 +108,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); - if (inode->i_sb->s_flags & MS_RDONLY) { + if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) @@ -124,7 +125,7 @@ int ext4_sync_file(struct file *file, 
loff_t start, loff_t end, int datasync) goto out; } - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; /* diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 38b8a96eb97c..00c6dd29e621 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -148,8 +148,6 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) scp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; @@ -176,8 +174,6 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 98ac2f1f23b3..c5f697a3fad4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ialloc.c * @@ -294,7 +295,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * as writing the quota to disk may need the lock as well. */ dquot_initialize(inode); - ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); @@ -693,24 +693,25 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * somewhat arbitrary...) 
*/ #define RECENTCY_MIN 5 -#define RECENTCY_DIRTY 30 +#define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; - unsigned long dtime, now; - int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - int offset, ret = 0, recentcy = RECENTCY_MIN; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0; + int recentcy = RECENTCY_MIN; + u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; - bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); - if (unlikely(!bh) || !buffer_uptodate(bh)) + if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out. @@ -719,18 +720,45 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); + + /* i_dtime is only 32 bits on disk, but we only care about relative + * times in the range of a few minutes (i.e. 
long enough to sync a + * recently-deleted inode to disk), so using the low 32 bits of the + * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); - now = get_seconds(); + now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; - if (dtime && (dtime < now) && (now < dtime + recentcy)) + if (dtime && time_before32(dtime, now) && + time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } +static int find_inode_bit(struct super_block *sb, ext4_group_t group, + struct buffer_head *bitmap, unsigned long *ino) +{ +next: + *ino = ext4_find_next_zero_bit((unsigned long *) + bitmap->b_data, + EXT4_INODES_PER_GROUP(sb), *ino); + if (*ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, *ino)) { + *ino = *ino + 1; + if (*ino < EXT4_INODES_PER_GROUP(sb)) + goto next; + return 0; + } + + return 1; +} + /* * There are two policies for allocating an inode. 
If the new inode is * a directory, then a forward search is made for a block group with both @@ -743,8 +771,9 @@ out: */ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, - __u32 goal, uid_t *owner, int handle_type, - unsigned int line_no, int nblocks) + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -766,30 +795,69 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + sb = dir->i_sb; + sbi = EXT4_SB(sb); + + if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && + !(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_get_encryption_info(dir); if (err) return ERR_PTR(err); if (!fscrypt_has_encryption_key(dir)) return ERR_PTR(-ENOKEY); - if (!handle) - nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); encrypt = 1; } - sb = dir->i_sb; + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never + * more than 1k. In practice they are under + * 128 bytes. 
+ */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + } + ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); /* * Initialize owners and quota early so that we don't have to account @@ -853,19 +921,13 @@ got_group: /* * Check free inodes count before loading bitmap. */ - if (ext4_free_inodes_count(sb, gdp) == 0) { - if (++group == ngroups) - group = 0; - continue; - } + if (ext4_free_inodes_count(sb, gdp) == 0) + goto next_group; grp = ext4_get_group_info(sb, group); /* Skip groups with already-known suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - if (++group == ngroups) - group = 0; - continue; - } + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); @@ -873,27 +935,20 @@ got_group: if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; - if (++group == ngroups) - group = 0; - continue; + goto next_group; } repeat_in_this_group: - ino = ext4_find_next_zero_bit((unsigned long *) - inode_bitmap_bh->b_data, - EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (!ret2) goto next_group; - if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + + if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); - continue; - } - if ((EXT4_SB(sb)->s_journal == NULL) && - recently_deleted(sb, group, ino)) { - ino++; - goto next_inode; + goto next_group; } + if (!handle) { BUG_ON(nblocks <= 0); handle = 
__ext4_journal_start_sb(dir->i_sb, line_no, @@ -913,11 +968,23 @@ repeat_in_this_group: } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + if (ret2) { + /* Someone already took the bit. Repeat the search + * with lock held. + */ + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (ret2) { + ext4_set_bit(ino, inode_bitmap_bh->b_data); + ret2 = 0; + } else { + ret2 = 1; /* we didn't grab the inode */ + } + } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ -next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: @@ -1053,6 +1120,7 @@ got: /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; @@ -1109,13 +1177,15 @@ got: goto fail_free_drop; } - err = ext4_init_acl(handle, inode, dir); - if (err) - goto fail_free_drop; + if (!(ei->i_flags & EXT4_EA_INODE_FL)) { + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; - err = ext4_init_security(handle, inode, dir, qstr); - if (err) - goto fail_free_drop; + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ @@ -1313,7 +1383,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; /* This should not happen, but just to be sure check this */ - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { ret = 1; goto out; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bc15c2c17633..c32802c956d5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/indirect.c * @@ -829,7 +830,8 @@ static int 
ext4_clear_blocks(handle_t *handle, struct inode *inode, int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 8d141c0c8ff9..28c5c3abddb3 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, /* Compute min_offs. */ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_block && entry->e_value_size) { + if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5cf82d03968c..90afeb7293a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/inode.c * @@ -144,16 +145,12 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, /* * Test whether an inode is a fast symlink. + * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? 
- EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; - - if (ext4_has_inline_data(inode)) - return 0; - - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + return S_ISLNK(inode->i_mode) && inode->i_size && + (inode->i_size < EXT4_N_BLOCKS * 4); } /* @@ -189,6 +186,8 @@ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; + int extra_credits = 3; + struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -213,7 +212,8 @@ void ext4_evict_inode(struct inode *inode) */ if (inode->i_ino != EXT4_JOURNAL_INO && ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_data.nrpages) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -238,8 +238,12 @@ void ext4_evict_inode(struct inode *inode) * protection against it */ sb_start_intwrite(inode->i_sb); + + if (!IS_NOQUOTA(inode)) + extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+3); + ext4_blocks_for_truncate(inode)+extra_credits); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -254,6 +258,16 @@ void ext4_evict_inode(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* + * Set inode->i_size to 0 before calling ext4_truncate(). We need + * special handling of symlinks here because i_size is used to + * determine whether ext4_inode_info->i_data contains symlink data or + * block mappings. Setting i_size to 0 will remove its fast symlink + * status. Erase i_data so that it becomes a valid empty block map. 
+ */ + if (ext4_inode_is_fast_symlink(inode)) + memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -271,25 +285,17 @@ void ext4_evict_inode(struct inode *inode) } } - /* - * ext4_ext_truncate() doesn't reserve any slop when it - * restarts journal transactions; therefore there may not be - * enough credits left in the handle to remove the inode from - * the orphan list and set the dtime field. - */ - if (!ext4_handle_has_enough_credits(handle, 3)) { - err = ext4_journal_extend(handle, 3); - if (err > 0) - err = ext4_journal_restart(handle, 3); - if (err != 0) { - ext4_warning(inode->i_sb, - "couldn't extend journal (err %d)", err); - stop_handle: - ext4_journal_stop(handle); - ext4_orphan_del(NULL, inode); - sb_end_intwrite(inode->i_sb); - goto no_delete; - } + /* Remove xattr references. */ + err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, + extra_credits); + if (err) { + ext4_warning(inode->i_sb, "xattr delete (err %d)", err); +stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); + goto no_delete; } /* @@ -317,6 +323,7 @@ void ext4_evict_inode(struct inode *inode) ext4_free_inode(handle, inode); ext4_journal_stop(handle); sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... 
*/ @@ -710,7 +717,7 @@ out_sem: if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && - !IS_NOQUOTA(inode) && + !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode); @@ -886,7 +893,7 @@ static int ext4_dio_get_block_unwritten_async(struct inode *inode, /* * Get block function for non-AIO DIO writes when we create unwritten extent if * blocks are not allocated yet. The extent will be converted to written - * after IO is complete from ext4_ext_direct_IO() function. + * after IO is complete by ext4_direct_IO_write(). */ static int ext4_dio_get_block_unwritten_sync(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -901,7 +908,7 @@ static int ext4_dio_get_block_unwritten_sync(struct inode *inode, /* * Mark inode as having pending DIO writes to unwritten extents. - * ext4_ext_direct_IO() checks this flag and converts extents to + * ext4_direct_IO_write() checks this flag and converts extents to * written. */ if (!ret && buffer_unwritten(bh_result)) @@ -1009,6 +1016,50 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return ERR_PTR(-EIO); } +/* Read a contiguous batch of blocks. */ +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs) +{ + int i, err; + + for (i = 0; i < bh_count; i++) { + bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); + if (IS_ERR(bhs[i])) { + err = PTR_ERR(bhs[i]); + bh_count = i; + goto out_brelse; + } + } + + for (i = 0; i < bh_count; i++) + /* Note that NULL bhs[i] is valid because of holes. 
*/ + if (bhs[i] && !buffer_uptodate(bhs[i])) + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, + &bhs[i]); + + if (!wait) + return 0; + + for (i = 0; i < bh_count; i++) + if (bhs[i]) + wait_on_buffer(bhs[i]); + + for (i = 0; i < bh_count; i++) { + if (bhs[i] && !buffer_uptodate(bhs[i])) { + err = -EIO; + goto out_brelse; + } + } + return 0; + +out_brelse: + for (i = 0; i < bh_count; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + return err; +} + int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *head, unsigned from, @@ -1670,13 +1721,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; + BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { @@ -1687,7 +1737,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } unlock_page(page); } - index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } } @@ -2298,17 +2347,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, - PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &start, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; - /* Up to 'end' pages must be contiguous */ - BUG_ON(page->index != start); bh = head = page_buffers(page); do { if (lblk < mpd->map.m_lblk) @@ -2353,7 +2398,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_release(&pvec); return err; } - start++; } pagevec_release(&pvec); } @@ -3354,7 +3398,7 @@ static int ext4_releasepage(struct page *page, gfp_t 
wait) static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { - struct block_device *bdev; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; @@ -3423,12 +3467,8 @@ retry: } iomap->flags = 0; - bdev = inode->i_sb->s_bdev; - iomap->bdev = bdev; - if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); - else - iomap->dax_dev = NULL; + iomap->bdev = inode->i_sb->s_bdev; + iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; if (ret == 0) { @@ -3461,7 +3501,6 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - fs_put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; @@ -4712,7 +4751,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - inode->i_size = ext4_isize(raw_inode); + inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4846,6 +4885,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } brelse(iloc.bh); ext4_set_inode_flags(inode); + unlock_new_inode(inode); return inode; @@ -5037,7 +5077,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(raw_inode)) { + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } @@ -5287,7 +5327,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = 
PTR_ERR(handle); goto err_out; } + + /* dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. + */ + down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(inode, attr); + up_read(&EXT4_I(inode)->xattr_sem); + if (error) { ext4_journal_stop(handle); return error; @@ -5307,6 +5354,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; int shrink = (attr->ia_size <= inode->i_size); + if (ext4_encrypted_inode(inode)) { + error = fscrypt_get_encryption_info(inode); + if (error) + return error; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5628,22 +5683,16 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, return err; } -/* - * Expand an inode by new_extra_isize bytes. - * Returns 0 on success or negative error number on failure. - */ -static int ext4_expand_extra_isize(struct inode *inode, - unsigned int new_extra_isize, - struct ext4_iloc iloc, - handle_t *handle) +static int __ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc, + handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + int error; - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) - return 0; - - raw_inode = ext4_raw_inode(&iloc); + raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); @@ -5658,8 +5707,98 @@ static int ext4_expand_extra_isize(struct inode *inode, } /* try to expand with EAs present */ - return ext4_expand_extra_isize_ea(inode, new_extra_isize, - raw_inode, handle); + error = ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); + if (error) { + /* + * Inode size expansion failed; don't try again + */ + *no_expand = 1; + } + + return error; +} + +/* + * Expand an inode by new_extra_isize bytes. 
+ * Returns 0 on success or negative error number on failure. + */ +static int ext4_try_to_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + int no_expand; + int error; + + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) + return -EOVERFLOW; + + /* + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if (ext4_handle_valid(handle) && + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + return -ENOSPC; + + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return -EBUSY; + + error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, + handle, &no_expand); + ext4_write_unlock_xattr(inode, &no_expand); + + return error; +} + +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc) +{ + handle_t *handle; + int no_expand; + int error, rc; + + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + brelse(iloc->bh); + return -EOVERFLOW; + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + brelse(iloc->bh); + return error; + } + + ext4_write_lock_xattr(inode, &no_expand); + + BUFFER_TRACE(iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, iloc->bh); + if (error) { + brelse(iloc->bh); + goto out_stop; + } + + error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, + handle, &no_expand); + + rc = ext4_mark_iloc_dirty(handle, inode, iloc); + if (!error) + error = rc; + + ext4_write_unlock_xattr(inode, &no_expand); 
+out_stop: + ext4_journal_stop(handle); + return error; } /* @@ -5679,44 +5818,18 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - static unsigned int mnt_count; - int err, ret; + int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { - /* - * In nojournal mode, we can immediately attempt to expand - * the inode. When journaled, we first need to obtain extra - * buffer credits since we may write into the EA block - * with this same handle. If journal_extend fails, then it will - * only result in a minor loss of functionality for that inode. - * If this is felt to be critical, then e2fsck should be run to - * force a large enough s_min_extra_isize. - */ - if (!ext4_handle_valid(handle) || - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { - ret = ext4_expand_extra_isize(inode, - sbi->s_want_extra_isize, - iloc, handle); - if (ret) { - if (mnt_count != - le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, - "Unable to expand inode %lu. 
Delete" - " some EAs or run e2fsck.", - inode->i_ino); - mnt_count = - le16_to_cpu(sbi->s_es->s_mnt_count); - } - } - } - } + + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) + ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, + iloc, handle); + return ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0c21e22acd74..75d83471f65c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ioctl.c * @@ -64,18 +65,16 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); - memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); - memswap(&inode1->i_version, &inode2->i_version, - sizeof(inode1->i_version)); - memswap(&inode1->i_blocks, &inode2->i_blocks, - sizeof(inode1->i_blocks)); - memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); - memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); - memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + swap(inode1->i_flags, inode2->i_flags); + swap(inode1->i_version, inode2->i_version); + swap(inode1->i_blocks, inode2->i_blocks); + swap(inode1->i_bytes, inode2->i_bytes); + swap(inode1->i_atime, inode2->i_atime); + swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); - memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + swap(ei1->i_flags, ei2->i_flags); + swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); @@ -218,7 +217,7 @@ static int ext4_ioctl_setflags(struct inode *inode, unsigned int jflag; /* Is it quota file? 
Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; @@ -342,7 +341,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) err = -EPERM; inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto out_unlock; err = ext4_get_inode_loc(inode, &iloc); @@ -351,11 +350,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { - err = -EOVERFLOW; + err = ext4_expand_extra_isize(inode, + EXT4_SB(sb)->s_want_extra_isize, + &iloc); + if (err) + goto out_unlock; + } else { brelse(iloc.bh); - goto out_unlock; } - brelse(iloc.bh); dquot_initialize(inode); @@ -373,7 +375,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { + + /* __dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. 
+ */ + down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); + up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b7928cddd539..701085620cd8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -367,8 +367,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -2297,6 +2295,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) int err, buddy_loaded = 0; struct ext4_buddy e4b; struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, + sb->s_blocksize_bits, + EXT4_MAX_BLOCK_LOG_SIZE); struct sg { struct ext4_group_info info; ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; @@ -2308,8 +2309,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); + grinfo = ext4_get_group_info(sb, group); /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { @@ -2329,7 +2331,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) - seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
sg.info.bb_counters[i] : 0); seq_printf(seq, " ]\n"); @@ -2639,6 +2641,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; + INIT_LIST_HEAD(&sbi->s_freed_data_list); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -2782,7 +2785,8 @@ int ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count, + struct bio **biop) { ext4_fsblk_t discard_block; @@ -2791,18 +2795,18 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + if (biop) { + return __blkdev_issue_discard(sb->s_bdev, + (sector_t)discard_block << (sb->s_blocksize_bits - 9), + (sector_t)count << (sb->s_blocksize_bits - 9), + GFP_NOFS, 0, biop); + } else + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } -/* - * This function is called by the jbd2 layer once the commit has finished, - * so we know we can free the blocks that were released with that commit. 
- */ -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc) +static void ext4_free_data_in_buddy(struct super_block *sb, + struct ext4_free_data *entry) { - struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; @@ -2810,18 +2814,6 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } - err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); @@ -2862,6 +2854,58 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. 
+ */ +void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_free_data *entry, *tmp; + struct bio *discard_bio = NULL; + struct list_head freed_data_list; + struct list_head *cut_pos = NULL; + int err; + + INIT_LIST_HEAD(&freed_data_list); + + spin_lock(&sbi->s_md_lock); + list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { + if (entry->efd_tid != commit_tid) + break; + cut_pos = &entry->efd_list; + } + if (cut_pos) + list_cut_position(&freed_data_list, &sbi->s_freed_data_list, + cut_pos); + spin_unlock(&sbi->s_md_lock); + + if (test_opt(sb, DISCARD)) { + list_for_each_entry(entry, &freed_data_list, efd_list) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, + &discard_bio); + if (err && err != -EOPNOTSUPP) { + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } else if (err == -EOPNOTSUPP) + break; + } + + if (discard_bio) { + submit_bio_wait(discard_bio); + bio_put(discard_bio); + } + } + + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + ext4_free_data_in_buddy(sb, entry); +} + int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -3529,7 +3573,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "prellocated %u for group %u\n", preallocated, group); + mb_debug(1, "preallocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -4464,7 +4508,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); /* Allow to use superuser reservation for quota file */ - if (IS_NOQUOTA(ar->inode)) + if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & 
EXT4_MB_DELALLOC_RESERVED) == 0) { @@ -4583,14 +4627,28 @@ out: * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ -static int can_merge(struct ext4_free_data *entry1, - struct ext4_free_data *entry2) +static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, + struct ext4_free_data *entry, + struct ext4_free_data *new_entry, + struct rb_root *entry_rb_root) { - if ((entry1->efd_tid == entry2->efd_tid) && - (entry1->efd_group == entry2->efd_group) && - ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) - return 1; - return 0; + if ((entry->efd_tid != new_entry->efd_tid) || + (entry->efd_group != new_entry->efd_group)) + return; + if (entry->efd_start_cluster + entry->efd_count == + new_entry->efd_start_cluster) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; + } else if (new_entry->efd_start_cluster + new_entry->efd_count == + entry->efd_start_cluster) { + new_entry->efd_count += entry->efd_count; + } else + return; + spin_lock(&sbi->s_md_lock); + list_del(&entry->efd_list); + spin_unlock(&sbi->s_md_lock); + rb_erase(&entry->efd_node, entry_rb_root); + kmem_cache_free(ext4_free_data_cachep, entry); } static noinline_for_stack int @@ -4646,29 +4704,19 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry) && - ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry) 
&& - ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } - /* Add the extent to transaction's private list */ - new_entry->efd_jce.jce_func = ext4_free_data_callback; + spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, &new_entry->efd_jce); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); return 0; @@ -4871,7 +4919,8 @@ do_more: * them with group lock_held */ if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count); + err = ext4_issue_discard(sb, block_group, bit, count, + NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%lu failed" @@ -5094,7 +5143,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count, NULL); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 2bed62084a8c..dcf52540f379 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/ext4/mballoc.h * @@ -78,10 +79,8 @@ do { \ struct ext4_free_data { - /* MUST be the first member */ - struct ext4_journal_cb_entry efd_jce; - - /* ext4_free_data private data starts from here */ + /* this links the free block information from sb_info */ + struct list_head efd_list; /* this links the free block information from group_info */ struct rb_node efd_node; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 364ea4d4a943..cf5181b62df1 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int 
ext4_ext_migrate(struct inode *inode) owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), - S_IFREG, NULL, goal, owner); + S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index eb9835638680..27b9a76a0dfa 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/random.h> #include <linux/buffer_head.h> @@ -185,7 +186,7 @@ static int kmmpd(void *data) goto exit_thread; } - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { ext4_warning(sb, "kmmpd being stopped since filesystem " "has been remounted as readonly."); goto exit_thread; @@ -367,7 +368,7 @@ skip: goto failed; } - mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL); if (!mmpd_data) { ext4_warning(sb, "not enough memory for mmpd_data"); goto failed; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c992ef2c2f94..9bb36909ec92 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -484,7 +484,7 @@ mext_check_arguments(struct inode *orig_inode, return -EBUSY; } - if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) { + if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " "not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 404256caf9cf..bd48a8d83961 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/namei.c * @@ -513,7 +514,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { - return le32_to_cpu(entry->block) & 0x00ffffff; + return 
le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) @@ -739,6 +740,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; @@ -768,9 +770,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } indirect = root->info.indirect_levels; - if (indirect > 1) { - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", - root->info.indirect_levels); + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" + "supported value", dir->i_ino, + ext4_dir_htree_level(dir->i_sb)); + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { + ext4_warning(dir->i_sb, "Enable large directory " + "feature to access it"); + } goto fail; } @@ -859,12 +867,19 @@ fail: static void dx_release(struct dx_frame *frames) { + struct dx_root_info *info; + int i; + if (frames[0].bh == NULL) return; - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); + info = &((struct dx_root *)frames[0].bh->b_data)->info; + for (i = 0; i <= info->indirect_levels; i++) { + if (frames[i].bh == NULL) + break; + brelse(frames[i].bh); + frames[i].bh = NULL; + } } /* @@ -1050,7 +1065,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; @@ -1328,13 +1343,12 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; - ext4_lblk_t start, block, b; + 
ext4_lblk_t start, block; const u8 *name = d_name->name; - int ra_max = 0; /* Number of bh's in the readahead + size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ - int ra_ptr = 0; /* Current index into readahead + size_t ra_ptr = 0; /* Current index into readahead buffer */ - int num = 0; ext4_lblk_t nblocks; int i, namelen, retval; struct ext4_filename fname; @@ -1397,42 +1411,28 @@ restart: if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; - b = block; - for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { - /* - * Terminate if we reach the end of the - * directory and must wrap, or if our - * search has finished at this block. - */ - if (b >= nblocks || (num && block == start)) { - bh_use[ra_max] = NULL; - break; - } - num++; - bh = ext4_getblk(NULL, dir, b++, 0); - if (IS_ERR(bh)) { - if (ra_max == 0) { - ret = bh; - goto cleanup_and_exit; - } - break; - } - bh_use[ra_max] = bh; - if (bh) - ll_rw_block(REQ_OP_READ, - REQ_META | REQ_PRIO, - 1, &bh); + if (block < start) + ra_max = start - block; + else + ra_max = nblocks - block; + ra_max = min(ra_max, ARRAY_SIZE(bh_use)); + retval = ext4_bread_batch(dir, block, ra_max, + false /* wait */, bh_use); + if (retval) { + ret = ERR_PTR(retval); + ra_max = 0; + goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); - goto next; + ret = ERR_PTR(-EIO); + goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, @@ -1442,7 +1442,8 @@ restart: EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); - goto next; + ret = ERR_PTR(-EFSBADCRC); + goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, &fname, @@ -1485,7 +1486,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode 
*dir, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1889,7 +1890,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); - dir->i_version++; + inode_inc_iversion(dir); ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirent_node(handle, dir, bh); @@ -1908,7 +1909,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, { struct buffer_head *bh2; struct dx_root *root; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; struct ext4_dir_entry_tail *t; @@ -2127,13 +2128,16 @@ out: static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; + int restart; int err; +again: + restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); @@ -2155,24 +2159,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, if (err != -ENOSPC) goto cleanup; + err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? 
*/ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; + int levels = frame - frames + 1; + unsigned int icount; + int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext4_warning_inode(dir, "Directory index full!"); + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { + add_level = 0; + break; + } + frame--; /* split higher index block */ + at = frame->at; + entries = frame->entries; + restart = 1; + } + if (add_level && levels == ext4_dir_htree_level(sb)) { + ext4_warning(sb, "Directory (ino: %lu) index full, " + "reach max htree level :%d", + dir->i_ino, levels); + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { + ext4_warning(sb, "Large directory feature is " + "not enabled on this " + "filesystem"); + } err = -ENOSPC; goto cleanup; } + icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); @@ -2187,7 +2211,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, err = ext4_journal_get_write_access(handle, frame->bh); if (err) goto journal_error; - if (levels) { + if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", @@ -2195,7 +2219,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, - frames[0].bh); + (frame - 1)->bh); if (err) goto journal_error; @@ -2211,17 +2235,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block(frames + 0, hash2, 
newblock); - dxtrace(dx_show_index("node", frames[1].entries)); + dx_insert_block((frame - 1), hash2, newblock); + dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); + err = ext4_handle_dirty_dx_node(handle, dir, + (frame - 1)->bh); + if (err) + goto journal_error; + if (restart) { + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + goto journal_error; + } } else { - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); + struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -2229,22 +2261,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext4_journal_get_write_access(handle, - frame->bh); + dxroot = (struct dx_root *)frames[0].bh->b_data; + dxroot->info.indirect_levels += 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + info->indirect_levels)); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; - } - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); - if (err) { - ext4_std_error(inode->i_sb, err); - goto cleanup; + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + brelse(bh2); + restart = 1; + goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); @@ -2256,10 +2284,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, goto cleanup; journal_error: - ext4_std_error(dir->i_sb, err); + ext4_std_error(dir->i_sb, err); /* this 
is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ + if (restart && err == 0) + goto again; return err; } @@ -2296,7 +2329,7 @@ int ext4_generic_delete_entry(handle_t *handle, blocksize); else de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -2348,19 +2381,22 @@ out: } /* - * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, - * since this indicates that nlinks count was previously 1. + * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2 + * since this indicates that nlinks count was previously 1 to avoid overflowing + * the 16-bit i_links_count field on disk. Directories with i_nlink == 1 mean + * that subdirectory link counts are not being maintained accurately. + * + * The caller has already checked for i_nlink overflow in case the DIR_LINK + * feature is not enabled and returned -EMLINK. The is_dx() check is a proxy + * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. 
*/ static void ext4_inc_count(handle_t *handle, struct inode *inode) { inc_nlink(inode); - if (is_dx(inode) && inode->i_nlink > 1) { - /* limit is 16-bit i_links_count */ - if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - set_nlink(inode, 1); - ext4_set_feature_dir_nlink(inode->i_sb); - } - } + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); } /* diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1a82138ba739..db7590178dfc 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/page-io.c * @@ -85,7 +86,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); mapping_set_error(page->mapping, -EIO); } @@ -104,7 +105,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_error) + if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -300,27 +301,28 @@ static void ext4_end_bio(struct bio *bio) char b[BDEVNAME_SIZE]; if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", - bdevname(bio->bi_bdev, b), + bio_devname(bio, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), - bio->bi_error)) { + bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; - if (bio->bi_error) { + if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - bio->bi_error, inode->i_ino, + bio->bi_status, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, bio->bi_error); + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); } if (io_end->flag & 
EXT4_IO_END_UNWRITTEN) { @@ -349,6 +351,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0; + io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -373,7 +376,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, return -ENOMEM; wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; @@ -396,6 +399,7 @@ submit_and_retry: ret = io_submit_init_bio(io, bh); if (ret) return ret; + io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a81b829d56de..9ffa6fad18db 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * @@ -73,7 +74,7 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -83,7 +84,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -254,7 +255,7 @@ int ext4_mpage_readpages(struct address_space *mapping, fscrypt_release_ctx(ctx); goto set_error_page; } - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c3ed9021b781..1dac59c24792 100644 --- a/fs/ext4/resize.c +++ 
b/fs/ext4/resize.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/resize.c * @@ -1927,7 +1928,8 @@ retry: n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); - n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_blocks_count = (ext4_fsblk_t)n_group * + EXT4_BLOCKS_PER_GROUP(sb); n_group--; /* set to last group number */ } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d37c81f327e7..b0915b734a38 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -373,6 +373,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); + + ext4_process_freed_data(sb, txn->t_tid); + spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, @@ -402,7 +405,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) static void ext4_handle_error(struct super_block *sb) { - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; if (!test_opt(sb, ERRORS_CONT)) { @@ -584,8 +587,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. 
*/ - if (errno == -EROFS && journal_current_handle() == NULL && - (sb->s_flags & MS_RDONLY)) + if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { @@ -625,7 +627,7 @@ void __ext4_abort(struct super_block *sb, const char *function, sb->s_id, function, line, &vaf); va_end(args); - if ((sb->s_flags & MS_RDONLY) == 0) { + if (sb_rdonly(sb) == 0) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; /* @@ -886,11 +888,11 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!(sb->s_flags & MS_RDONLY) && !aborted) { + if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) ext4_commit_super(sb, 1); for (i = 0; i < sbi->s_gdb_count; i++) @@ -927,9 +929,13 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); @@ -944,6 +950,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev); kfree(sbi); } @@ -971,8 +978,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_reserved_meta_blocks = 0; - ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; 
spin_lock_init(&(ei->i_block_reservation_lock)); @@ -1143,7 +1148,16 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; - int res, res2, retries = 0; + int res, res2, credits, retries = 0; + + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. + */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; res = ext4_convert_inline_data(inode); if (res) @@ -1178,8 +1192,12 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (res) return res; retry: - handle = ext4_journal_start(inode, EXT4_HT_MISC, - ext4_jbd2_credits_xattr(inode)); + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); + if (res) + return res; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1203,7 +1221,7 @@ retry: return res; } -static int ext4_dummy_context(struct inode *inode) +static bool ext4_dummy_context(struct inode *inode) { return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); } @@ -1256,16 +1274,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode) } static const struct dquot_operations ext4_quota_operations = { - .get_reserved_space = ext4_get_reserved_space, - .write_dquot = ext4_write_dquot, - .acquire_dquot = ext4_acquire_dquot, - .release_dquot = ext4_release_dquot, - .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, - .get_projid = ext4_get_projid, - .get_next_id = ext4_get_next_id, + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = 
ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, + .get_inode_usage = ext4_get_inode_usage, + .get_next_id = ext4_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -1328,7 +1347,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, - Opt_max_dir_size_kb, Opt_nojournal_checksum, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, }; static const match_table_t tokens = { @@ -1411,6 +1430,8 @@ static const match_table_t tokens = { {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1618,6 +1639,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -1655,7 +1677,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; return 1; case Opt_i_version: - sb->s_flags |= MS_I_VERSION; + sb->s_flags |= SB_I_VERSION; return 1; case Opt_lazytime: sb->s_flags |= MS_LAZYTIME; @@ -2038,7 +2060,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); - if (sb->s_flags & MS_I_VERSION) + if (sb->s_flags & SB_I_VERSION) SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) 
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); @@ -2077,7 +2099,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset) struct super_block *sb = seq->private; int rc; - seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); + seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_puts(seq, "\n"); return rc; @@ -2345,7 +2367,7 @@ static int ext4_check_descriptors(struct super_block *sb, "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } @@ -2382,6 +2404,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, unsigned int s_flags = sb->s_flags; int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA + int quota_update = 0; int i; #endif if (!es->s_last_orphan) { @@ -2420,14 +2443,32 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. 
+ */ + if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) + + if (!ret) + quota_update = 1; + else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " - "quota: error %d", ret); + "quota: type %d: error %d", i, ret); } } #endif @@ -2488,10 +2529,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -3092,8 +3135,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (first_not_zeroed == ngroups || - (sb->s_flags & MS_RDONLY) || + if (first_not_zeroed == ngroups || sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) goto out; @@ -3355,6 +3397,7 @@ static void ext4_set_resv_clusters(struct super_block *sb) static int ext4_fill_super(struct super_block *sb, void *data, int silent) { + struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; @@ -3380,6 +3423,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if ((data && !orig_data) || !sbi) goto out_free_base; + sbi->s_daxdev = dax_dev; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) @@ -3445,7 +3489,8 @@ 
static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (ext4_has_feature_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) { sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); @@ -3467,7 +3512,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb)) + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3597,6 +3642,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "The Hurd can't support 64-bit file systems"); goto failed_mount; } + + /* + * ea_inode feature uses l_i_version field which is not + * available in HURD_COMPAT mode. + */ + if (ext4_has_feature_ea_inode(sb)) { + ext4_msg(sb, KERN_ERR, + "ea_inode feature is not supported for Hurd"); + goto failed_mount; + } } if (IS_EXT2_SB(sb)) { @@ -3626,7 +3681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. 
*/ - if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) + if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) goto failed_mount; blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); @@ -3755,12 +3810,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif @@ -3950,7 +4005,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -3960,7 +4015,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_journal_needs_recovery(sb)); - if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY)) + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) goto failed_mount3a; @@ -3972,7 +4027,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = ext4_load_journal(sb, es, journal_devnum); if (err) goto failed_mount3a; - } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); @@ -4061,10 +4116,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - sbi->s_mb_cache = ext4_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); - goto failed_mount_wq; + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_block_cache"); + goto failed_mount_wq; + } + + if (ext4_has_feature_ea_inode(sb)) { + sbi->s_ea_inode_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_inode_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_inode_cache"); + goto failed_mount_wq; + } + } } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && @@ -4074,7 +4141,7 @@ no_journal: goto failed_mount_wq; } - if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && !ext4_has_feature_encrypt(sb)) { ext4_set_feature_encrypt(sb); ext4_commit_super(sb, 1); @@ -4128,7 +4195,7 @@ no_journal: goto failed_mount4; } - if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + if (ext4_setup_super(sb, es, sb_rdonly(sb))) sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ @@ -4216,7 +4283,7 @@ no_journal: #ifdef CONFIG_QUOTA /* Enable quota usage during mount. 
*/ - if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; @@ -4296,9 +4363,13 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -4329,6 +4400,7 @@ out_fail: out_free_base: kfree(sbi); kfree(orig_data); + fs_put_dax(dax_dev); return err ? err : ret; } @@ -4535,7 +4607,7 @@ static int ext4_load_journal(struct super_block *sb, * can get read-write access to the device. */ if (ext4_has_feature_journal_needs_recovery(sb)) { - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { @@ -4690,8 +4762,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - if (ext4_has_feature_journal_needs_recovery(sb) && - sb->s_flags & MS_RDONLY) { + if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { ext4_clear_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); } @@ -4747,7 +4818,7 @@ int ext4_force_commit(struct super_block *sb) { journal_t *journal; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; journal = EXT4_SB(sb)->s_journal; @@ -4812,7 +4883,7 @@ static int ext4_freeze(struct super_block *sb) int error = 0; journal_t *journal; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; journal = EXT4_SB(sb)->s_journal; @@ -4847,7 +4918,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - 
if ((sb->s_flags & MS_RDONLY) || ext4_forced_shutdown(EXT4_SB(sb))) + if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) return 0; if (EXT4_SB(sb)->s_journal) { @@ -4957,6 +5028,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { + ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); + err = -EINVAL; + goto restore_opts; + } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); @@ -4979,7 +5056,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (*flags & MS_LAZYTIME) sb->s_flags |= MS_LAZYTIME; - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; @@ -5074,7 +5151,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * Reinitialize lazy itable initialization thread based on * current settings */ - if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; @@ -5139,7 +5216,7 @@ static int ext4_statfs_project(struct super_block *sb, dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); limit = (dquot->dq_dqb.dqb_bsoftlimit ? 
dquot->dq_dqb.dqb_bsoftlimit : @@ -5162,7 +5239,7 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } @@ -5208,18 +5285,13 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/* Helper function for writing quotas on sync - we need to start transaction - * before quota file is locked for write. Otherwise the are possible deadlocks: - * Process 1 Process 2 - * ext4_create() quota_sync() - * jbd2_journal_start() write_dquot() - * dquot_initialize() down(dqio_mutex) - * down(dqio_mutex) jbd2_journal_start() - * - */ #ifdef CONFIG_QUOTA +/* + * Helper functions so that transaction is started before we acquire dqio_sem + * to keep correct lock ordering of transaction > dqio_sem + */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; @@ -5354,6 +5426,13 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); + sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; + } else { + /* + * Clear the flag just in case mount options changed since + * last time. + */ + sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } /* @@ -5450,13 +5529,16 @@ static int ext4_enable_quotas(struct super_block *sb) test_opt(sb, PRJQUOTA), }; - sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). 
Please run " @@ -5646,7 +5728,7 @@ static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; @@ -5677,7 +5759,7 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; if (!ext4_has_feature_journal(sb)) return 0; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 5c8fc53cb0e5..a2006c9af1d9 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/symlink.c * diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d74dc5f81a04..e21afd52e7d7 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/sysfs.c * @@ -100,7 +101,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (!ret || val >= clusters) + if (ret || val >= clusters) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index c70d06a383e2..b64a9fa0ff41 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/ext4/truncate.h * diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5d3c2536641c..218a7ba57819 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr.c * @@ -72,12 +73,14 @@ # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); -static struct buffer_head *ext4_xattr_cache_find(struct inode *, - struct ext4_xattr_header *, - struct mb_cache_entry **); -static void ext4_xattr_rehash(struct ext4_xattr_header *, - struct ext4_xattr_entry *); +static void ext4_xattr_block_cache_insert(struct mb_cache *, + struct buffer_head *); +static struct buffer_head * +ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, + struct mb_cache_entry **); +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count); +static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, @@ -104,8 +107,22 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; -#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ - inode->i_sb->s_fs_info)->s_mb_cache) +#define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_block_cache) + +#define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_inode_cache) + +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode); + +#ifdef CONFIG_LOCKDEP +void ext4_xattr_inode_set_class(struct inode *ea_inode) +{ + lockdep_set_subclass(&ea_inode->i_rwsem, 1); +} +#endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, @@ -177,9 +194,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, /* Check the values */ while (!IS_LAST_ENTRY(entry)) { - if (entry->e_value_block != 0) - return -EFSCORRUPTED; - if (entry->e_value_size != 0) { + if (entry->e_value_size != 0 && + entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); u32 size = le32_to_cpu(entry->e_value_size); void *value; @@ -269,6 +285,208 @@ 
ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, return cmp ? -ENODATA : 0; } +static u32 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) +{ + return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); +} + +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) +{ + return ((u64)ea_inode->i_ctime.tv_sec << 32) | + ((u32)ea_inode->i_version); +} + +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) +{ + ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + ea_inode->i_version = (u32)ref_count; +} + +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) +{ + return (u32)ea_inode->i_atime.tv_sec; +} + +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) +{ + ea_inode->i_atime.tv_sec = hash; +} + +/* + * Read the EA value from an inode. + */ +static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) +{ + int blocksize = 1 << ea_inode->i_blkbits; + int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; + int tail_size = (size % blocksize) ?: blocksize; + struct buffer_head *bhs_inline[8]; + struct buffer_head **bhs = bhs_inline; + int i, ret; + + if (bh_count > ARRAY_SIZE(bhs_inline)) { + bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); + if (!bhs) + return -ENOMEM; + } + + ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, + true /* wait */, bhs); + if (ret) + goto free_bhs; + + for (i = 0; i < bh_count; i++) { + /* There shouldn't be any holes in ea_inode. */ + if (!bhs[i]) { + ret = -EFSCORRUPTED; + goto put_bhs; + } + memcpy((char *)buf + blocksize * i, bhs[i]->b_data, + i < bh_count - 1 ? 
blocksize : tail_size); + } + ret = 0; +put_bhs: + for (i = 0; i < bh_count; i++) + brelse(bhs[i]); +free_bhs: + if (bhs != bhs_inline) + kfree(bhs); + return ret; +} + +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) + +static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + u32 ea_inode_hash, struct inode **ea_inode) +{ + struct inode *inode; + int err; + + inode = ext4_iget(parent->i_sb, ea_ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(parent->i_sb, + "error while reading EA inode %lu err=%d", ea_ino, + err); + return err; + } + + if (is_bad_inode(inode)) { + ext4_error(parent->i_sb, + "error while reading EA inode %lu is_bad_inode", + ea_ino); + err = -EIO; + goto error; + } + + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + ext4_error(parent->i_sb, + "EA inode %lu does not have EXT4_EA_INODE_FL flag", + ea_ino); + err = -EINVAL; + goto error; + } + + ext4_xattr_inode_set_class(inode); + + /* + * Check whether this is an old Lustre-style xattr inode. Lustre + * implementation does not have hash validation, rather it has a + * backpointer from ea_inode to the parent inode. + */ + if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && + EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && + inode->i_generation == parent->i_generation) { + ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); + ext4_xattr_inode_set_ref(inode, 1); + } else { + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + + *ea_inode = inode; + return 0; +error: + iput(inode); + return err; +} + +static int +ext4_xattr_inode_verify_hashes(struct inode *ea_inode, + struct ext4_xattr_entry *entry, void *buffer, + size_t size) +{ + u32 hash; + + /* Verify stored hash matches calculated hash. 
*/ + hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); + if (hash != ext4_xattr_inode_get_hash(ea_inode)) + return -EFSCORRUPTED; + + if (entry) { + __le32 e_hash, tmp_data; + + /* Verify entry hash. */ + tmp_data = cpu_to_le32(hash); + e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, + &tmp_data, 1); + if (e_hash != entry->e_hash) + return -EFSCORRUPTED; + } + return 0; +} + +/* + * Read xattr value from the EA inode. + */ +static int +ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, + void *buffer, size_t size) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + struct inode *ea_inode; + int err; + + err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), + le32_to_cpu(entry->e_hash), &ea_inode); + if (err) { + ea_inode = NULL; + goto out; + } + + if (i_size_read(ea_inode) != size) { + ext4_warning_inode(ea_inode, + "ea_inode file size=%llu entry size=%zu", + i_size_read(ea_inode), size); + err = -EFSCORRUPTED; + goto out; + } + + err = ext4_xattr_inode_read(ea_inode, buffer, size); + if (err) + goto out; + + if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, + size); + if (err) { + ext4_warning_inode(ea_inode, + "EA inode hash validation failed"); + goto out; + } + + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); + } +out: + iput(ea_inode); + return err; +} + static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) @@ -277,7 +495,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, 
buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -298,7 +516,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, 1); if (error) @@ -308,8 +526,15 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + memcpy(buffer, bh->b_data + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -350,8 +575,15 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -428,7 +660,6 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -450,7 +681,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -539,15 +770,456 @@ static void 
ext4_xattr_update_super_block(handle_t *handle, } } +int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) +{ + struct ext4_iloc iloc = { .bh = NULL }; + struct buffer_head *bh = NULL; + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + qsize_t ea_inode_refs = 0; + void *end; + int ret; + + lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); + + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + ret = xattr_check_inode(inode, header, end); + if (ret) + goto out; + + for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + ret = -EIO; + goto out; + } + + if (ext4_xattr_check_block(inode, bh)) { + ret = -EFSCORRUPTED; + goto out; + } + + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + *usage = ea_inode_refs + 1; + ret = 0; +out: + brelse(iloc.bh); + brelse(bh); + return ret; +} + +static inline size_t round_up_cluster(struct inode *inode, size_t length) +{ + struct super_block *sb = inode->i_sb; + size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + + inode->i_blkbits); + size_t mask = ~(cluster_size - 1); + + return (length + cluster_size - 1) & mask; +} + +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) +{ + int err; + + err = dquot_alloc_inode(inode); + if (err) + return err; + err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); + if (err) + dquot_free_inode(inode); + return err; +} + +static void ext4_xattr_inode_free_quota(struct inode *parent, + struct inode *ea_inode, + size_t 
len) +{ + if (ea_inode && + ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) + return; + dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); + dquot_free_inode(parent); +} + +int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create) +{ + int credits; + int blocks; + + /* + * 1) Owner inode update + * 2) Ref count update on old xattr block + * 3) new xattr block + * 4) block bitmap update for new xattr block + * 5) group descriptor for new xattr block + * 6) block bitmap update for old xattr block + * 7) group descriptor for old block + * + * 6 & 7 can happen if we have two racing threads T_a and T_b + * which are each trying to set an xattr on inodes I_a and I_b + * which were both initially sharing an xattr block. + */ + credits = 7; + + /* Quota updates. */ + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (inode && ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + /* We are done if ea_inode feature is not enabled. */ + if (!ext4_has_feature_ea_inode(sb)) + return credits; + + /* New ea_inode, inode map, block bitmap, group descriptor. */ + credits += 4; + + /* Data blocks. */ + blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree. */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + + /* Blocks themselves. */ + credits += blocks; + + if (!is_create) { + /* Dereference ea_inode holding old xattr value. + * Old ea_inode, inode map, block bitmap, group descriptor. + */ + credits += 4; + + /* Data blocks for old ea_inode. */ + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree for old + * ea_inode. 
+ */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + } + + /* We may need to clone the existing xattr block in which case we need + * to increment ref counts for existing ea_inodes referenced by it. + */ + if (block_bh) { + struct ext4_xattr_entry *entry = BFIRST(block_bh); + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + /* Ref count update on ea_inode. */ + credits += 1; + } + return credits; +} + +static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, + int credits, struct buffer_head *bh, + bool dirty, bool block_csum) +{ + int error; + + if (!ext4_handle_valid(handle)) + return 0; + + if (handle->h_buffer_credits >= credits) + return 0; + + error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); + if (!error) + return 0; + if (error < 0) { + ext4_warning(inode->i_sb, "Extend journal (error %d)", error); + return error; + } + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + + error = ext4_journal_restart(handle, credits); + if (error) { + ext4_warning(inode->i_sb, "Restart journal (error %d)", error); + return error; + } + + if (bh) { + error = ext4_journal_get_write_access(handle, bh); + if (error) { + ext4_warning(inode->i_sb, + "Get write access failed (error %d)", + error); + return error; + } + } + return 0; +} + +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + int ref_change) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); + struct ext4_iloc iloc; + s64 ref_count; + u32 hash; + int ret; + + inode_lock(ea_inode); + + ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); + if (ret) { + iloc.bh = NULL; + goto out; + } + + ref_count = ext4_xattr_inode_get_ref(ea_inode); 
+ ref_count += ref_change; + ext4_xattr_inode_set_ref(ea_inode, ref_count); + + if (ref_change > 0) { + WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 1) { + WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + set_nlink(ea_inode, 1); + ext4_orphan_del(handle, ea_inode); + + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_create(ea_inode_cache, + GFP_NOFS, hash, + ea_inode->i_ino, + true /* reusable */); + } + } + } else { + WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 0) { + WARN_ONCE(ea_inode->i_nlink != 1, + "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + clear_nlink(ea_inode); + ext4_orphan_add(handle, ea_inode); + + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_delete(ea_inode_cache, hash, + ea_inode->i_ino); + } + } + } + + ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); + iloc.bh = NULL; + if (ret) + ext4_warning_inode(ea_inode, + "ext4_mark_iloc_dirty() failed ret=%d", ret); +out: + brelse(iloc.bh); + inode_unlock(ea_inode); + return ret; +} + +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, 1); +} + +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, -1); +} + +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, + struct ext4_xattr_entry *first) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *failed_entry; + unsigned int ea_ino; + int err, saved_err; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = 
ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); + if (err) + goto cleanup; + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "inc ref error %d", err); + iput(ea_inode); + goto cleanup; + } + iput(ea_inode); + } + return 0; + +cleanup: + saved_err = err; + failed_entry = entry; + + for (entry = first; entry != failed_entry; + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); + if (err) { + ext4_warning(parent->i_sb, + "cleanup ea_ino %u iget error %d", ea_ino, + err); + continue; + } + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", + err); + iput(ea_inode); + } + return saved_err; +} + +static void +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, + struct buffer_head *bh, + struct ext4_xattr_entry *first, bool block_csum, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits, bool skip_quota) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + bool dirty = false; + unsigned int ea_ino; + int err; + int credits; + + /* One credit for dec ref on ea_inode, one for orphan list addition, */ + credits = 2 + extra_credits; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); + if (err) + continue; + + err = ext4_expand_inode_array(ea_inode_array, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, + "Expand inode array err=%d", err); + iput(ea_inode); + continue; + } + + err = ext4_xattr_ensure_credits(handle, parent, credits, bh, + dirty, block_csum); + if (err) { + ext4_warning_inode(ea_inode, "Ensure credits 
err=%d", + err); + continue; + } + + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", + err); + continue; + } + + if (!skip_quota) + ext4_xattr_inode_free_quota(parent, ea_inode, + le32_to_cpu(entry->e_value_size)); + + /* + * Forget about ea_inode within the same transaction that + * decrements the ref count. This avoids duplicate decrements in + * case the rest of the work spills over to subsequent + * transactions. + */ + entry->e_value_inum = 0; + entry->e_value_size = 0; + + dirty = true; + } + + if (dirty) { + /* + * Note that we are deliberately skipping csum calculation for + * the final update because we do not expect any journal + * restarts until xattr block is freed. + */ + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + ext4_warning_inode(parent, + "handle dirty metadata err=%d", err); + } +} + /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. 
*/ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, - struct buffer_head *bh) + struct buffer_head *bh, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; @@ -565,9 +1237,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); + + if (ext4_has_feature_ea_inode(inode->i_sb)) + ext4_xattr_inode_dec_ref_all(handle, inode, bh, + BFIRST(bh), + true /* block_csum */, + ea_inode_array, + extra_credits, + true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -577,11 +1259,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; - ce = mb_cache_entry_get(ext4_mb_cache, hash, - bh->b_blocknr); - if (ce) { - ce->e_reusable = 1; - mb_cache_entry_put(ext4_mb_cache, ce); + if (ea_block_cache) { + ce = mb_cache_entry_get(ea_block_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ea_block_cache, ce); + } } } @@ -620,7 +1304,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -631,113 +1315,457 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); 
} -static int -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +/* + * Write the value of the EA in an inode: copy the bufsize bytes at buf into ea_inode one block at a time and set its size to the number of bytes written. Returns 0 or a negative errno (-EFSCORRUPTED when ext4_getblk() hands back no buffer for a block). + */ +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, + const void *buf, int bufsize) +{ + struct buffer_head *bh = NULL; + unsigned long block = 0; + int blocksize = ea_inode->i_sb->s_blocksize; + int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; + int csize, wsize = 0; + int ret = 0; + int retries = 0; + +retry: + while (ret >= 0 && ret < max_blocks) { + struct ext4_map_blocks map; + map.m_lblk = block += ret; + map.m_len = max_blocks -= ret; + + ret = ext4_map_blocks(handle, ea_inode, &map, + EXT4_GET_BLOCKS_CREATE); + if (ret <= 0) { + ext4_mark_inode_dirty(handle, ea_inode); + if (ret == -ENOSPC && + ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + break; + } + } + + if (ret < 0) + return ret; + + block = 0; + while (wsize < bufsize) { + if (bh != NULL) + brelse(bh); + csize = (bufsize - wsize) > blocksize ? blocksize : + bufsize - wsize; + bh = ext4_getblk(handle, ea_inode, block, 0); + if (!bh || IS_ERR(bh)) /* a NULL bh would be dereferenced by the memcpy() below */ + return bh ? PTR_ERR(bh) : -EFSCORRUPTED; + ret = ext4_journal_get_write_access(handle, bh); + if (ret) + goto out; + + memcpy(bh->b_data, buf, csize); + set_buffer_uptodate(bh); + ext4_handle_dirty_metadata(handle, ea_inode, bh); + + buf += csize; + wsize += csize; + block += 1; + } + + inode_lock(ea_inode); + i_size_write(ea_inode, wsize); + ext4_update_i_disksize(ea_inode, wsize); + inode_unlock(ea_inode); + + ext4_mark_inode_dirty(handle, ea_inode); + +out: + brelse(bh); + + return ret; +} + +/* + * Create an inode to store the value of a large EA. 
+ */ +static struct inode *ext4_xattr_inode_create(handle_t *handle, + struct inode *inode, u32 hash) +{ + struct inode *ea_inode = NULL; + uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; + int err; + + /* + * Let the next inode be the goal, so we try and allocate the EA inode + * in the same group, or nearby one. + */ + ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG | 0600, NULL, inode->i_ino + 1, owner, + EXT4_EA_INODE_FL); + if (!IS_ERR(ea_inode)) { + ea_inode->i_op = &ext4_file_inode_operations; + ea_inode->i_fop = &ext4_file_operations; + ext4_set_aops(ea_inode); + ext4_xattr_inode_set_class(ea_inode); + unlock_new_inode(ea_inode); + ext4_xattr_inode_set_ref(ea_inode, 1); + ext4_xattr_inode_set_hash(ea_inode, hash); + err = ext4_mark_inode_dirty(handle, ea_inode); + if (!err) + err = ext4_inode_attach_jinode(ea_inode); + if (err) { + iput(ea_inode); + return ERR_PTR(err); + } + + /* + * Xattr inodes are shared therefore quota charging is performed + * at a higher level. 
+ */ + dquot_free_inode(ea_inode); + dquot_drop(ea_inode); + inode_lock(ea_inode); + ea_inode->i_flags |= S_NOQUOTA; + inode_unlock(ea_inode); + } + + return ea_inode; +} + +static struct inode * +ext4_xattr_inode_cache_find(struct inode *inode, const void *value, + size_t value_len, u32 hash) +{ + struct inode *ea_inode; + struct mb_cache_entry *ce; + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + void *ea_data; + + if (!ea_inode_cache) + return NULL; + + ce = mb_cache_entry_find_first(ea_inode_cache, hash); + if (!ce) + return NULL; + + ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + if (!ea_data) { + mb_cache_entry_put(ea_inode_cache, ce); + return NULL; + } + + while (ce) { + ea_inode = ext4_iget(inode->i_sb, ce->e_value); + if (!IS_ERR(ea_inode) && + !is_bad_inode(ea_inode) && + (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && + i_size_read(ea_inode) == value_len && + !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && + !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, + value_len) && + !memcmp(value, ea_data, value_len)) { + mb_cache_entry_touch(ea_inode_cache, ce); + mb_cache_entry_put(ea_inode_cache, ce); + kvfree(ea_data); + return ea_inode; + } + + if (!IS_ERR(ea_inode)) + iput(ea_inode); + ce = mb_cache_entry_find_next(ea_inode_cache, ce); + } + kvfree(ea_data); + return NULL; +} + +/* + * Add value of the EA in an inode. 
+ */ +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, + const void *value, size_t value_len, + struct inode **ret_inode) +{ + struct inode *ea_inode; + u32 hash; + int err; + + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); + ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); + if (ea_inode) { + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + iput(ea_inode); + return err; + } + + *ret_inode = ea_inode; + return 0; + } + + /* Create an inode for the EA value */ + ea_inode = ext4_xattr_inode_create(handle, inode, hash); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + + err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); + if (err) { + ext4_xattr_inode_dec_ref(handle, ea_inode); + iput(ea_inode); + return err; + } + + if (EA_INODE_CACHE(inode)) + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, + ea_inode->i_ino, true /* reusable */); + + *ret_inode = ea_inode; + return 0; +} + +/* + * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode + * feature is enabled. + */ +#define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) + +static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + struct ext4_xattr_search *s, + handle_t *handle, struct inode *inode, + bool is_block) { struct ext4_xattr_entry *last; - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + struct ext4_xattr_entry *here = s->here; + size_t min_offs = s->end - s->base, name_len = strlen(i->name); + int in_inode = i->in_inode; + struct inode *old_ea_inode = NULL; + struct inode *new_ea_inode = NULL; + size_t old_size, new_size; + int ret; + + /* Space used by old and new values. */ + old_size = (!s->not_found && !here->e_value_inum) ? + EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; + new_size = (i->value && !in_inode) ? 
EXT4_XATTR_SIZE(i->value_len) : 0; + + /* + * Optimization for the simple case when old and new values have the + * same padded sizes. Not applicable if external inodes are involved. + */ + if (new_size && new_size == old_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + here->e_value_size = cpu_to_le32(i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, new_size); + } else { + memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, new_size - i->value_len); + } + goto update_hash; + } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } - free = min_offs - ((void *)last - s->base) - sizeof(__u32); - if (!s->not_found) { - if (s->here->e_value_size) { - size_t size = le32_to_cpu(s->here->e_value_size); - free += EXT4_XATTR_SIZE(size); - } - free += EXT4_XATTR_LEN(name_len); - } + + /* Check whether we have enough space. */ if (i->value) { - if (free < EXT4_XATTR_LEN(name_len) + - EXT4_XATTR_SIZE(i->value_len)) - return -ENOSPC; + size_t free; + + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) + free += EXT4_XATTR_LEN(name_len) + old_size; + + if (free < EXT4_XATTR_LEN(name_len) + new_size) { + ret = -ENOSPC; + goto out; + } + + /* + * If storing the value in an external inode is an option, + * reserve space for xattr entries/names in the external + * attribute block so that a long value does not occupy the + * whole space and prevent further entries being added. + */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + new_size && is_block && + (min_offs + old_size - new_size) < + EXT4_XATTR_BLOCK_RESERVE(inode)) { + ret = -ENOSPC; + goto out; + } } - if (i->value && s->not_found) { - /* Insert the new name. 
*/ - size_t size = EXT4_XATTR_LEN(name_len); - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); - memmove((void *)s->here + size, s->here, rest); - memset(s->here, 0, size); - s->here->e_name_index = i->name_index; - s->here->e_name_len = name_len; - memcpy(s->here->e_name, i->name, name_len); - } else { - if (s->here->e_value_size) { - void *first_val = s->base + min_offs; - size_t offs = le16_to_cpu(s->here->e_value_offs); - void *val = s->base + offs; - size_t size = EXT4_XATTR_SIZE( - le32_to_cpu(s->here->e_value_size)); - - if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { - /* The old and the new value have the same - size. Just replace. */ - s->here->e_value_size = - cpu_to_le32(i->value_len); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } - return 0; - } + /* + * Getting access to old and new ea inodes is subject to failures. + * Finish that work before doing any modifications to the xattr data. + */ + if (!s->not_found && here->e_value_inum) { + ret = ext4_xattr_inode_iget(inode, + le32_to_cpu(here->e_value_inum), + le32_to_cpu(here->e_hash), + &old_ea_inode); + if (ret) { + old_ea_inode = NULL; + goto out; + } + } + if (i->value && in_inode) { + WARN_ON_ONCE(!i->value_len); + + ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); + if (ret) + goto out; + + ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, + i->value_len, + &new_ea_inode); + if (ret) { + new_ea_inode = NULL; + ext4_xattr_inode_free_quota(inode, NULL, i->value_len); + goto out; + } + } - /* Remove the old value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); - s->here->e_value_size = 0; - s->here->e_value_offs = 0; - min_offs += size; - - /* Adjust all value offsets. 
*/ - last = s->first; - while (!IS_LAST_ENTRY(last)) { - size_t o = le16_to_cpu(last->e_value_offs); - if (last->e_value_size && o < offs) - last->e_value_offs = - cpu_to_le16(o + size); - last = EXT4_XATTR_NEXT(last); + if (old_ea_inode) { + /* We are ready to release ref count on the old_ea_inode. */ + ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); + if (ret) { + /* Release newly required ref count on new_ea_inode. */ + if (new_ea_inode) { + int err; + + err = ext4_xattr_inode_dec_ref(handle, + new_ea_inode); + if (err) + ext4_warning_inode(new_ea_inode, + "dec ref new_ea_inode err=%d", + err); + ext4_xattr_inode_free_quota(inode, new_ea_inode, + i->value_len); } + goto out; } - if (!i->value) { - /* Remove the old name. */ - size_t size = EXT4_XATTR_LEN(name_len); - last = ENTRY((void *)last - size); - memmove(s->here, (void *)s->here + size, - (void *)last - (void *)s->here + sizeof(__u32)); - memset(last, 0, size); + + ext4_xattr_inode_free_quota(inode, old_ea_inode, + le32_to_cpu(here->e_value_size)); + } + + /* No failures allowed past this point. */ + + if (!s->not_found && here->e_value_offs) { + /* Remove the old value. */ + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + memmove(first_val + old_size, first_val, val - first_val); + memset(first_val, 0, old_size); + min_offs += old_size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + + if (!last->e_value_inum && + last->e_value_size && o < offs) + last->e_value_offs = cpu_to_le16(o + old_size); + last = EXT4_XATTR_NEXT(last); } } + if (!i->value) { + /* Remove old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + + last = ENTRY((void *)last - size); + memmove(here, (void *)here + size, + (void *)last - (void *)here + sizeof(__u32)); + memset(last, 0, size); + } else if (s->not_found) { + /* Insert new name. 
*/ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)here + sizeof(__u32); + + memmove((void *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = i->name_index; + here->e_name_len = name_len; + memcpy(here->e_name, i->name, name_len); + } else { + /* This is an update, reset value info. */ + here->e_value_inum = 0; + here->e_value_offs = 0; + here->e_value_size = 0; + } + if (i->value) { - /* Insert the new value. */ - s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { - size_t size = EXT4_XATTR_SIZE(i->value_len); - void *val = s->base + min_offs - size; - s->here->e_value_offs = cpu_to_le16(min_offs - size); + /* Insert new value. */ + if (in_inode) { + here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); + } else if (i->value_len) { + void *val = s->base + min_offs - new_size; + + here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); + memset(val, 0, new_size); } else { - /* Clear the pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, + new_size - i->value_len); } } + here->e_value_size = cpu_to_le32(i->value_len); } - return 0; + +update_hash: + if (i->value) { + __le32 hash = 0; + + /* Entry hash calculation. */ + if (in_inode) { + __le32 crc32c_hash; + + /* + * Feed crc32c hash instead of the raw value for entry + * hash calculation. This is to avoid walking + * potentially long value buffer again. 
+ */ + crc32c_hash = cpu_to_le32( + ext4_xattr_inode_get_hash(new_ea_inode)); + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, + &crc32c_hash, 1); + } else if (is_block) { + __le32 *value = s->base + le16_to_cpu( + here->e_value_offs); + + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, value, + new_size >> 2); + } + here->e_hash = hash; + } + + if (is_block) + ext4_xattr_rehash((struct ext4_xattr_header *)s->base); + + ret = 0; +out: + iput(old_ea_inode); + iput(new_ea_inode); + return ret; } struct ext4_xattr_block_find { @@ -794,15 +1822,18 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; - struct ext4_xattr_search *s = &bs->s; + struct ext4_xattr_search s_copy = bs->s; + struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + struct inode *ea_inode = NULL, *tmp_inode; + size_t old_ea_inode_quota = 0; + unsigned int ea_ino; + #define header(x) ((struct ext4_xattr_header *)(x)) - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; if (s->base) { BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); @@ -818,17 +1849,12 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, - bs->bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s); - if (!error) { - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), - s->here); - ext4_xattr_cache_insert(ext4_mb_cache, - bs->bh); - } + error = ext4_xattr_set_entry(i, s, handle, inode, + true /* is_block */); ext4_xattr_block_csum_set(inode, 
bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -854,6 +1880,36 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; + + /* + * If existing entry points to an xattr inode, we need + * to prevent ext4_xattr_set_entry() from decrementing + * ref count on it because the reference belongs to the + * original block. In this case, make the entry look + * like it has an empty value. + */ + if (!s->not_found && s->here->e_value_inum) { + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &tmp_inode); + if (error) + goto cleanup; + + if (!ext4_test_inode_state(tmp_inode, + EXT4_STATE_LUSTRE_EA_INODE)) { + /* + * Defer quota free call for previous + * inode until success is guaranteed. + */ + old_ea_inode_quota = le32_to_cpu( + s->here->e_value_size); + } + iput(tmp_inode); + + s->here->e_value_inum = 0; + s->here->e_value_size = 0; + } } } else { /* Allocate a buffer where we construct the new block. */ @@ -870,17 +1926,33 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), s->here); + + if (i->value && s->here->e_value_inum) { + /* + * A ref count on ea_inode has been taken as part of the call to + * ext4_xattr_set_entry() above. We would like to drop this + * extra ref but we have to wait until the xattr block is + * initialized and has its own ref count on the ea_inode. 
+ */ + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &ea_inode); + if (error) { + ea_inode = NULL; + goto cleanup; + } + } inserted: if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), + &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) @@ -925,7 +1997,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; @@ -944,12 +2016,13 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_touch(ext4_mb_cache, ce); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); + ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { @@ -984,6 +2057,22 @@ getblk_failed: EXT4_FREE_BLOCKS_METADATA); goto cleanup; } + error = ext4_xattr_inode_inc_ref_all(handle, inode, + ENTRY(header(s->base)+1)); + if (error) + goto getblk_failed; + if (ea_inode) { + /* Drop the extra ref on ea_inode. 
*/ + error = ext4_xattr_inode_dec_ref(handle, + ea_inode); + if (error) + ext4_warning_inode(ea_inode, + "dec ref error=%d", + error); + iput(ea_inode); + ea_inode = NULL; + } + lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { @@ -995,7 +2084,7 @@ getblk_failed: ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) @@ -1003,17 +2092,40 @@ getblk_failed: } } + if (old_ea_inode_quota) + ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); + /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ - if (bs->bh && bs->bh != new_bh) - ext4_xattr_release_block(handle, inode, bs->bh); + if (bs->bh && bs->bh != new_bh) { + struct ext4_xattr_inode_array *ea_inode_array = NULL; + + ext4_xattr_release_block(handle, inode, bs->bh, + &ea_inode_array, + 0 /* extra_credits */); + ext4_xattr_inode_array_free(ea_inode_array); + } error = 0; cleanup: + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + /* If there was an error, revert the quota charge. 
*/ + if (error) + ext4_xattr_inode_free_quota(inode, ea_inode, + i_size_read(ea_inode)); + iput(ea_inode); + } if (ce) - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1070,7 +2182,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) { if (error == -ENOSPC && ext4_has_inline_data(inode)) { @@ -1082,7 +2194,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_find(inode, i, is); if (error) return error; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, + false /* is_block */); } if (error) return error; @@ -1098,7 +2211,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(struct inode *inode, +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1108,7 +2221,7 @@ static int ext4_xattr_ibody_set(struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); @@ -1127,12 +2240,31 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s, { void *value; + /* When e_value_inum is set the value is stored externally. 
*/ + if (s->here->e_value_inum) + return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } +static struct buffer_head *ext4_xattr_get_block(struct inode *inode) +{ + struct buffer_head *bh; + int error; + + if (!EXT4_I(inode)->i_file_acl) + return NULL; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + return ERR_PTR(-EIO); + error = ext4_xattr_check_block(inode, bh); + if (error) + return ERR_PTR(error); + return bh; +} + /* * ext4_xattr_set_handle() * @@ -1155,7 +2287,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, .name = name, .value = value, .value_len = value_len, - + .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, @@ -1173,6 +2305,28 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_write_lock_xattr(inode, &no_expand); + /* Check journal credits under write lock. 
*/ + if (ext4_handle_valid(handle)) { + struct buffer_head *bh; + int credits; + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + goto cleanup; + } + + credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, + flags & XATTR_CREATE); + brelse(bh); + + if (!ext4_handle_has_enough_credits(handle, credits)) { + error = -ENOSPC; + goto cleanup; + } + } + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; @@ -1202,9 +2356,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } + if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1215,7 +2370,12 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(inode, &i, &is); + if (ext4_has_feature_ea_inode(inode->i_sb) && + (EXT4_XATTR_SIZE(i.value_len) > + EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) + i.in_inode = 1; +retry_inode: + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1226,11 +2386,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); - if (error) - goto cleanup; - if (!is.s.not_found) { + if (!error && !is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); + } else if (error == -ENOSPC) { + /* + * Xattr does not fit in the block, store at + * external inode if possible. 
+ */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + !i.in_inode) { + i.in_inode = 1; + goto retry_inode; + } } } } @@ -1256,6 +2425,33 @@ cleanup: return error; } +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits) +{ + struct buffer_head *bh; + int err; + + *credits = 0; + + if (!EXT4_SB(inode->i_sb)->s_journal) + return 0; + + down_read(&EXT4_I(inode)->xattr_sem); + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + } else { + *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, is_create); + brelse(bh); + err = 0; + } + + up_read(&EXT4_I(inode)->xattr_sem); + return err; +} + /* * ext4_xattr_set() * @@ -1269,13 +2465,20 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; + struct super_block *sb = inode->i_sb; int error, retries = 0; - int credits = ext4_jbd2_credits_xattr(inode); + int credits; error = dquot_initialize(inode); if (error) return error; + retry: + error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, + &credits); + if (error) + return error; + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -1286,7 +2489,7 @@ retry: value, value_len, flags); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; @@ -1311,7 +2514,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); @@ -1331,18 +2534,16 @@ static int ext4_xattr_move_to_block(handle_t *handle, 
struct inode *inode, struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; - size_t value_offs, value_size; + size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, + .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int error; - value_offs = le16_to_cpu(entry->e_value_offs); - value_size = le32_to_cpu(entry->e_value_size); - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); buffer = kmalloc(value_size, GFP_NOFS); @@ -1358,7 +2559,15 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, bs->bh = NULL; /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, value_size); + if (error) + goto out; + } else { + size_t value_offs = le16_to_cpu(entry->e_value_offs); + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + } + memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; @@ -1372,11 +2581,10 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(inode, &i, is); + error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto out; - i.name = b_entry_name; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); @@ -1420,9 +2628,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - 
EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); + total_size = EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { @@ -1441,8 +2650,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, } entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + total_size = entry_size; + if (!entry->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) @@ -1464,23 +2675,21 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct buffer_head *bh = NULL; + struct buffer_head *bh; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; - int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ - int no_expand; - - if (ext4_write_trylock_xattr(inode, &no_expand) == 0) - return 0; retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) - goto out; + return 0; header = IHDR(inode, raw_inode); @@ -1515,6 +2724,7 @@ retry: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EFSCORRUPTED; + brelse(bh); goto cleanup; } base = BHDR(bh); @@ -1522,11 +2732,11 @@ retry: min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); + brelse(bh); if 
(bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - brelse(bh); goto retry; } error = -ENOSPC; @@ -1544,7 +2754,6 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - brelse(bh); goto retry; } goto cleanup; @@ -1556,66 +2765,192 @@ shift: EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; - brelse(bh); -out: - ext4_write_unlock_xattr(inode, &no_expand); - return 0; cleanup: - brelse(bh); - /* - * Inode size expansion failed; don't try again - */ - no_expand = 1; - ext4_write_unlock_xattr(inode, &no_expand); + if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", + inode->i_ino); + mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); + } return error; } +#define EIA_INCR 16 /* must be 2^n */ +#define EIA_MASK (EIA_INCR - 1) +/* Add the large xattr @inode into @ea_inode_array for deferred iput(). + * If @ea_inode_array is new or full it will be grown and the old + * contents copied over. + */ +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode) +{ + if (*ea_inode_array == NULL) { + /* + * Start with 15 inodes, so it fits into a power-of-two size. 
+ * If *ea_inode_array is NULL, this is essentially offsetof() + */ + (*ea_inode_array) = + kmalloc(offsetof(struct ext4_xattr_inode_array, + inodes[EIA_MASK]), + GFP_NOFS); + if (*ea_inode_array == NULL) + return -ENOMEM; + (*ea_inode_array)->count = 0; + } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { + /* expand the array once all 15 + n * 16 slots are full */ + struct ext4_xattr_inode_array *new_array = NULL; + int count = (*ea_inode_array)->count; + + /* if new_array is NULL, this is essentially offsetof() */ + new_array = kmalloc( + offsetof(struct ext4_xattr_inode_array, + inodes[count + EIA_INCR]), + GFP_NOFS); + if (new_array == NULL) + return -ENOMEM; + memcpy(new_array, *ea_inode_array, + offsetof(struct ext4_xattr_inode_array, inodes[count])); + kfree(*ea_inode_array); + *ea_inode_array = new_array; + } + (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; + return 0; +} /* * ext4_xattr_delete_inode() * - * Free extended attribute resources associated with this inode. This - * is called immediately before an inode is freed. We have exclusive - * access to the inode. + * Free extended attribute resources associated with this inode. Traverse + * all entries and decrement reference on any xattr inodes associated with this + * inode. This is called immediately before an inode is freed. We have exclusive + * access to the inode. If an orphan inode is deleted it will also release its + * references on xattr block and xattr inodes. 
*/ -void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_header *header; + struct ext4_iloc iloc = { .bh = NULL }; + struct ext4_xattr_entry *entry; + struct inode *ea_inode; + int error; - if (!EXT4_I(inode)->i_file_acl) - goto cleanup; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); + error = ext4_xattr_ensure_credits(handle, inode, extra_credits, + NULL /* bh */, + false /* dirty */, + false /* block_csum */); + if (error) { + EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - goto cleanup; + + if (ext4_has_feature_ea_inode(inode->i_sb) && + ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); + goto cleanup; + } + + error = ext4_journal_get_write_access(handle, iloc.bh); + if (error) { + EXT4_ERROR_INODE(inode, "write access (error %d)", + error); + goto cleanup; + } + + header = IHDR(inode, ext4_raw_inode(&iloc)); + if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, + IFIRST(header), + false /* block_csum */, + ea_inode_array, + extra_credits, + false /* skip_quota */); } - ext4_xattr_release_block(handle, inode, bh); - EXT4_I(inode)->i_file_acl = 0; + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + error = 
ext4_xattr_check_block(inode, bh); + if (error) { + EXT4_ERROR_INODE(inode, "bad block %llu (error %d)", + EXT4_I(inode)->i_file_acl, error); + goto cleanup; + } + + if (ext4_has_feature_ea_inode(inode->i_sb)) { + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + error = ext4_xattr_inode_iget(inode, + le32_to_cpu(entry->e_value_inum), + le32_to_cpu(entry->e_hash), + &ea_inode); + if (error) + continue; + ext4_xattr_inode_free_quota(inode, ea_inode, + le32_to_cpu(entry->e_value_size)); + iput(ea_inode); + } + + } + + ext4_xattr_release_block(handle, inode, bh, ea_inode_array, + extra_credits); + /* + * Update i_file_acl value in the same transaction that releases + * block. + */ + EXT4_I(inode)->i_file_acl = 0; + error = ext4_mark_inode_dirty(handle, inode); + if (error) { + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", + error); + goto cleanup; + } + } + error = 0; cleanup: + brelse(iloc.bh); brelse(bh); + return error; +} + +void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) +{ + int idx; + + if (ea_inode_array == NULL) + return; + + for (idx = 0; idx < ea_inode_array->count; ++idx) + iput(ea_inode_array->inodes[idx]); + kfree(ea_inode_array); } /* - * ext4_xattr_cache_insert() + * ext4_xattr_block_cache_insert() * - * Create a new entry in the extended attribute cache, and insert + * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. * * Returns 0, or a negative error number on failure. 
*/ static void -ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, + struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); @@ -1623,7 +2958,9 @@ ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) EXT4_XATTR_REFCOUNT_MAX; int error; - error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + if (!ea_block_cache) + return; + error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) @@ -1655,11 +2992,11 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || + entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EFSCORRUPTED; - if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + if (!entry1->e_value_inum && + memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; @@ -1673,7 +3010,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, } /* - * ext4_xattr_cache_find() + * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * @@ -1681,30 +3018,33 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, * not found or an error occurred. 
*/ static struct buffer_head * -ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) +ext4_xattr_block_cache_find(struct inode *inode, + struct ext4_xattr_header *header, + struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + if (!ea_block_cache) + return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); - ce = mb_cache_entry_find_first(ext4_mb_cache, hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_block); + bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long) ce->e_block); + (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ext4_mb_cache, ce); + ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } @@ -1717,30 +3057,22 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, * * Compute the hash of an extended attribute. 
*/ -static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count) { __u32 hash = 0; - char *name = entry->e_name; - int n; - for (n = 0; n < entry->e_name_len; n++) { + while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; } - - if (entry->e_value_size != 0) { - __le32 *value = (__le32 *)((char *)header + - le16_to_cpu(entry->e_value_offs)); - for (n = (le32_to_cpu(entry->e_value_size) + - EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { - hash = (hash << VALUE_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ - le32_to_cpu(*value++); - } + while (value_count--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); } - entry->e_hash = cpu_to_le32(hash); + return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT @@ -1753,13 +3085,11 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, * * Re-compute the extended attribute hash value after an entry has changed. 
*/ -static void ext4_xattr_rehash(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; - ext4_xattr_hash_entry(header, entry); here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 099c8b670ef5..f8cc07588ac9 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* File: fs/ext4/xattr.h @@ -44,7 +45,7 @@ struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ __le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[0]; /* attribute name */ @@ -69,6 +70,13 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +/* + * The minimum size of EA value when you start storing it in an external inode + * size of block - size of header - size of 1 entry - 4 null bytes +*/ +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ + ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) #define BFIRST(bh) ENTRY(BHDR(bh)+1) @@ -77,10 +85,11 @@ struct ext4_xattr_entry { #define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { - int name_index; const char *name; const void *value; size_t value_len; + int name_index; + int in_inode; }; struct ext4_xattr_search { @@ -96,6 +105,11 @@ struct ext4_xattr_ibody_find { struct ext4_iloc iloc; }; +struct ext4_xattr_inode_array { + unsigned int count; /* # of used items in the array */ + struct inode 
*inodes[0]; +}; + extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; @@ -139,8 +153,16 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits); +extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create); -extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **array, + int extra_credits); +extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -169,3 +191,11 @@ static inline int ext4_init_security(handle_t *handle, struct inode *inode, return 0; } #endif + +#ifdef CONFIG_LOCKDEP +extern void ext4_xattr_inode_set_class(struct inode *ea_inode); +#else +static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } +#endif + +extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index a8921112030d..629001b28632 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_security.c * Handler for storing security labels as extended attributes. 
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index c7765c735714..e9389e5d75c3 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_trusted.c * Handler for trusted extended attributes. diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index ca20e423034b..d4546184b34b 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_user.c * Handler for extended user attributes. |