summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/.kunitconfig3
-rw-r--r--fs/ext4/Kconfig36
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c21
-rw-r--r--fs/ext4/acl.h5
-rw-r--r--fs/ext4/balloc.c268
-rw-r--r--fs/ext4/bitmap.c37
-rw-r--r--fs/ext4/block_validity.c194
-rw-r--r--fs/ext4/crypto.c241
-rw-r--r--fs/ext4/dir.c203
-rw-r--r--fs/ext4/ext4.h1473
-rw-r--r--fs/ext4/ext4_extents.h12
-rw-r--r--fs/ext4/ext4_jbd2.c137
-rw-r--r--fs/ext4/ext4_jbd2.h161
-rw-r--r--fs/ext4/extents.c2351
-rw-r--r--fs/ext4/extents_status.c759
-rw-r--r--fs/ext4/extents_status.h44
-rw-r--r--fs/ext4/fast_commit.c2343
-rw-r--r--fs/ext4/fast_commit.h186
-rw-r--r--fs/ext4/file.c477
-rw-r--r--fs/ext4/fsmap.c99
-rw-r--r--fs/ext4/fsmap.h4
-rw-r--r--fs/ext4/fsync.c70
-rw-r--r--fs/ext4/hash.c41
-rw-r--r--fs/ext4/ialloc.c468
-rw-r--r--fs/ext4/indirect.c72
-rw-r--r--fs/ext4/inline.c805
-rw-r--r--fs/ext4/inode-test.c321
-rw-r--r--fs/ext4/inode.c4439
-rw-r--r--fs/ext4/ioctl.c1281
-rw-r--r--fs/ext4/mballoc-test.c999
-rw-r--r--fs/ext4/mballoc.c4197
-rw-r--r--fs/ext4/mballoc.h79
-rw-r--r--fs/ext4/migrate.c59
-rw-r--r--fs/ext4/mmp.c139
-rw-r--r--fs/ext4/move_extent.c913
-rw-r--r--fs/ext4/namei.c1635
-rw-r--r--fs/ext4/orphan.c661
-rw-r--r--fs/ext4/page-io.c257
-rw-r--r--fs/ext4/readpage.c168
-rw-r--r--fs/ext4/resize.c337
-rw-r--r--fs/ext4/super.c5646
-rw-r--r--fs/ext4/symlink.c80
-rw-r--r--fs/ext4/sysfs.c266
-rw-r--r--fs/ext4/truncate.h8
-rw-r--r--fs/ext4/verity.c166
-rw-r--r--fs/ext4/xattr.c872
-rw-r--r--fs/ext4/xattr.h42
-rw-r--r--fs/ext4/xattr_hurd.c52
-rw-r--r--fs/ext4/xattr_security.c1
-rw-r--r--fs/ext4/xattr_trusted.c1
-rw-r--r--fs/ext4/xattr_user.c1
52 files changed, 21886 insertions, 11248 deletions
diff --git a/fs/ext4/.kunitconfig b/fs/ext4/.kunitconfig
new file mode 100644
index 000000000000..bf51da7cd9fc
--- /dev/null
+++ b/fs/ext4/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_KUNIT_TESTS=y
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 1afa5a4bcb5f..01873c2a34ad 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,37 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-only
-# Ext3 configs are here for backward compatibility with old configs which may
-# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
-# kernels after the removal of ext3 driver.
-config EXT3_FS
- tristate "The Extended 3 (ext3) filesystem"
- select EXT4_FS
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_POSIX_ACL
- bool "Ext3 POSIX Access Control Lists"
- depends on EXT3_FS
- select EXT4_FS_POSIX_ACL
- select FS_POSIX_ACL
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_SECURITY
- bool "Ext3 Security Labels"
- depends on EXT3_FS
- select EXT4_FS_SECURITY
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
+ select BUFFER_HEAD
select JBD2
select CRC16
- select CRYPTO
- select CRYPTO_CRC32C
+ select CRC32
select FS_IOMAP
select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
help
@@ -103,14 +76,13 @@ config EXT4_DEBUG
config EXT4_KUNIT_TESTS
tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS
- select EXT4_FS
- depends on KUNIT
+ depends on EXT4_FS && KUNIT
default KUNIT_ALL_TESTS
help
This builds the ext4 KUnit tests.
KUnit tests run during boot and output the results to the debug log
- in TAP format (http://testanything.org/). Only useful for kernel devs
+ in TAP format (https://testanything.org/). Only useful for kernel devs
running KUnit test harness and are not for inclusion into a production
build.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 4ccb3c9189d8..72206a292676 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -9,10 +9,12 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+ xattr_user.o fast_commit.o orphan.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
ext4-inode-test-objs += inode-test.o
obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
+ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 76f634d185f1..3bffe862f954 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -139,16 +139,19 @@ fail:
/*
* Inode operation get_posix_acl().
*
- * inode->i_mutex: don't care
+ * inode->i_rwsem: don't care
*/
struct posix_acl *
-ext4_get_acl(struct inode *inode, int type)
+ext4_get_acl(struct inode *inode, int type, bool rcu)
{
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -180,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type)
/*
* Set the access or default ACL of an inode.
*
- * inode->i_mutex: down unless called from ext4_new_inode
+ * inode->i_rwsem: down unless called from ext4_new_inode
*/
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
@@ -222,11 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
}
int
-ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type)
{
handle_t *handle;
int error, credits, retries = 0;
size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
int update_mode = 0;
@@ -244,7 +249,7 @@ retry:
return PTR_ERR(handle);
if ((type == ACL_TYPE_ACCESS) && acl) {
- error = posix_acl_update_mode(inode, &mode, &acl);
+ error = posix_acl_update_mode(idmap, inode, &mode, &acl);
if (error)
goto out_stop;
if (mode != inode->i_mode)
@@ -254,7 +259,7 @@ retry:
error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
if (!error && update_mode) {
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
error = ext4_mark_inode_dirty(handle, inode);
}
out_stop:
@@ -267,8 +272,8 @@ out_stop:
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
+ * dir->i_rwsem: down
+ * inode->i_rwsem: up (access to inode is still exclusive)
*/
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9b63f5416a2f..0c5a79c3b5d4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -55,8 +55,9 @@ static inline int ext4_acl_count(size_t size)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu);
+int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ba46d87cdf1..8040c731b3e4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -22,6 +22,7 @@
#include "mballoc.h"
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group);
@@ -80,32 +81,54 @@ static inline int ext4_block_in_group(struct super_block *sb,
return (actual_group == block_group) ? 1 : 0;
}
-/* Return the number of clusters used for file system metadata; this
+/*
+ * Return the number of clusters used for file system metadata; this
* represents the overhead needed by the file system.
*/
static unsigned ext4_num_overhead_clusters(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
- unsigned num_clusters;
- int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+ unsigned base_clusters, num_clusters;
+ int block_cluster = -1, inode_cluster;
+ int itbl_cluster_start = -1, itbl_cluster_end = -1;
ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
- ext4_fsblk_t itbl_blk;
+ ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1;
+ ext4_fsblk_t itbl_blk_start, itbl_blk_end;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* This is the number of clusters used by the superblock,
* block group descriptors, and reserved block group
* descriptor blocks */
- num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+ base_clusters = ext4_num_base_meta_clusters(sb, block_group);
+ num_clusters = base_clusters;
+
+ /*
+ * Account and record inode table clusters if any cluster
+ * is in the block group, or inode table cluster range is
+ * [-1, -1] and won't overlap with block/inode bitmap cluster
+ * accounted below.
+ */
+ itbl_blk_start = ext4_inode_table(sb, gdp);
+ itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
+ if (itbl_blk_start <= end && itbl_blk_end >= start) {
+ itbl_blk_start = max(itbl_blk_start, start);
+ itbl_blk_end = min(itbl_blk_end, end);
+
+ itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
+ itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);
+
+ num_clusters += itbl_cluster_end - itbl_cluster_start + 1;
+ /* check if border cluster is overlapped */
+ if (itbl_cluster_start == base_clusters - 1)
+ num_clusters--;
+ }
/*
- * For the allocation bitmaps and inode table, we first need
- * to check to see if the block is in the block group. If it
- * is, then check to see if the cluster is already accounted
- * for in the clusters used for the base metadata cluster, or
- * if we can increment the base metadata cluster to include
- * that block. Otherwise, we will have to track the cluster
- * used for the allocation bitmap or inode table explicitly.
+ * For the allocation bitmaps, we first need to check to see
+ * if the block is in the block group. If it is, then check
+ * to see if the cluster is already accounted for in the clusters
+ * used for the base metadata cluster and inode tables cluster.
* Normally all of these blocks are contiguous, so the special
* case handling shouldn't be necessary except for *very*
* unusual file system layouts.
@@ -113,46 +136,26 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
block_cluster = EXT4_B2C(sbi,
ext4_block_bitmap(sb, gdp) - start);
- if (block_cluster < num_clusters)
- block_cluster = -1;
- else if (block_cluster == num_clusters) {
+ if (block_cluster >= base_clusters &&
+ (block_cluster < itbl_cluster_start ||
+ block_cluster > itbl_cluster_end))
num_clusters++;
- block_cluster = -1;
- }
}
if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
inode_cluster = EXT4_B2C(sbi,
ext4_inode_bitmap(sb, gdp) - start);
- if (inode_cluster < num_clusters)
- inode_cluster = -1;
- else if (inode_cluster == num_clusters) {
- num_clusters++;
- inode_cluster = -1;
- }
- }
-
- itbl_blk = ext4_inode_table(sb, gdp);
- for (i = 0; i < sbi->s_itb_per_group; i++) {
- if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
- c = EXT4_B2C(sbi, itbl_blk + i - start);
- if ((c < num_clusters) || (c == inode_cluster) ||
- (c == block_cluster) || (c == itbl_cluster))
- continue;
- if (c == num_clusters) {
- num_clusters++;
- continue;
- }
+ /*
+ * Additional check if inode bitmap is in just accounted
+ * block_cluster
+ */
+ if (inode_cluster != block_cluster &&
+ inode_cluster >= base_clusters &&
+ (inode_cluster < itbl_cluster_start ||
+ inode_cluster > itbl_cluster_end))
num_clusters++;
- itbl_cluster = c;
- }
}
- if (block_cluster != -1)
- num_clusters++;
- if (inode_cluster != -1)
- num_clusters++;
-
return num_clusters;
}
@@ -185,10 +188,8 @@ static int ext4_init_block_bitmap(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
- J_ASSERT_BH(bh, buffer_locked(bh));
+ ASSERT(buffer_locked(bh));
- /* If checksum is bad mark all blocks used to prevent allocation
- * essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT |
@@ -239,7 +240,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
- return num_clusters_in_group(sb, block_group) -
+ return num_clusters_in_group(sb, block_group) -
ext4_num_overhead_clusters(sb, block_group, gdp);
}
@@ -272,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh_p;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
+ sb, block_group, bh);
+
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
" groups_count = %u", block_group, ngroups);
@@ -303,6 +307,36 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
return desc;
}
+static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_grpblk_t next_zero_bit;
+ unsigned long bitmap_size = sb->s_blocksize * 8;
+ unsigned int offset = num_clusters_in_group(sb, block_group);
+
+ if (bitmap_size <= offset)
+ return 0;
+
+ next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset);
+
+ return (next_zero_bit < bitmap_size ? next_zero_bit : 0);
+}
+
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info **grp_info;
+ long indexv, indexh;
+
+ if (unlikely(group >= EXT4_SB(sb)->s_groups_count))
+ return NULL;
+ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+ return grp_info[indexh];
+}
+
/*
* Return the block number which was discovered to be invalid, or 0 if
* the block bitmap is valid.
@@ -350,13 +384,13 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
- EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit)
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit)
return blk;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
- EXT4_B2C(sbi, offset + sbi->s_itb_per_group),
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1,
EXT4_B2C(sbi, offset));
if (next_zero_bit <
- EXT4_B2C(sbi, offset + sbi->s_itb_per_group))
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1)
/* bad bitmap for inode tables */
return blk;
return 0;
@@ -368,18 +402,22 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
- if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
if (buffer_verified(bh))
goto verified;
- if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh) ||
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) ||
ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
@@ -396,6 +434,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSCORRUPTED;
}
+ blk = ext4_valid_block_bitmap_padding(sb, block_group, bh);
+ if (unlikely(blk != 0)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set",
+ block_group, blk);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ return -EFSCORRUPTED;
+ }
set_buffer_verified(bh);
verified:
ext4_unlock_group(sb, block_group);
@@ -406,6 +453,7 @@ verified:
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
+ * @ignore_locked: ignore locked buffers
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
@@ -413,7 +461,8 @@ verified:
* Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -421,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
ext4_fsblk_t bitmap_blk;
int err;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
+ sb, block_group, ignore_locked);
+
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return ERR_PTR(-EFSCORRUPTED);
@@ -441,6 +493,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
return ERR_PTR(-ENOMEM);
}
+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, return if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
if (bitmap_uptodate(bh))
goto verify;
@@ -461,17 +519,19 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
goto out;
}
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
- set_buffer_uptodate(bh);
- set_buffer_verified(bh);
- ext4_unlock_group(sb, block_group);
- unlock_buffer(bh);
if (err) {
+ ext4_unlock_group(sb, block_group);
+ unlock_buffer(bh);
ext4_error(sb, "Failed to init block bitmap for group "
"%u: %d", block_group, err);
goto out;
}
- goto verify;
+ set_bitmap_uptodate(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+ unlock_buffer(bh);
+ return bh;
}
ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
@@ -487,10 +547,11 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
set_buffer_new(bh);
- trace_ext4_read_block_bitmap_load(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0),
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO));
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -508,13 +569,15 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
{
struct ext4_group_desc *desc;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
+ sb, block_group, bh);
+
if (!buffer_new(bh))
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return -EFSCORRUPTED;
wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
ext4_error_err(sb, EIO, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
@@ -534,7 +597,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
int err;
- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
@@ -586,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+ capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
@@ -614,27 +677,47 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
/**
* ext4_should_retry_alloc() - check if a block allocation should be retried
- * @sb: super block
- * @retries: number of attemps has been made
+ * @sb: superblock
+ * @retries: number of retry attempts made so far
*
- * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE. We will only retry once.
+ * ext4_should_retry_alloc() is called when ENOSPC is returned while
+ * attempting to allocate blocks. If there's an indication that a pending
+ * journal transaction might free some space and allow another attempt to
+ * succeed, this function will wait for the current or committing transaction
+ * to complete and then return TRUE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
- (*retries)++ > 1 ||
- !EXT4_SB(sb)->s_journal)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!sbi->s_journal)
return 0;
- smp_mb();
- if (EXT4_SB(sb)->s_mb_free_pending == 0)
+ if (++(*retries) > 3) {
+ percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
return 0;
+ }
- jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
- jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ /*
+ * if there's no indication that blocks are about to be freed it's
+ * possible we just missed a transaction commit that did so
+ */
+ smp_mb();
+ if (atomic_read(&sbi->s_mb_free_pending) == 0) {
+ if (test_opt(sb, DISCARD)) {
+ atomic_inc(&sbi->s_retry_alloc_pending);
+ flush_work(&sbi->s_discard_work);
+ atomic_dec(&sbi->s_retry_alloc_pending);
+ }
+ return ext4_has_free_clusters(sbi, 1, 0);
+ }
+
+ /*
+ * it's possible we've just missed a transaction commit here,
+ * so ignore the returned status
+ */
+ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
+ (void) jbd2_journal_force_commit_nested(sbi->s_journal);
return 1;
}
@@ -669,7 +752,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
*count = ar.len;
/*
* Account for the allocated meta blocks. We will never
- * fail EDQUOT for metdata, but we do account for it.
+ * fail EDQUOT for metadata, but we do account for it.
*/
if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
dquot_alloc_block_nofail(inode,
@@ -838,11 +921,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
}
/*
- * This function returns the number of file system metadata clusters at
+ * This function returns the number of file system metadata blocks at
* the beginning of a block group, including the reserved gdt blocks.
*/
-static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
- ext4_group_t block_group)
+unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned num;
@@ -854,14 +937,21 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
sbi->s_desc_per_block) {
if (num) {
- num += ext4_bg_num_gdb(sb, block_group);
+ num += ext4_bg_num_gdb_nometa(sb, block_group);
num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- num += ext4_bg_num_gdb(sb, block_group);
+ num += ext4_bg_num_gdb_meta(sb, block_group);
}
- return EXT4_NUM_B2C(sbi, num);
+ return num;
}
+
+static unsigned int ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group));
+}
+
/**
* ext4_inode_to_goal_block - return a hint for block allocation
* @inode: inode for block allocation
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index f63e028c638c..87760fabdd2e 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -16,19 +16,21 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
return numchars * BITS_PER_BYTE - memweight(bitmap, numchars);
}
-int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
__u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
+ sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
provided |= (hi << 16);
@@ -38,23 +40,25 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
return provided == calculated;
}
-void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ sz = EXT4_INODES_PER_GROUP(sb) >> 3;
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}
-int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh)
{
@@ -63,24 +67,21 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
provided |= (hi << 16);
} else
calculated &= 0xFFFF;
- if (provided == calculated)
- return 1;
-
- return 0;
+ return provided == calculated;
}
-void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+void ext4_block_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh)
{
@@ -88,10 +89,10 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 16e9b2fda03a..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -24,6 +24,7 @@ struct ext4_system_zone {
struct rb_node node;
ext4_fsblk_t start_blk;
unsigned int count;
+ u32 ino;
};
static struct kmem_cache *ext4_system_zone_cachep;
@@ -45,7 +46,8 @@ void ext4_exit_system_zone(void)
static inline int can_merge(struct ext4_system_zone *entry1,
struct ext4_system_zone *entry2)
{
- if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+ if ((entry1->start_blk + entry1->count) == entry2->start_blk &&
+ entry1->ino == entry2->ino)
return 1;
return 0;
}
@@ -66,11 +68,11 @@ static void release_system_zone(struct ext4_system_blocks *system_blks)
*/
static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
- unsigned int count)
+ unsigned int count, u32 ino)
{
- struct ext4_system_zone *new_entry = NULL, *entry;
+ struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
- struct rb_node *parent = NULL, *new_node = NULL;
+ struct rb_node *parent = NULL, *new_node;
while (*n) {
parent = *n;
@@ -79,30 +81,21 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
n = &(*n)->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
- else {
- if (start_blk + count > (entry->start_blk +
- entry->count))
- entry->count = (start_blk + count -
- entry->start_blk);
- new_node = *n;
- new_entry = rb_entry(new_node, struct ext4_system_zone,
- node);
- break;
- }
+ else /* Unexpected overlap of system zones. */
+ return -EFSCORRUPTED;
}
- if (!new_entry) {
- new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
- GFP_KERNEL);
- if (!new_entry)
- return -ENOMEM;
- new_entry->start_blk = start_blk;
- new_entry->count = count;
- new_node = &new_entry->node;
-
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &system_blks->root);
- }
+ new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+ GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+ new_entry->start_blk = start_blk;
+ new_entry->count = count;
+ new_entry->ino = ino;
+ new_node = &new_entry->node;
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &system_blks->root);
/* Can we merge to the left? */
node = rb_prev(new_node);
@@ -138,7 +131,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_INFO "System zones: ");
rcu_read_lock();
- system_blks = rcu_dereference(sbi->system_blks);
+ system_blks = rcu_dereference(sbi->s_system_blks);
node = rb_first(&system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
@@ -151,40 +144,6 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_CONT "\n");
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
-static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
- struct ext4_system_blocks *system_blks,
- ext4_fsblk_t start_blk,
- unsigned int count)
-{
- struct ext4_system_zone *entry;
- struct rb_node *n;
-
- if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
- (start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es)))
- return 0;
-
- if (system_blks == NULL)
- return 1;
-
- n = system_blks->root.rb_node;
- while (n) {
- entry = rb_entry(n, struct ext4_system_zone, node);
- if (start_blk + count - 1 < entry->start_blk)
- n = n->rb_left;
- else if (start_blk >= (entry->start_blk + entry->count))
- n = n->rb_right;
- else
- return 0;
- }
- return 1;
-}
-
static int ext4_protect_reserved_inode(struct super_block *sb,
struct ext4_system_blocks *system_blks,
u32 ino)
@@ -214,19 +173,16 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
if (n == 0) {
i++;
} else {
- if (!ext4_data_block_valid_rcu(sbi, system_blks,
- map.m_pblk, n)) {
- err = -EFSCORRUPTED;
- __ext4_error(sb, __func__, __LINE__, -err,
- map.m_pblk, "blocks %llu-%llu "
- "from inode %u overlap system zone",
- map.m_pblk,
- map.m_pblk + map.m_len - 1, ino);
+ err = add_system_zone(system_blks, map.m_pblk, n, ino);
+ if (err < 0) {
+ if (err == -EFSCORRUPTED) {
+ EXT4_ERROR_INODE_ERR(inode, -err,
+ "blocks %llu-%llu from inode overlap system zone",
+ map.m_pblk,
+ map.m_pblk + map.m_len - 1);
+ }
break;
}
- err = add_system_zone(system_blks, map.m_pblk, n);
- if (err < 0)
- break;
i += n;
}
}
@@ -248,7 +204,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu)
*
* The update of system_blks pointer in this function is protected by
* sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
* protected only by RCU. That's why we first build the rbtree and then
* swap it in place.
*/
@@ -259,40 +215,35 @@ int ext4_setup_system_zone(struct super_block *sb)
struct ext4_system_blocks *system_blks;
struct ext4_group_desc *gdp;
ext4_group_t i;
- int flex_size = ext4_flex_bg_size(sbi);
int ret;
- if (!test_opt(sb, BLOCK_VALIDITY)) {
- if (sbi->system_blks)
- ext4_release_system_zone(sb);
- return 0;
- }
- if (sbi->system_blks)
- return 0;
-
system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
if (!system_blks)
return -ENOMEM;
for (i=0; i < ngroups; i++) {
+ unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i);
+
cond_resched();
- if (ext4_bg_has_super(sb, i) &&
- ((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(system_blks,
+ if (meta_blks != 0) {
+ ret = add_system_zone(system_blks,
ext4_group_first_block_no(sb, i),
- ext4_bg_num_gdb(sb, i) + 1);
+ meta_blks, 0);
+ if (ret)
+ goto err;
+ }
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(system_blks,
- ext4_block_bitmap(sb, gdp), 1);
+ ext4_block_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
- ext4_inode_bitmap(sb, gdp), 1);
+ ext4_inode_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group);
+ sbi->s_itb_per_group, 0);
if (ret)
goto err;
}
@@ -305,10 +256,10 @@ int ext4_setup_system_zone(struct super_block *sb)
/*
* System blks rbtree complete, announce it once to prevent racing
- * with ext4_data_block_valid() accessing the rbtree at the same
+ * with ext4_inode_block_valid() accessing the rbtree at the same
* time.
*/
- rcu_assign_pointer(sbi->system_blks, system_blks);
+ rcu_assign_pointer(sbi->s_system_blks, system_blks);
if (test_opt(sb, DEBUG))
debug_print_tree(sbi);
@@ -325,7 +276,7 @@ err:
*
* The update of system_blks pointer in this function is protected by
* sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
* protected only by RCU. So we first clear the system_blks pointer and
* then free the rbtree only after RCU grace period expires.
*/
@@ -333,19 +284,27 @@ void ext4_release_system_zone(struct super_block *sb)
{
struct ext4_system_blocks *system_blks;
- system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+ system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks,
lockdep_is_held(&sb->s_umount));
- rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
+ rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL);
if (system_blks)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
- unsigned int count)
+int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_system_blocks *system_blks;
- int ret;
+ struct ext4_system_zone *entry;
+ struct rb_node *n;
+ int ret = 1;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es)))
+ return 0;
/*
* Lock the system zone to prevent it being released concurrently
@@ -353,29 +312,54 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
* mount option.
*/
rcu_read_lock();
- system_blks = rcu_dereference(sbi->system_blks);
- ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
- count);
+ system_blks = rcu_dereference(sbi->s_system_blks);
+ if (system_blks == NULL)
+ goto out_rcu;
+
+ n = system_blks->root.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ ret = 0;
+ if (inode)
+ ret = (entry->ino == inode->i_ino);
+ break;
+ }
+ }
+out_rcu:
rcu_read_unlock();
return ret;
}
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
+}
+
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
if (blk &&
- unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- blk, 1))) {
+ unlikely(!ext4_inode_block_valid(inode, blk, 1))) {
ext4_error_inode(inode, function, line, blk,
"invalid block");
return -EFSCORRUPTED;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..cf0a0970c095
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/quotaops.h>
+#include <linux/uuid.h>
+
+#include "ext4.h"
+#include "xattr.h"
+#include "ext4_jbd2.h"
+
+static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
+ const struct fscrypt_name *src)
+{
+ memset(dst, 0, sizeof(*dst));
+
+ dst->usr_fname = src->usr_fname;
+ dst->disk_name = src->disk_name;
+ dst->hinfo.hash = src->hash;
+ dst->hinfo.minor_hash = src->minor_hash;
+ dst->crypto_buf = src->crypto_buf;
+}
+
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_setup_filename(dir, iname, lookup, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+ err = ext4_fname_setup_ci_filename(dir, iname, fname);
+ if (err)
+ ext4_fname_free_filename(fname);
+
+ return err;
+}
+
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_prepare_lookup(dir, dentry, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+ err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
+ if (err)
+ ext4_fname_free_filename(fname);
+ return err;
+}
+
+void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+
+ name.crypto_buf = fname->crypto_buf;
+ fscrypt_free_filename(&name);
+
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+
+ ext4_fname_free_ci_filename(fname);
+}
+
+static bool uuid_is_zero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return false;
+ return true;
+}
+
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg)
+{
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, err2;
+ handle_t *handle;
+
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+
+ if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto pwsalt_err_exit;
+ }
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto pwsalt_err_journal;
+ lock_buffer(sbi->s_sbh);
+ generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+pwsalt_err_journal:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
+ err = err2;
+pwsalt_err_exit:
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
+}
+
+static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ handle_t *handle = fs_data;
+ int res, res2, credits, retries = 0;
+
+ /*
+ * Encrypting the root directory is not allowed because e2fsck expects
+ * lost+found to exist and be unencrypted, and encrypting the root
+ * directory would imply encrypting the lost+found directory as well as
+ * the filename "lost+found" itself.
+ */
+ if (inode->i_ino == EXT4_ROOT_INO)
+ return -EPERM;
+
+ if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
+ return -EINVAL;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
+ res = ext4_convert_inline_data(inode);
+ if (res)
+ return res;
+
+ /*
+ * If a journal handle was specified, then the encryption context is
+ * being set on a new inode via inheritance and is part of a larger
+ * transaction to create the inode. Otherwise the encryption context is
+ * being set on an existing inode in its own transaction. Only in the
+ * latter case should the "retry on ENOSPC" logic be used.
+ */
+
+ if (handle) {
+ res = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ }
+ return res;
+ }
+
+ res = dquot_initialize(inode);
+ if (res)
+ return res;
+retry:
+ res = ext4_xattr_set_credits(inode, len, false /* is_create */,
+ &credits);
+ if (res)
+ return res;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ res = ext4_mark_inode_dirty(handle, inode);
+ if (res)
+ EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+ }
+ res2 = ext4_journal_stop(handle);
+
+ if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ if (!res)
+ res = res2;
+ return res;
+}
+
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
+{
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
+}
+
+static bool ext4_has_stable_inodes(struct super_block *sb)
+{
+ return ext4_has_feature_stable_inodes(sb);
+}
+
+const struct fscrypt_operations ext4_cryptops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "ext4:",
+ .get_context = ext4_get_context,
+ .set_context = ext4_set_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
+ .empty_dir = ext4_empty_dir,
+ .has_stable_inodes = ext4_has_stable_inodes,
+};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c654205f648d..256fe2c1d4c1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode)
return 0;
}
+static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de)
+{
+ /* Check if . or .. , or skip if namelen is 0 */
+ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') &&
+ (de->name[1] == '.' || de->name[1] == '\0'))
+ return true;
+ /* Check if this is a csum entry */
+ if (de->file_type == EXT4_FT_DIR_CSUM)
+ return true;
+ return false;
+}
+
/*
* Return 0 if the directory entry is OK, and 1 if there is a problem
*
@@ -73,36 +85,43 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
const int next_offset = ((char *) de - buf) + rlen;
+ bool fake = is_fake_dir_entry(de);
+ bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);
- if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
+ if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
+ else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
+ fake ? NULL : dir)))
error_msg = "rec_len is too small for name_len";
else if (unlikely(next_offset > size))
error_msg = "directory entry overrun";
- else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
+ else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
+ has_csum ? NULL : dir) &&
next_offset != size))
error_msg = "directory entry too close to block end";
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else if (unlikely(next_offset == size && de->name_len == 1 &&
+ de->name[0] == '.'))
+ error_msg = "'.' directory cannot be the last in data block";
else
return 0;
if (filp)
ext4_error_file(filp, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u, "
- "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ "inode=%u, rec_len=%d, size=%d fake=%d",
error_msg, offset, le32_to_cpu(de->inode),
- rlen, de->name_len, size);
+ rlen, size, fake);
else
ext4_error_inode(dir, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u, "
- "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ "inode=%u, rec_len=%d, size=%d fake=%d",
error_msg, offset, le32_to_cpu(de->inode),
- rlen, de->name_len, size);
+ rlen, size, fake);
return 1;
}
@@ -117,20 +136,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct super_block *sb = inode->i_sb;
struct buffer_head *bh = NULL;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
+ struct dir_private_info *info = file->private_data;
- if (IS_ENCRYPTED(inode)) {
- err = fscrypt_get_encryption_info(inode);
- if (err)
- return err;
- }
+ err = fscrypt_prepare_readdir(inode);
+ if (err)
+ return err;
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
- if (err != ERR_BAD_DX_DIR) {
+ if (err != ERR_BAD_DX_DIR)
return err;
- }
+
/* Can we just clear INDEX flag to ignore htree information? */
- if (!ext4_has_metadata_csum(sb)) {
+ if (!ext4_has_feature_metadata_csum(sb)) {
/*
* We don't set the inode dirty flag since it's not
* critical that it gets flushed back to the disk.
@@ -148,7 +166,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
}
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
@@ -174,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
continue;
}
if (err > 0) {
- pgoff_t index = map.m_pblk >>
- (PAGE_SHIFT - inode->i_blkbits);
+ pgoff_t index = map.m_pblk << inode->i_blkbits >>
+ PAGE_SHIFT;
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
- sb->s_bdev->bd_inode->i_mapping,
- &file->f_ra, file,
- index, 1);
+ sb->s_bdev->bd_mapping,
+ &file->f_ra, file, index,
+ 1 << EXT4_SB(sb)->s_min_folio_order);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
if (IS_ERR(bh)) {
@@ -215,7 +233,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (!inode_eq_iversion(inode, file->f_version)) {
+ if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -226,7 +244,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
- sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
+ sb->s_blocksize) < ext4_dir_rec_len(1,
+ inode))
break;
i += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
@@ -234,7 +253,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < inode->i_size
@@ -264,10 +283,20 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct fscrypt_str de_name =
FSTR_INIT(de->name,
de->name_len);
+ u32 hash;
+ u32 minor_hash;
+
+ if (IS_CASEFOLDED(inode)) {
+ hash = EXT4_DIRENT_HASH(de);
+ minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ hash = 0;
+ minor_hash = 0;
+ }
/* Directory is encrypted */
err = fscrypt_fname_disk_to_usr(inode,
- 0, 0, &de_name, &fstr);
+ hash, minor_hash, &de_name, &fstr);
de_name = fstr;
fstr.len = save_len;
if (err)
@@ -286,7 +315,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
goto done;
brelse(bh);
bh = NULL;
- offset = 0;
}
done:
err = 0;
@@ -368,6 +396,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
+ struct dir_private_info *info = file->private_data;
int dx_dir = is_dx_dir(inode);
loff_t ret, htree_max = ext4_get_htree_eof(file);
@@ -376,7 +405,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
htree_max, htree_max);
else
ret = ext4_llseek(file, offset, whence);
- file->f_version = inode_peek_iversion(inode) - 1;
+ info->cookie = inode_peek_iversion(inode) - 1;
return ret;
}
@@ -392,11 +421,11 @@ struct fname {
__u32 inode;
__u8 name_len;
__u8 file_type;
- char name[];
+ char name[] __counted_by(name_len);
};
/*
- * This functoin implements a non-recursive way of freeing all of the
+ * This function implements a non-recursive way of freeing all of the
* nodes in the red-black tree.
*/
static void free_rb_tree_fname(struct rb_root *root)
@@ -413,18 +442,15 @@ static void free_rb_tree_fname(struct rb_root *root)
*root = RB_ROOT;
}
-
-static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
- loff_t pos)
+static void ext4_htree_init_dir_info(struct file *filp, loff_t pos)
{
- struct dir_private_info *p;
-
- p = kzalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- return NULL;
- p->curr_hash = pos2maj_hash(filp, pos);
- p->curr_minor_hash = pos2min_hash(filp, pos);
- return p;
+ struct dir_private_info *p = filp->private_data;
+
+ if (is_dx_dir(file_inode(filp)) && !p->initialized) {
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
+ p->initialized = true;
+ }
}
void ext4_htree_free_dir_info(struct dir_private_info *p)
@@ -448,14 +474,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
struct dir_private_info *info;
- int len;
info = dir_file->private_data;
p = &info->root.rb_node;
/* Create and allocate the fname structure */
- len = sizeof(struct fname) + ent_name->len + 1;
- new_fn = kzalloc(len, GFP_KERNEL);
+ new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
+ GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
@@ -499,7 +524,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
/*
* This is a helper function for ext4_dx_readdir. It calls filldir
- * for all entres on the fname linked list. (Normally there is only
+ * for all entries on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
static int call_filldir(struct file *file, struct dir_context *ctx,
@@ -534,14 +559,9 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
struct dir_private_info *info = file->private_data;
struct inode *inode = file_inode(file);
struct fname *fname;
- int ret;
+ int ret = 0;
- if (!info) {
- info = ext4_htree_create_dir_info(file, ctx->pos);
- if (!info)
- return -ENOMEM;
- file->private_data = info;
- }
+ ext4_htree_init_dir_info(file, ctx->pos);
if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
@@ -574,15 +594,15 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
* cached entries.
*/
if ((!info->curr_node) ||
- !inode_eq_iversion(inode, file->f_version)) {
+ !inode_eq_iversion(inode, info->cookie)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
- return ret;
+ goto finished;
if (ret == 0) {
ctx->pos = ext4_get_htree_eof(file);
break;
@@ -613,14 +633,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
}
finished:
info->last_pos = ctx->pos;
- return 0;
-}
-
-static int ext4_dir_open(struct inode * inode, struct file * filp)
-{
- if (IS_ENCRYPTED(inode))
- return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
- return 0;
+ return ret < 0 ? ret : 0;
}
static int ext4_release_dir(struct inode *inode, struct file *filp)
@@ -639,7 +652,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size;
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -655,7 +668,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
return 0;
}
+static int ext4_dir_open(struct inode *inode, struct file *file)
+{
+ struct dir_private_info *info;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+ file->private_data = info;
+ return 0;
+}
+
const struct file_operations ext4_dir_operations = {
+ .open = ext4_dir_open,
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext4_readdir,
@@ -664,57 +689,5 @@ const struct file_operations ext4_dir_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
- .open = ext4_dir_open,
.release = ext4_release_dir,
};
-
-#ifdef CONFIG_UNICODE
-static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
-{
- struct qstr qstr = {.name = str, .len = len };
- const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *inode = READ_ONCE(parent->d_inode);
-
- if (!inode || !IS_CASEFOLDED(inode) ||
- !EXT4_SB(inode->i_sb)->s_encoding) {
- if (len != name->len)
- return -1;
- return memcmp(str, name->name, len);
- }
-
- return ext4_ci_compare(inode, name, &qstr, false);
-}
-
-static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
-{
- const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
- const struct unicode_map *um = sbi->s_encoding;
- const struct inode *inode = READ_ONCE(dentry->d_inode);
- unsigned char *norm;
- int len, ret = 0;
-
- if (!inode || !IS_CASEFOLDED(inode) || !um)
- return 0;
-
- norm = kmalloc(PATH_MAX, GFP_ATOMIC);
- if (!norm)
- return -ENOMEM;
-
- len = utf8_casefold(um, str, norm, PATH_MAX);
- if (len < 0) {
- if (ext4_has_strict_mode(sbi))
- ret = -EINVAL;
- goto out;
- }
- str->hash = full_name_hash(dentry, norm, len);
-out:
- kfree(norm);
- return ret;
-}
-
-const struct dentry_operations ext4_dentry_ops = {
- .d_hash = ext4_d_hash,
- .d_compare = ext4_d_compare,
-};
-#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b08841f70b69..56112f201cac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -17,6 +17,7 @@
#ifndef _EXT4_H
#define _EXT4_H
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
@@ -27,19 +28,19 @@
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
-#include <linux/version.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
-#include <crypto/hash.h>
+#include <linux/crc32c.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
+#include <uapi/linux/ext4.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
@@ -99,6 +100,16 @@
#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+#define ASSERT(assert) \
+do { \
+ if (unlikely(!(assert))) { \
+ printk(KERN_EMERG \
+ "Assertion failure in %s() at %s:%d: '%s'\n", \
+ __func__, __FILE__, __LINE__, #assert); \
+ BUG(); \
+ } \
+} while (0)
+
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
@@ -117,6 +128,55 @@ enum SHIFT_DIRECTION {
};
/*
+ * For each criteria, mballoc has slightly different way of finding
+ * the required blocks nad usually, higher the criteria the slower the
+ * allocation. We start at lower criterias and keep falling back to
+ * higher ones if we are not able to find any blocks. Lower (earlier)
+ * criteria are faster.
+ */
+enum criteria {
+ /*
+ * Used when number of blocks needed is a power of 2. This
+ * doesn't trigger any disk IO except prefetch and is the
+ * fastest criteria.
+ */
+ CR_POWER2_ALIGNED,
+
+ /*
+ * Tries to lookup in-memory data structures to find the most
+ * suitable group that satisfies goal request. No disk IO
+ * except block prefetch.
+ */
+ CR_GOAL_LEN_FAST,
+
+ /*
+ * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal
+ * length to the best available length for faster allocation.
+ */
+ CR_BEST_AVAIL_LEN,
+
+ /*
+ * Reads each block group sequentially, performing disk IO if
+ * necessary, to find suitable block group. Tries to
+ * allocate goal length but might trim the request if nothing
+ * is found after enough tries.
+ */
+ CR_GOAL_LEN_SLOW,
+
+ /*
+ * Finds the first free set of blocks and allocates
+ * those. This is only used in rare cases when
+ * CR_GOAL_LEN_SLOW also fails to allocate anything.
+ */
+ CR_ANY_FREE,
+
+ /*
+ * Number of criterias defined.
+ */
+ EXT4_MB_NUM_CRS
+};
+
+/*
* Flags used in mballoc's allocation_context flags field.
*
* Also used to show what's going on for debugging purposes when the
@@ -125,14 +185,8 @@ enum SHIFT_DIRECTION {
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
-/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 0x0002
-/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST 0x0008
-/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
@@ -186,14 +240,27 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED BIT(BH_Delay)
+/*
+ * This is for use in ext4_map_query_blocks() for a special case where we can
+ * have a physically and logically contiguous blocks split across two leaf
+ * nodes instead of a single extent. This is required in case of atomic writes
+ * to know whether the returned extent is last in leaf. If yes, then lookup for
+ * next in leaf block in ext4_map_query_blocks_next_in_leaf().
+ * - This is never going to be added to any buffer head state.
+ * - We use the next available bit after BH_BITMAP_UPTODATE.
+ */
+#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
ext4_lblk_t m_lblk;
unsigned int m_len;
unsigned int m_flags;
+ u64 m_seq;
};
/*
@@ -207,7 +274,10 @@ struct ext4_system_blocks {
/*
* Flags for ext4_io_end->flags
*/
-#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_FAILED 0x0002
+
+#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)
struct ext4_io_end_vec {
struct list_head list; /* list of io_end_vec */
@@ -227,7 +297,7 @@ typedef struct ext4_io_end {
struct bio *bio; /* Linked list of completed
* bios covering the extent */
unsigned int flag; /* unwritten or not */
- atomic_t count; /* reference counter */
+ refcount_t count; /* reference counter */
struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
@@ -296,7 +366,16 @@ struct ext4_io_submit {
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
blkbits))
-
+#define EXT4_B_TO_LBLK(inode, offset) \
+ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
+#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits)
+
+/* Translate a block number to a page index */
+#define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \
+ PAGE_SHIFT)
+/* Translate a page index to a block number */
+#define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \
+ (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
/* Translate a cluster number to a block number */
@@ -426,28 +505,50 @@ struct flex_groups {
#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
+
+#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */
+
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
-
-/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
-#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
+/* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \
+ EXT4_UNRM_FL | \
+ EXT4_COMPR_FL | \
+ EXT4_SYNC_FL | \
EXT4_IMMUTABLE_FL | \
EXT4_APPEND_FL | \
EXT4_NODUMP_FL | \
EXT4_NOATIME_FL | \
- EXT4_PROJINHERIT_FL)
+ EXT4_JOURNAL_DATA_FL | \
+ EXT4_NOTAIL_FL | \
+ EXT4_DIRSYNC_FL | \
+ EXT4_TOPDIR_FL | \
+ EXT4_EXTENTS_FL | \
+ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
+ EXT4_DAX_FL | \
+ EXT4_PROJINHERIT_FL | \
+ EXT4_CASEFOLD_FL)
+
+/* User visible flags */
+#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \
+ EXT4_DIRTY_FL | \
+ EXT4_COMPRBLK_FL | \
+ EXT4_NOCOMPR_FL | \
+ EXT4_ENCRYPT_FL | \
+ EXT4_INDEX_FL | \
+ EXT4_VERITY_FL | \
+ EXT4_INLINE_DATA_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
- EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
+ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
+ EXT4_DAX_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
@@ -459,6 +560,10 @@ struct flex_groups {
/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+/* Flags which are mutually exclusive to DAX */
+#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
+ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -499,6 +604,7 @@ enum {
EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
+ EXT4_INODE_DAX = 25, /* Inode is DAX */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */
@@ -518,7 +624,7 @@ enum {
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
static inline void ext4_check_flag_values(void)
@@ -551,17 +657,6 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(RESERVED);
}
-/* Used to pass group descriptor data when online resize is done */
-struct ext4_new_group_input {
- __u32 group; /* Group number for this data */
- __u64 block_bitmap; /* Absolute block number of block bitmap */
- __u64 inode_bitmap; /* Absolute block number of inode bitmap */
- __u64 inode_table; /* Absolute block number of inode table start */
- __u32 blocks_count; /* Total number of blocks in this group */
- __u16 reserved_blocks; /* Number of reserved blocks in this group */
- __u16 unused;
-};
-
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
struct compat_ext4_new_group_input {
u32 group;
@@ -607,16 +702,22 @@ enum {
/* Caller is from the delayed allocation writeout path
* finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
- /* caller is from the direct IO path, request to creation of an
- unwritten extents if not allocated, split the unwritten
- extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_PRE_IO 0x0008
-#define EXT4_GET_BLOCKS_CONVERT 0x0010
-#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
- EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
- /* Convert extent to initialized after IO complete */
-#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ /*
+ * This means that we cannot merge newly allocated extents, and if we
+ * found an unwritten extent, we need to split it.
+ */
+#define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008
+ /*
+ * Caller is from the dio or dioread_nolock buffered IO, reqest to
+ * create an unwritten extent if it does not exist or split the
+ * found unwritten extent. Also do not merge the newly created
+ * unwritten extent, io end will convert unwritten to written,
+ * and try to merge the written extent.
+ */
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+ /* Convert unwritten extent to initialized. */
+#define EXT4_GET_BLOCKS_CONVERT 0x0010
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -628,9 +729,23 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
- /* Caller will submit data before dropping transaction handle. This
- * allows jbd2 to avoid submitting data before commit. */
+ /* Caller is in the context of data submission, such as writeback,
+ * fsync, etc. Especially, in the generic writeback path, caller will
+ * submit data before dropping transaction handle. This allows jbd2
+ * to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
+ EXT4_GET_BLOCKS_IO_SUBMIT)
+ /* Caller is in the atomic contex, find extent if it has been cached */
+#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
+/*
+ * Atomic write caller needs this to query in the slow path of mixed mapping
+ * case, when a contiguous extent can be split across two adjacent leaf nodes.
+ * Look EXT4_MAP_QUERY_LAST_IN_LEAF.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000
/*
* The bit position of these flags must not overlap with any of the
@@ -644,6 +759,13 @@ enum {
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
#define EXT4_EX_NOFAIL 0x10000000
+/*
+ * ext4_map_query_blocks() uses this filter mask to filter the flags needed to
+ * pass while lookup/querying of on disk extent tree.
+ */
+#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\
+ EXT4_EX_NOFAIL |\
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
/*
* Flags used by ext4_free_blocks
@@ -656,64 +778,10 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040
-/*
- * ioctl commands
- */
-#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
-#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
-#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
-#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
-#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
-#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
-#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
-#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
-#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
-#define EXT4_IOC_MIGRATE _IO('f', 9)
- /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
- /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
-#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
-#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
-#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
-#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
-#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
-#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
-#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
-#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
-/* ioctl codes 19--39 are reserved for fscrypt */
-#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
-#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
-#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
-
-#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
-#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
-
-#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
-
-/*
- * Flags for going down operation
- */
-#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */
-#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
-#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
-
-/*
- * Flags returned by EXT4_IOC_GETSTATE
- *
- * We only expose to userspace a subset of the state flags in
- * i_state_flags
- */
-#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001
-#define EXT4_STATE_FLAG_NEW 0x00000002
-#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
-#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
-
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
-#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
@@ -724,12 +792,6 @@ enum {
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
#endif
-/*
- * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
- * It indicates that the entry in extent status cache is for a hole.
- */
-#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000
-
/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
@@ -800,15 +862,6 @@ struct ext4_inode {
__le32 i_projid; /* Project ID */
};
-struct move_extent {
- __u32 reserved; /* should be zero */
- __u32 donor_fd; /* donor file descriptor */
- __u64 orig_start; /* logical start offset in block for orig */
- __u64 donor_start; /* logical start offset in block for donor */
- __u64 len; /* block length to be moved */
- __u64 moved_len; /* moved block length */
-};
-
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS)
@@ -850,64 +903,80 @@ struct move_extent {
* affected filesystem before 2242.
*/
-static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
+static inline __le32 ext4_encode_extra_time(struct timespec64 ts)
{
- u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK;
- return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
+ u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;
+ return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS));
}
-static inline void ext4_decode_extra_time(struct timespec64 *time,
- __le32 extra)
+static inline struct timespec64 ext4_decode_extra_time(__le32 base,
+ __le32 extra)
{
+ struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) };
+
if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
- time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
+ ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ return ts;
}
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \
do { \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
- (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(inode)->xtime); \
- } \
- else \
- (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
+ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
+ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \
+ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \
+ } else \
+ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(einode)->xtime); \
+#define EXT4_INODE_SET_ATIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))
+
+#define EXT4_INODE_SET_MTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \
+ raw_inode, (einode)->xtime)
+
+#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \
+ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \
+ ext4_decode_extra_time((raw_inode)->xtime, \
+ (raw_inode)->xtime ## _extra) : \
+ (struct timespec64) { \
+ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
+ })
+
+#define EXT4_INODE_GET_ATIME(inode, raw_inode) \
+do { \
+ inode_set_atime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \
} while (0)
-#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_GET_MTIME(inode, raw_inode) \
do { \
- (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
- ext4_decode_extra_time(&(inode)->xtime, \
- raw_inode->xtime ## _extra); \
- } \
- else \
- (inode)->xtime.tv_nsec = 0; \
+ inode_set_mtime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \
} while (0)
+#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
+do { \
+ inode_set_ctime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \
+} while (0)
-#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (einode)->xtime.tv_sec = \
- (signed)le32_to_cpu((raw_inode)->xtime); \
- else \
- (einode)->xtime.tv_sec = 0; \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- ext4_decode_extra_time(&(einode)->xtime, \
- raw_inode->xtime ## _extra); \
- else \
- (einode)->xtime.tv_nsec = 0; \
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
+do { \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ (einode)->xtime = \
+ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \
+ raw_inode); \
+ else \
+ (einode)->xtime = (struct timespec64){0, 0}; \
} while (0)
#define i_disk_version osd1.linux1.l_i_version
@@ -938,6 +1007,7 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
#include "extents_status.h"
+#include "fast_commit.h"
/*
* Lock subclasses for i_data_sem in the ext4_inode_info structure.
@@ -954,11 +1024,13 @@ do { \
* where the second inode has larger inode number
* than the first
* I_DATA_SEM_QUOTA - Used for quota inodes only
+ * I_DATA_SEM_EA - Used for ea_inodes only
*/
enum {
I_DATA_SEM_NORMAL = 0,
I_DATA_SEM_OTHER,
I_DATA_SEM_QUOTA,
+ I_DATA_SEM_EA
};
@@ -986,14 +1058,47 @@ struct ext4_inode_info {
/*
* Extended attributes can be read independently of the main file
- * data. Taking i_mutex even when reading would cause contention
+ * data. Taking i_rwsem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
- struct list_head i_orphan; /* unlinked but open inodes */
+ /*
+ * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
+ * i_orphan is used.
+ */
+ union {
+ struct list_head i_orphan; /* unlinked but open inodes */
+ unsigned int i_orphan_idx; /* Index in orphan file */
+ };
+
+ /* Fast commit related info */
+
+ /* For tracking dentry create updates */
+ struct list_head i_fc_dilist;
+ struct list_head i_fc_list; /*
+ * inodes that need fast commit
+ * protected by sbi->s_fc_lock.
+ */
+
+ /* Start of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_start;
+
+ /* End of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_len;
+
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
+
+ /* Fast commit wait queue for this inode */
+ wait_queue_head_t i_fc_wait;
+
+ /*
+ * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+ * and inode's EXT4_FC_STATE_COMMITTING state bit.
+ */
+ spinlock_t i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
@@ -1023,20 +1128,9 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
- /*
- * i_mmap_sem is for serializing page faults with truncate / punch hole
- * operations. We have to make sure that new page cannot be faulted in
- * a section of the inode that is being punched. We cannot easily use
- * i_data_sem for this since we need protection for the whole punch
- * operation and i_data_sem ranks below transaction start so we have
- * to occasionally drop it.
- */
- struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
- spinlock_t i_raw_lock; /* protects updates to the raw inode */
-
/*
* File creation time. Its function is same as that of
* struct timespec64 i_{a,c,m}time in the generic inode.
@@ -1044,8 +1138,13 @@ struct ext4_inode_info {
struct timespec64 i_crtime;
/* mballoc */
- struct list_head i_prealloc_list;
- spinlock_t i_prealloc_lock;
+ atomic_t i_prealloc_active;
+
+ /* allocation reservation info for delalloc */
+ /* In case of bigalloc, this refer to clusters rather than blocks */
+ unsigned int i_reserved_data_blocks;
+ struct rb_root i_prealloc_node;
+ rwlock_t i_prealloc_lock;
/* extents status tree */
struct ext4_es_tree i_es_tree;
@@ -1056,14 +1155,12 @@ struct ext4_inode_info {
ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
extents to shrink. Protected by
i_es_lock */
+ u64 i_es_seq; /* Change counter for extents.
+ Protected by i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
- /* allocation reservation info for delalloc */
- /* In case of bigalloc, this refer to clusters rather than blocks */
- unsigned int i_reserved_data_blocks;
-
/* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree;
@@ -1078,6 +1175,7 @@ struct ext4_inode_info {
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
#endif
+ spinlock_t i_block_reservation_lock;
/* Lock protecting lists below */
spinlock_t i_completed_io_lock;
@@ -1087,9 +1185,6 @@ struct ext4_inode_info {
*/
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
-
- spinlock_t i_block_reservation_lock;
/*
* Transactions that contain inode's metadata needed to complete
@@ -1099,13 +1194,21 @@ struct ext4_inode_info {
tid_t i_datasync_tid;
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
kprojid_t i_projid;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
};
/*
@@ -1114,6 +1217,7 @@ struct ext4_inode_info {
#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
#define EXT4_ERROR_FS 0x0002 /* Errors detected */
#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
+#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */
/*
* Misc. filesystem flags
@@ -1135,9 +1239,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
-#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */
#else
-#define EXT4_MOUNT_DAX 0
+#define EXT4_MOUNT_DAX_ALWAYS 0
#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
@@ -1162,6 +1266,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
+#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1180,10 +1285,17 @@ struct ext4_inode_info {
blocks */
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
-
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
+#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
+#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
+#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
+ * scanning in mballoc
+ */
+#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1200,15 +1312,13 @@ struct ext4_inode_info {
#define ext4_test_and_set_bit __test_and_set_bit_le
#define ext4_set_bit __set_bit_le
-#define ext4_set_bit_atomic ext2_set_bit_atomic
#define ext4_test_and_clear_bit __test_and_clear_bit_le
#define ext4_clear_bit __clear_bit_le
-#define ext4_clear_bit_atomic ext2_clear_bit_atomic
#define ext4_test_bit test_bit_le
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
-extern void ext4_set_bits(void *bm, int cur, int len);
+extern void mb_set_bits(void *bm, int cur, int len);
/*
* Maximal mount counts between two filesystem checks
@@ -1227,6 +1337,8 @@ extern void ext4_set_bits(void *bm, int cur, int len);
/* Metadata checksum algorithm codes */
#define EXT4_CRC32C_CHKSUM 1
+#define EXT4_LABEL_MAX 16
+
/*
* Structure of the super block
*/
@@ -1276,7 +1388,7 @@ struct ext4_super_block {
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
-/*78*/ char s_volume_name[16]; /* volume name */
+/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
@@ -1301,7 +1413,7 @@ struct ext4_super_block {
__le32 s_first_meta_bg; /* First metablock block group */
__le32 s_mkfs_time; /* When the filesystem was created */
__le32 s_jnl_blocks[17]; /* Backup of the journal inode */
- /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+ /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */
/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
__le32 s_r_blocks_count_hi; /* Reserved blocks count */
__le32 s_free_blocks_count_hi; /* Free blocks count */
@@ -1356,7 +1468,10 @@ struct ext4_super_block {
__u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
- __le32 s_reserved[95]; /* Padding to the end of the block */
+ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
+ __le16 s_def_resuid_hi;
+ __le16 s_def_resgid_hi;
+ __le32 s_reserved[93]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1364,30 +1479,58 @@ struct ext4_super_block {
#ifdef __KERNEL__
-/*
- * run-time mount flags
- */
-#define EXT4_MF_MNTDIR_SAMPLED 0x0001
-#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
-
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL)
-#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
-#endif
-
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
#define EXT4_ENC_UTF8_12_1 1
+/* Types of ext4 journal triggers */
+enum ext4_journal_trigger_type {
+ EXT4_JTR_ORPHAN_FILE,
+ EXT4_JTR_NONE /* This must be the last entry for indexing to work! */
+};
+
+#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE
+
+struct ext4_journal_trigger {
+ struct jbd2_buffer_trigger_type tr_triggers;
+ struct super_block *sb;
+};
+
+static inline struct ext4_journal_trigger *EXT4_TRIGGER(
+ struct jbd2_buffer_trigger_type *trigger)
+{
+ return container_of(trigger, struct ext4_journal_trigger, tr_triggers);
+}
+
+#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04
+
+/* Structure at the tail of orphan block */
+struct ext4_orphan_block_tail {
+ __le32 ob_magic;
+ __le32 ob_checksum;
+};
+
+static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
+{
+ return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) /
+ sizeof(u32);
+}
+
+struct ext4_orphan_block {
+ atomic_t ob_free_entries; /* Number of free orphan entries in block */
+ struct buffer_head *ob_bh; /* Buffer for orphan block */
+};
+
/*
- * Flags for ext4_sb_info.s_encoding_flags.
+ * Info about orphan file.
*/
-#define EXT4_ENC_STRICT_MODE_FL (1 << 0)
-
-#define ext4_has_strict_mode(sbi) \
- (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
+struct ext4_orphan_info {
+ int of_blocks; /* Number of orphan blocks in a file */
+ __u32 of_csum_seed; /* Checksum seed for orphan file */
+ struct ext4_orphan_block *of_binfo; /* Array with info about orphan
+ * file blocks */
+};
/*
* fourth extended-fs super-block data in memory
@@ -1409,11 +1552,13 @@ struct ext4_sb_info {
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ /* Array of bh's for the block group descriptors */
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
- unsigned int s_mount_flags;
+ unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
+ unsigned int s_def_mount_opt2;
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
kuid_t s_resuid;
@@ -1428,37 +1573,37 @@ struct ext4_sb_info {
unsigned int s_inode_goal;
u32 s_hash_seed[4];
int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
+ int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
struct percpu_counter s_freeclusters_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct percpu_counter s_dirtyclusters_counter;
+ struct percpu_counter s_sra_exceeded_retry_limit;
struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
struct completion s_kobj_unregister;
struct super_block *s_sb;
-#ifdef CONFIG_UNICODE
- struct unicode_map *s_encoding;
- __u16 s_encoding_flags;
-#endif
+ struct buffer_head *s_mmp_bh;
/* Journaling */
struct journal_s *s_journal;
- struct list_head s_orphan;
- struct mutex s_orphan_lock;
unsigned long s_ext4_flags; /* Ext4 superblock flags */
+ struct mutex s_orphan_lock; /* Protects on disk list changes */
+ struct list_head s_orphan; /* List of orphaned inodes in on disk
+ list */
+ struct ext4_orphan_info s_orphan_info;
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *journal_bdev;
+ struct file *s_journal_bdev_file;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct ext4_system_blocks __rcu *system_blks;
+ struct ext4_system_blocks __rcu *s_system_blks;
#ifdef EXTENTS_STATS
/* ext4 extents stats */
@@ -1477,12 +1622,18 @@ struct ext4_sb_info {
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
- unsigned int s_mb_free_pending;
- struct list_head s_freed_data_list; /* List of blocks to be freed
+ atomic_t s_mb_free_pending;
+ struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
+ struct list_head s_discard_list;
+ struct work_struct s_discard_work;
+ atomic_t s_retry_alloc_pending;
+ struct xarray *s_mb_avg_fragment_size;
+ struct xarray *s_mb_largest_free_orders;
/* tunables */
unsigned long s_stripe;
+ unsigned int s_mb_max_linear_groups;
unsigned int s_mb_stream_request;
unsigned int s_mb_max_to_scan;
unsigned int s_mb_min_to_scan;
@@ -1490,21 +1641,33 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;
+ unsigned int s_mb_best_avail_max_trim_order;
+ unsigned int s_sb_update_sec;
+ unsigned int s_sb_update_kb;
+
/* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
+ ext4_group_t *s_mb_last_groups;
+ unsigned int s_mb_nr_global_goals;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
atomic_t s_bal_allocated; /* in blocks */
atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
+ atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */
+ atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- spinlock_t s_bal_lock;
- unsigned long s_mb_buddies_generated;
- unsigned long long s_mb_generation_time;
+ atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
+ atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
+ atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
+ atomic_t s_mb_buddies_generated; /* number of buddies generated */
+ atomic64_t s_mb_generation_time;
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
@@ -1539,16 +1702,18 @@ struct ext4_sb_info {
struct task_struct *s_mmp_tsk;
/* record the last minlen when FITRIM is called. */
- atomic_t s_last_trim_minblks;
+ unsigned long s_last_trim_minblks;
- /* Reference to checksum algorithm driver via cryptoapi */
- struct crypto_shash *s_chksum_driver;
+ /* minimum folio order of a page cache allocation */
+ u16 s_min_folio_order;
+ /* supported maximum folio order, 0 means not supported */
+ u16 s_max_folio_order;
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
/* Reclaim extents from extent status tree */
- struct shrinker s_es_shrinker;
+ struct shrinker *s_es_shrinker;
struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
@@ -1556,23 +1721,89 @@ struct ext4_sb_info {
struct mb_cache *s_ea_inode_cache;
spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+ /* Journal triggers for checksum computation */
+ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT];
+
/* Ratelimit ext4 messages. */
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+ atomic_t s_warning_count;
+ atomic_t s_msg_count;
- /* Encryption context for '-o test_dummy_encryption' */
- struct fscrypt_dummy_context s_dummy_enc_ctx;
+ /* Encryption policy for '-o test_dummy_encryption' */
+ struct fscrypt_dummy_policy s_dummy_enc_policy;
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
- * or EXTENTS flag.
+ * or EXTENTS flag or between writepages ops and changing DELALLOC or
+ * DIOREAD_NOLOCK mount options on remount.
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
+ u64 s_dax_part_off;
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
#endif
+ /* Record the errseq of the backing block device */
+ errseq_t s_bdev_wb_err;
+ spinlock_t s_bdev_wb_lock;
+
+ /* Information about errors that happened during this mount */
+ spinlock_t s_error_lock;
+ int s_add_error_count;
+ int s_first_error_code;
+ __u32 s_first_error_line;
+ __u32 s_first_error_ino;
+ __u64 s_first_error_block;
+ const char *s_first_error_func;
+ time64_t s_first_error_time;
+ int s_last_error_code;
+ __u32 s_last_error_line;
+ __u32 s_last_error_ino;
+ __u64 s_last_error_block;
+ const char *s_last_error_func;
+ time64_t s_last_error_time;
+ /*
+ * If we are in a context where we cannot update the on-disk
+ * superblock, we queue the work here. This is used to update
+ * the error information in the superblock, and for periodic
+ * updates of the superblock called from the commit callback
+ * function.
+ */
+ struct work_struct s_sb_upd_work;
+
+ /* Atomic write unit values in bytes */
+ unsigned int s_awu_min;
+ unsigned int s_awu_max;
+
+ /* Ext4 fast commit sub transaction ID */
+ atomic_t s_fc_subtid;
+
+ /*
+ * After commit starts, the main queue gets locked, and the further
+ * updates get added in the staging queue.
+ */
+#define FC_Q_MAIN 0
+#define FC_Q_STAGING 1
+ struct list_head s_fc_q[2]; /* Inodes staged for fast commit
+ * that have data changes in them.
+ */
+ struct list_head s_fc_dentry_q[2]; /* directory entry updates */
+ unsigned int s_fc_bytes;
+ /*
+ * Main fast commit lock. This lock protects accesses to the
+ * following fields:
+ * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
+ */
+ struct mutex s_fc_lock;
+ struct buffer_head *s_fc_bh;
+ struct ext4_fc_stats s_fc_stats;
+ tid_t s_fc_ineligible_tid;
+#ifdef CONFIG_EXT4_DEBUG
+ int s_fc_debug_max_replay;
+#endif
+ struct ext4_fc_replay_state s_fc_replay_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1584,6 +1815,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
return container_of(inode, struct ext4_inode_info, vfs_inode);
}
+static inline int ext4_writepages_down_read(struct super_block *sb)
+{
+ percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
+ return memalloc_nofs_save();
+}
+
+static inline void ext4_writepages_up_read(struct super_block *sb, int ctx)
+{
+ memalloc_nofs_restore(ctx);
+ percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
+}
+
+static inline int ext4_writepages_down_write(struct super_block *sb)
+{
+ percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem);
+ return memalloc_nofs_save();
+}
+
+static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
+{
+ memalloc_nofs_restore(ctx);
+ percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
+}
+
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
@@ -1591,6 +1846,18 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+static inline int ext4_get_resuid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resuid) |
+ le16_to_cpu(es->s_def_resuid_hi) << 16;
+}
+
+static inline int ext4_get_resgid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resgid) |
+ le16_to_cpu(es->s_def_resgid_hi) << 16;
+}
+
/*
* Returns: sbi->field[index]
* Used to access an array element from the following sbi fields which require
@@ -1609,6 +1876,31 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
})
/*
+ * run-time mount flags
+ */
+enum {
+ EXT4_MF_MNTDIR_SAMPLED,
+ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
+ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */
+};
+
+static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
+{
+ set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
+{
+ clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
+{
+ return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+
+/*
* Simulate_fail codes
*/
#define EXT4_SIM_BBITMAP_EIO 1
@@ -1634,14 +1926,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb,
return false;
}
-static inline void ext4_simulate_fail_bh(struct super_block *sb,
- struct buffer_head *bh,
- unsigned long code)
-{
- if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
- clear_buffer_uptodate(bh);
-}
-
/*
* Error number codes for s_{first,last}_error_errno
*
@@ -1672,7 +1956,6 @@ static inline void ext4_simulate_fail_bh(struct super_block *sb,
* Inode dynamic state flags
*/
enum {
- EXT4_STATE_JDATA, /* journaled data exists */
EXT4_STATE_NEW, /* inode is newly created */
EXT4_STATE_XATTR, /* has in-inode xattrs */
EXT4_STATE_NO_EXPAND, /* No space for expansion */
@@ -1683,6 +1966,9 @@ enum {
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
+ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1742,6 +2028,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
+ * Check whether the inode is tracked as orphan (either in orphan file or
+ * orphan list).
+ */
+static inline bool ext4_inode_orphan_tracked(struct inode *inode)
+{
+ return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
+ !list_empty(&EXT4_I(inode)->i_orphan);
+}
+
+/*
* Codes for operating systems
*/
#define EXT4_OS_LINUX 0
@@ -1756,7 +2052,6 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */
#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
-#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV
#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV
#define EXT4_GOOD_OLD_INODE_SIZE 128
@@ -1776,7 +2071,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+/*
+ * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
+ * incompatible only if fast commit blocks are present in the FS. Since we
+ * clear the journal (and thus the fast commit blocks), we don't mark FS as
+ * incompatible. We also have a JBD2 incompat feature, which gets set when
+ * there are fast commit blocks present in the journal.
+ */
+#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
+#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1797,6 +2101,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
+#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be
+ non-empty */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1878,7 +2184,9 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
+EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
@@ -1893,6 +2201,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
+EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -1926,7 +2235,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
-#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \
+ EXT4_FEATURE_COMPAT_ORPHAN_FILE)
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -1951,7 +2261,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
EXT4_FEATURE_RO_COMPAT_PROJECT |\
- EXT4_FEATURE_RO_COMPAT_VERITY)
+ EXT4_FEATURE_RO_COMPAT_VERITY |\
+ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -1987,17 +2298,36 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
}
+extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
+
/*
* Superblock flags
*/
-#define EXT4_FLAGS_RESIZING 0
-#define EXT4_FLAGS_SHUTDOWN 1
+enum {
+ EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */
+ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */
+ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */
+ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */
+};
-static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
+static inline int ext4_forced_shutdown(struct super_block *sb)
{
- return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+ return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}
+static inline int ext4_emergency_ro(struct super_block *sb)
+{
+ return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
+}
+
+static inline int ext4_emergency_state(struct super_block *sb)
+{
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+ if (unlikely(ext4_emergency_ro(sb)))
+ return -EROFS;
+ return 0;
+}
/*
* Default values for user and/or group using reserved blocks
@@ -2030,10 +2360,19 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
#define EXT4_DEFM_NODELALLOC 0x0800
/*
- * Default journal batch times
+ * Default journal batch times and ioprio.
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
+
+/*
+ * Default values for superblock update
+ */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */
+
/*
* Minimum number of groups in a flexgroup before we separate out
@@ -2045,6 +2384,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
struct ext4_dir_entry {
__le32 inode; /* Inode number */
@@ -2053,6 +2396,17 @@ struct ext4_dir_entry {
char name[EXT4_NAME_LEN]; /* File name */
};
+
+/*
+ * Encrypted Casefolded entries require saving the hash on disk. This structure
+ * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned
+ * boundary.
+ */
+struct ext4_dir_entry_hash {
+ __le32 hash;
+ __le32 minor_hash;
+};
+
/*
* The new version of the directory entry. Since EXT4 structures are
* stored in intel byte order, and the name_len field could never be
@@ -2068,6 +2422,22 @@ struct ext4_dir_entry_2 {
};
/*
+ * Access the hashes at the end of ext4_dir_entry_2
+ */
+#define EXT4_DIRENT_HASHES(entry) \
+ ((struct ext4_dir_entry_hash *) \
+ (((void *)(entry)) + \
+ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
+#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
+#define EXT4_DIRENT_MINOR_HASH(entry) \
+ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)
+
+static inline bool ext4_hash_in_dirent(const struct inode *inode)
+{
+ return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode);
+}
+
+/*
* This is a bogus directory entry at the end of each leaf block that
* records checksums.
*/
@@ -2108,33 +2478,37 @@ struct ext4_dir_entry_tail {
*/
#define EXT4_DIR_PAD 4
#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
- ~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN ((1<<16)-1)
/*
- * If we ever get support for fs block sizes > page_size, we'll need
- * to remove the #if statements in the next two functions...
+ * The rec_len is dependent on the type of directory. Directories that are
+ * casefolded and encrypted need to store the hash as well, so we add room for
+ * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should
+ * pass NULL for dir, as those entries do not use the extra fields.
*/
+static inline unsigned int ext4_dir_rec_len(__u8 name_len,
+ const struct inode *dir)
+{
+ int rec_len = (name_len + 8 + EXT4_DIR_ROUND);
+
+ if (dir && ext4_hash_in_dirent(dir))
+ rec_len += sizeof(struct ext4_dir_entry_hash);
+ return (rec_len & ~EXT4_DIR_ROUND);
+}
+
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
unsigned len = le16_to_cpu(dlen);
-#if (PAGE_SIZE >= 65536)
if (len == EXT4_MAX_REC_LEN || len == 0)
return blocksize;
return (len & 65532) | ((len & 3) << 16);
-#else
- return len;
-#endif
}
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
- if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
- BUG();
-#if (PAGE_SIZE >= 65536)
+ BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
if (len < 65536)
return cpu_to_le16(len);
if (len == blocksize) {
@@ -2144,9 +2518,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
return cpu_to_le16(0);
}
return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
-#else
- return cpu_to_le16(len);
-#endif
}
/*
@@ -2168,23 +2539,12 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_LEGACY_UNSIGNED 3
#define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5
+#define DX_HASH_SIPHASH 6
+#define DX_HASH_LAST DX_HASH_SIPHASH
-static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length)
{
- struct {
- struct shash_desc shash;
- char ctx[4];
- } desc;
-
- BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
-
- desc.shash.tfm = sbi->s_chksum_driver;
- *(u32 *)desc.ctx = crc;
-
- BUG_ON(crypto_shash_update(&desc.shash, address, length));
-
- return *(u32 *)desc.ctx;
+ return crc32c(crc, address, length);
}
#ifdef __KERNEL__
@@ -2216,12 +2576,13 @@ struct ext4_filename {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
- struct fscrypt_str cf_name;
+#if IS_ENABLED(CONFIG_UNICODE)
+ struct qstr cf_name;
#endif
};
#define fname_name(p) ((p)->disk_name.name)
+#define fname_usr_name(p) ((p)->usr_fname->name)
#define fname_len(p) ((p)->disk_name.len)
/*
@@ -2258,6 +2619,8 @@ struct dir_private_info {
__u32 curr_hash;
__u32 curr_minor_hash;
__u32 next_hash;
+ u64 cookie;
+ bool initialized;
};
/* calculate the first block number of the group */
@@ -2300,9 +2663,15 @@ struct ext4_lazy_init {
struct mutex li_list_mtx;
};
+enum ext4_li_mode {
+ EXT4_LI_MODE_PREFETCH_BBITMAP,
+ EXT4_LI_MODE_ITABLE,
+};
+
struct ext4_li_request {
struct super_block *lr_super;
- struct ext4_sb_info *lr_sbi;
+ enum ext4_li_mode lr_mode;
+ ext4_group_t lr_first_not_zeroed;
ext4_group_t lr_next_group;
struct list_head lr_request;
unsigned long lr_next_sched;
@@ -2390,16 +2759,16 @@ struct mmpd_data {
/* bitmap.c */
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
-void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
-int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct buffer_head *bh);
+int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
-void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct buffer_head *bh);
+void ext4_block_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
-int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
@@ -2411,10 +2780,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb,
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block);
-extern unsigned int ext4_block_group(struct super_block *sb,
- ext4_fsblk_t blocknr);
-extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
- ext4_fsblk_t blocknr);
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
@@ -2426,14 +2791,16 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
-extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
+extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+ ext4_group_t group);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -2444,80 +2811,43 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-#ifdef CONFIG_UNICODE
-extern void ext4_fname_setup_ci_filename(struct inode *dir,
- const struct qstr *iname,
- struct fscrypt_str *fname);
-#endif
+#if IS_ENABLED(CONFIG_UNICODE)
+extern int ext4_fname_setup_ci_filename(struct inode *dir,
+ const struct qstr *iname,
+ struct ext4_filename *fname);
-#ifdef CONFIG_FS_ENCRYPTION
-static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
- const struct fscrypt_name *src)
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
{
- memset(dst, 0, sizeof(*dst));
-
- dst->usr_fname = src->usr_fname;
- dst->disk_name = src->disk_name;
- dst->hinfo.hash = src->hash;
- dst->hinfo.minor_hash = src->minor_hash;
- dst->crypto_buf = src->crypto_buf;
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
}
-
-static inline int ext4_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup,
- struct ext4_filename *fname)
+#else
+static inline int ext4_fname_setup_ci_filename(struct inode *dir,
+ const struct qstr *iname,
+ struct ext4_filename *fname)
{
- struct fscrypt_name name;
- int err;
-
- err = fscrypt_setup_filename(dir, iname, lookup, &name);
- if (err)
- return err;
-
- ext4_fname_from_fscrypt_name(fname, &name);
-
-#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
-#endif
return 0;
}
-static inline int ext4_fname_prepare_lookup(struct inode *dir,
- struct dentry *dentry,
- struct ext4_filename *fname)
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
{
- struct fscrypt_name name;
- int err;
-
- err = fscrypt_prepare_lookup(dir, dentry, &name);
- if (err)
- return err;
+}
+#endif
- ext4_fname_from_fscrypt_name(fname, &name);
+/* ext4 encryption related stuff goes here crypto.c */
+#ifdef CONFIG_FS_ENCRYPTION
+extern const struct fscrypt_operations ext4_cryptops;
-#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
-#endif
- return 0;
-}
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname);
-static inline void ext4_fname_free_filename(struct ext4_filename *fname)
-{
- struct fscrypt_name name;
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname);
- name.crypto_buf = fname->crypto_buf;
- fscrypt_free_filename(&name);
+void ext4_fname_free_filename(struct ext4_filename *fname);
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg);
-#ifdef CONFIG_UNICODE
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
-}
#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
const struct qstr *iname,
@@ -2528,11 +2858,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
-#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
-#endif
-
- return 0;
+ return ext4_fname_setup_ci_filename(dir, iname, fname);
}
static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2544,10 +2870,13 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
-#ifdef CONFIG_UNICODE
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ ext4_fname_free_ci_filename(fname);
+}
+
+static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
+ void __user *arg)
+{
+ return -EOPNOTSUPP;
}
#endif /* !CONFIG_FS_ENCRYPTION */
@@ -2557,26 +2886,26 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
struct ext4_dir_entry_2 *,
struct buffer_head *, char *, int,
unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
- (de), (bh), (buf), (size), (offset)))
+ (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent,
struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
-extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-void ext4_insert_dentry(struct inode *inode,
+void ext4_insert_dentry(struct inode *dir, struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!ext4_has_feature_dir_index(inode->i_sb)) {
+ if (!ext4_has_feature_dir_index(inode->i_sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
/* ext4_iget() should have caught this... */
WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
@@ -2604,18 +2933,20 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+extern int ext4_mark_inode_used(struct super_block *sb, int ino);
+extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *,
+ struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
uid_t *owner, __u32 i_flags,
int handle_type, unsigned int line_no,
int nblocks);
-#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
- __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
- i_flags, 0, 0, 0)
-#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
+ __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \
+ (goal), (owner), i_flags, 0, 0, 0)
+#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \
type, nblocks) \
- __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+ __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \
0, (type), __LINE__, (nblocks))
@@ -2623,24 +2954,55 @@ extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
-extern void ext4_check_inodes_bitmap(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+/* fast_commit.c */
+int ext4_fc_info_show(struct seq_file *seq, void *v);
+void ext4_fc_init(struct super_block *sb, journal_t *journal);
+void ext4_fc_init_inode(struct inode *inode);
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
+void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
+void ext4_fc_del(struct inode *inode);
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
+void ext4_fc_replay_cleanup(struct super_block *sb);
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
+int __init ext4_fc_init_dentry_cache(void);
+void ext4_fc_destroy_dentry_cache(void);
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk,
+ int len, int replay);
+
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
-extern long ext4_mb_stats;
-extern long ext4_mb_max_to_scan;
+extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
+extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
-extern int ext4_mb_release(struct super_block *);
+extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
-extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
+extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int nr, int *cnt);
+extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr);
+
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
@@ -2652,9 +3014,18 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
+extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, bool state);
+static inline bool ext4_mb_cr_expensive(enum criteria cr)
+{
+ return cr >= CR_GOAL_LEN_SLOW;
+}
/* inode.c */
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
+void ext4_check_map_extents_env(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
@@ -2666,21 +3037,25 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
+ struct inode *inode,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
- int (*fn)(handle_t *handle,
+ int (*fn)(handle_t *handle, struct inode *inode,
struct buffer_head *bh));
-int do_journal_get_write_access(handle_t *handle,
+int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+void ext4_set_inode_mapping_order(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
typedef enum {
EXT4_IGET_NORMAL = 0,
EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
- EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */
+ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */
+ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */
+ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */
} ext4_iget_flags;
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -2691,29 +3066,38 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
__ext4_iget((sb), (ino), (flags), __func__, __LINE__)
extern int ext4_write_inode(struct inode *, struct writeback_control *);
-extern int ext4_setattr(struct dentry *, struct iattr *);
-extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int ext4_setattr(struct mnt_idmap *, struct dentry *,
+ struct iattr *);
+extern u32 ext4_dio_alignment(struct inode *inode);
+extern int ext4_getattr(struct mnt_idmap *, const struct path *,
+ struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
-extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int ext4_sync_inode(handle_t *, struct inode *);
+extern int ext4_file_getattr(struct mnt_idmap *, const struct path *,
+ struct kstat *, u32, unsigned int);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern void ext4_set_inode_flags(struct inode *);
+extern int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
-extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
+extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
@@ -2722,6 +3106,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+static inline bool is_special_ino(struct super_block *sb, unsigned long ino)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum);
+}
+
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -2733,16 +3128,22 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+int ext4_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct file_kattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
+extern void ext4_reset_inode_seed(struct inode *inode);
+int ext4_update_overhead(struct super_block *sb, bool force);
+int ext4_force_shutdown(struct super_block *sb, u32 flags);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
+extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
-extern int ext4_orphan_add(handle_t *, struct inode *);
-extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
extern int ext4_search_dir(struct buffer_head *bh,
@@ -2752,8 +3153,7 @@ extern int ext4_search_dir(struct buffer_head *bh,
struct ext4_filename *fname,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir);
-extern int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+extern int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2769,12 +3169,26 @@ extern int ext4_group_extend(struct super_block *sb,
struct ext4_super_block *es,
ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
+extern unsigned int ext4_list_backups(struct super_block *sb,
+ unsigned int *three, unsigned int *five,
+ unsigned int *seven);
/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
- sector_t block, int op_flags);
+ sector_t block, blk_opf_t op_flags);
+extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block);
+extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
+ sector_t block);
+extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail);
+extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail);
+extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
+extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
+extern __le32 ext4_superblock_csum(struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -2783,10 +3197,12 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno,
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
+extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group);
-extern __printf(6, 7)
-void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
- const char *, ...);
+extern __printf(7, 8)
+void __ext4_error(struct super_block *, const char *, unsigned int, bool,
+ int, __u64, const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
ext4_fsblk_t, int, const char *, ...);
@@ -2795,9 +3211,6 @@ void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
-extern __printf(5, 6)
-void __ext4_abort(struct super_block *, const char *, unsigned int, int,
- const char *, ...);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
@@ -2827,6 +3240,9 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define EXT4_ERROR_FILE(file, block, fmt, a...) \
ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
+#define ext4_abort(sb, err, fmt, a...) \
+ __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a)
+
#ifdef CONFIG_PRINTK
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
@@ -2837,11 +3253,11 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define ext4_error_file(file, func, line, block, fmt, ...) \
__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...) \
- __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
+ __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \
+ ##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...) \
- __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
-#define ext4_abort(sb, err, fmt, ...) \
- __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
+ __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \
+ ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...) \
__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...) \
@@ -2874,17 +3290,12 @@ do { \
#define ext4_error(sb, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error(sb, "", 0, 0, 0, " "); \
+ __ext4_error(sb, "", 0, false, 0, 0, " "); \
} while (0)
#define ext4_error_err(sb, err, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error(sb, "", 0, err, 0, " "); \
-} while (0)
-#define ext4_abort(sb, err, fmt, ...) \
-do { \
- no_printk(fmt, ##__VA_ARGS__); \
- __ext4_abort(sb, "", 0, err, " "); \
+ __ext4_error(sb, "", 0, false, err, 0, " "); \
} while (0)
#define ext4_warning(sb, fmt, ...) \
do { \
@@ -2911,12 +3322,6 @@ do { \
#endif
-extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
- __u32 compat);
-extern int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat);
-extern int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat);
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -2953,36 +3358,30 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
-static inline int ext4_has_metadata_csum(struct super_block *sb)
-{
- WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
- !EXT4_SB(sb)->s_chksum_driver);
-
- return ext4_has_feature_metadata_csum(sb) &&
- (EXT4_SB(sb)->s_chksum_driver != NULL);
-}
-
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
- return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+ return ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb);
}
+#define ext4_read_incompat_64bit_val(es, name) \
+ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
+ ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \
+ le32_to_cpu(es->name##_lo))
+
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_blocks_count);
}
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_r_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_r_blocks_count);
}
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_free_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_free_blocks_count);
}
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
@@ -3023,19 +3422,6 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
}
-static inline
-struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info **grp_info;
- long indexv, indexh;
- BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
- indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
- indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
- grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
- return grp_info[indexh];
-}
-
/*
* Reading s_groups_count requires using smp_rmb() afterwards. See
* the locking protocol documented in the comments of ext4_group_add()
@@ -3060,6 +3446,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
@@ -3076,7 +3469,7 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
-/* Update i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
@@ -3087,7 +3480,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
-/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
int changed = 0;
@@ -3108,11 +3501,17 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
struct ext4_group_info {
unsigned long bb_state;
+#ifdef AGGRESSIVE_CHECK
+ unsigned long bb_check_counter;
+#endif
struct rb_root bb_free_root;
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ int bb_avg_fragment_size_order; /* order of average
+ fragment in BG */
ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ ext4_group_t bb_group; /* Group number */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
@@ -3132,6 +3531,7 @@ struct ext4_group_info {
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3146,6 +3546,8 @@ struct ext4_group_info {
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
@@ -3165,23 +3567,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
}
+static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ if (!spin_trylock(ext4_group_lock_ptr(sb, group)))
+ return false;
+ /*
+ * We're able to grab the lock right away, so drop the lock
+ * contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ return true;
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spinlock_t *lock = ext4_group_lock_ptr(sb, group);
- if (spin_trylock(lock))
- /*
- * We're able to grab the lock right away, so drop the
- * lock contention counter.
- */
- atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
- else {
+ if (!ext4_try_lock_group(sb, group)) {
/*
* The lock is busy, so bump the contention counter,
* and then wait on the spin lock.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
EXT4_MAX_CONTENTION);
- spin_lock(lock);
+ spin_lock(ext4_group_lock_ptr(sb, group));
}
}
@@ -3191,6 +3598,22 @@ static inline void ext4_unlock_group(struct super_block *sb,
spin_unlock(ext4_group_lock_ptr(sb, group));
}
+#ifdef CONFIG_QUOTA
+static inline bool ext4_quota_capable(struct super_block *sb)
+{
+ return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb));
+}
+
+static inline bool ext4_is_quota_journalled(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ return (ext4_has_feature_quota(sb) ||
+ sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
+}
+int ext4_enable_quotas(struct super_block *sb);
+#endif
+
/*
* Block validity checking
*/
@@ -3211,10 +3634,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
/* dir.c */
extern const struct file_operations ext4_dir_operations;
-#ifdef CONFIG_UNICODE
-extern const struct dentry_operations ext4_dentry_ops;
-#endif
-
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
@@ -3223,33 +3642,21 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
/* inline.c */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+extern void ext4_update_final_de(void *de_buf, int old_size, int new_size);
-extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep);
-extern int ext4_write_inline_data_end(struct inode *inode,
- loff_t pos, unsigned len,
- unsigned copied,
- struct page *page);
-extern struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page);
-extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata);
-extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page);
+ struct folio **foliop);
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct folio *folio);
+extern int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@@ -3277,9 +3684,7 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len);
+extern void *ext4_read_inline_link(struct inode *inode);
struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
@@ -3298,17 +3703,18 @@ static inline int ext4_has_inline_data(struct inode *inode)
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
-extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len);
+extern int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *dir_block,
+ unsigned int parent_ino, void *inline_buf,
+ int inline_size);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-extern int ext4_ci_compare(const struct inode *parent,
- const struct qstr *fname,
- const struct qstr *entry, bool quick);
+extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode, struct dentry *dentry);
+extern int __ext4_link(struct inode *dir, struct inode *inode,
+ struct dentry *dentry);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
@@ -3330,7 +3736,7 @@ static inline void ext4_set_de_type(struct super_block *sb,
/* readpages.c */
extern int ext4_mpage_readpages(struct inode *inode,
- struct readahead_control *rac, struct page *page);
+ struct readahead_control *rac, struct folio *folio);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);
@@ -3340,6 +3746,7 @@ extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
/* sysfs.c */
+extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi);
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
@@ -3350,11 +3757,14 @@ extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
-extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
- ext4_fsblk_t start_blk,
- unsigned int count);
+extern int ext4_inode_block_valid(struct inode *inode,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
+extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count);
+
/* extents.c */
struct ext4_ext_path;
@@ -3379,6 +3789,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_extents_atomic(handle_t *handle,
+ struct inode *inode, loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
@@ -3386,13 +3798,14 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *,
- struct ext4_ext_path **,
- struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_insert_extent(
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int gb_flags);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path **,
+ struct ext4_ext_path *,
int flags);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3409,6 +3822,11 @@ extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
int check_cred, int restart_cred,
int revoke_cred);
+extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
+extern int ext4_ext_replay_set_iblocks(struct inode *inode);
+extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk);
+extern int ext4_ext_clear_bb(struct inode *inode);
/* move_extent.c */
@@ -3431,20 +3849,33 @@ extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
-extern int ext4_bio_write_page(struct ext4_io_submit *io,
- struct page *page,
- int len,
- struct writeback_control *wbc,
- bool keep_towrite);
+int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
+ size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* mmp.c */
+extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);
+
/* verity.c */
extern const struct fsverity_operations ext4_verityops;
+/* orphan.c */
+extern int ext4_orphan_add(handle_t *, struct inode *);
+extern int ext4_orphan_del(handle_t *, struct inode *);
+extern void ext4_orphan_cleanup(struct super_block *sb,
+ struct ext4_super_block *es);
+extern void ext4_release_orphan_info(struct super_block *sb);
+extern int ext4_init_orphan_info(struct super_block *sb);
+extern int ext4_orphan_file_empty(struct super_block *sb);
+extern void ext4_orphan_file_block_trigger(
+ struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size);
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
@@ -3462,36 +3893,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
-/* For ioend & aio unwritten conversion wait queues */
-#define EXT4_WQ_HASH_SZ 37
-#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
- EXT4_WQ_HASH_SZ])
-extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
extern int ext4_resize_begin(struct super_block *sb);
-extern void ext4_resize_end(struct super_block *sb);
+extern int ext4_resize_end(struct super_block *sb, bool update_backups);
-static inline void ext4_set_io_unwritten_flag(struct inode *inode,
- struct ext4_io_end *io_end)
+static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end)
{
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_unwritten);
- }
}
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
- struct inode *inode = io_end->inode;
-
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- }
}
extern const struct iomap_ops ext4_iomap_ops;
@@ -3506,11 +3920,22 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
* have to read the block because we may read the old data
* successfully.
*/
- if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+ if (buffer_write_io_error(bh))
set_buffer_uptodate(bh);
return buffer_uptodate(bh);
}
+static inline bool ext4_inode_can_atomic_write(struct inode *inode)
+{
+
+ return S_ISREG(inode->i_mode) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ EXT4_SB(inode->i_sb)->s_awu_min > 0;
+}
+
+extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
+ loff_t pos, unsigned len,
+ get_block_t *get_block);
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 44e59881a1f0..c484125d963f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -31,13 +31,6 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_STATS is defined then stats numbers are collected.
- * These number will be displayed at umount time.
- */
-#define EXT_STATS_
-
-
-/*
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
@@ -173,10 +166,11 @@ struct partial_cluster {
#define EXT_MAX_EXTENT(__hdr__) \
((le16_to_cpu((__hdr__)->eh_max)) ? \
((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
- : 0)
+ : NULL)
#define EXT_MAX_INDEX(__hdr__) \
((le16_to_cpu((__hdr__)->eh_max)) ? \
- ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0)
+ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
+ : NULL)
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 0c76cdd44d90..05e5946ed9b3 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -63,15 +63,18 @@ static void ext4_put_nojournal(handle_t *handle)
*/
static int ext4_journal_check_start(struct super_block *sb)
{
+ int ret;
journal_t *journal;
might_sleep();
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
- if (sb_rdonly(sb))
+ if (WARN_ON_ONCE(sb_rdonly(sb)))
return -EROFS;
+
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
/*
@@ -86,21 +89,27 @@ static int ext4_journal_check_start(struct super_block *sb)
return 0;
}
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+handle_t *__ext4_journal_start_sb(struct inode *inode,
+ struct super_block *sb, unsigned int line,
int type, int blocks, int rsv_blocks,
int revoke_creds)
{
journal_t *journal;
int err;
-
- trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
- _RET_IP_);
+ if (inode)
+ trace_ext4_journal_start_inode(inode, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
+ else
+ trace_ext4_journal_start_sb(sb, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
+ if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return ext4_get_nojournal();
return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
GFP_NOFS, type, line);
@@ -162,6 +171,8 @@ int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
{
if (!ext4_handle_valid(handle))
return 0;
+ if (is_handle_aborted(handle))
+ return -EROFS;
if (jbd2_handle_buffer_credits(handle) >= check_cred &&
handle->h_revoke_credits >= revoke_cred)
return 0;
@@ -195,20 +206,53 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
jbd2_journal_abort_handle(handle);
}
+static void ext4_check_bdev_write_error(struct super_block *sb)
+{
+ struct address_space *mapping = sb->s_bdev->bd_mapping;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * If the block device has write error flag, it may have failed to
+ * async write out metadata buffers in the background. In this case,
+ * we could read old data from disk and write it out again, which
+ * may lead to on-disk filesystem inconsistency.
+ */
+ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) {
+ spin_lock(&sbi->s_bdev_wb_lock);
+ err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err);
+ spin_unlock(&sbi->s_bdev_wb_lock);
+ if (err)
+ ext4_error_err(sb, -err,
+ "Error while async write back metadata");
+ }
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type)
{
- int err = 0;
+ int err;
might_sleep();
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
- if (err)
+ if (err) {
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
- }
- return err;
+ return err;
+ }
+ } else
+ ext4_check_bdev_write_error(sb);
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
+ return 0;
+ BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
+ jbd2_journal_set_triggers(bh,
+ &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
+ return 0;
}
/*
@@ -219,9 +263,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
* "bh" may be NULL: a metadata block may have been freed from memory
* but there may still be a record of it in the journal, and that record
* still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
*/
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
@@ -234,14 +275,20 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
trace_ext4_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %x\n",
+ ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n",
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
- /* In the no journal case, we can just do a bforget and return */
+ /*
+ * In the no journal case, we should wait for the ongoing buffer
+ * to complete and do a forget.
+ */
if (!ext4_handle_valid(handle)) {
- bforget(bh);
+ if (bh) {
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+ __bforget(bh);
+ }
return 0;
}
@@ -271,25 +318,36 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
- __ext4_abort(inode->i_sb, where, line, -err,
- "error %d when attempting revoke", err);
+ __ext4_error(inode->i_sb, where, line, true, -err, 0,
+ "error %d when attempting revoke", err);
}
BUFFER_TRACE(bh, "exit");
return err;
}
int __ext4_journal_get_create_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type)
{
- int err = 0;
+ int err;
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_get_create_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__,
- bh, handle, err);
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ err = jbd2_journal_get_create_access(handle, bh);
+ if (err) {
+ ext4_journal_abort_handle(where, line, __func__, bh, handle,
+ err);
+ return err;
}
- return err;
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
+ return 0;
+ BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
+ jbd2_journal_set_triggers(bh,
+ &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
+ return 0;
}
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
@@ -302,6 +360,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
set_buffer_meta(bh);
set_buffer_prio(bh);
+ set_buffer_uptodate(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
/* Errors can only happen due to aborted journal or a nasty bug */
@@ -330,7 +389,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
err);
}
} else {
- set_buffer_uptodate(bh);
if (inode)
mark_buffer_dirty_inode(bh, inode);
else
@@ -347,20 +405,3 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
}
return err;
}
-
-int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb)
-{
- struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
- int err = 0;
-
- ext4_superblock_csum_set(sb);
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_dirty_metadata(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__,
- bh, handle, err);
- } else
- mark_buffer_dirty(bh);
- return err;
-}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 00dc668e052b..63d17c5201b5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,17 +86,14 @@
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- ext4_has_feature_quota(sb)) ? 1 : 0)
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
-#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_INIT_REWRITE) : 0)
-#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_DEL_REWRITE) : 0)
#else
@@ -125,90 +122,6 @@
#define EXT4_HT_EXT_CONVERT 11
#define EXT4_HT_MAX 12
-/**
- * struct ext4_journal_cb_entry - Base structure for callback information.
- *
- * This struct is a 'seed' structure for a using with your own callback
- * structs. If you are using callbacks you must allocate one of these
- * or another struct of your own definition which has this struct
- * as it's first element and pass it to ext4_journal_callback_add().
- */
-struct ext4_journal_cb_entry {
- /* list information for other callbacks attached to the same handle */
- struct list_head jce_list;
-
- /* Function to call with this callback structure */
- void (*jce_func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce, int error);
-
- /* user data goes here */
-};
-
-/**
- * ext4_journal_callback_add: add a function to call after transaction commit
- * @handle: active journal transaction handle to register callback on
- * @func: callback function to call after the transaction has committed:
- * @sb: superblock of current filesystem for transaction
- * @jce: returned journal callback data
- * @rc: journal state at commit (0 = transaction committed properly)
- * @jce: journal callback data (internal and function private data struct)
- *
- * The registered function will be called in the context of the journal thread
- * after the transaction for which the handle was created has completed.
- *
- * No locks are held when the callback function is called, so it is safe to
- * call blocking functions from within the callback, but the callback should
- * not block or run for too long, or the filesystem will be blocked waiting for
- * the next transaction to commit. No journaling functions can be used, or
- * there is a risk of deadlock.
- *
- * There is no guaranteed calling order of multiple registered callbacks on
- * the same transaction.
- */
-static inline void _ext4_journal_callback_add(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- /* Add the jce to transaction's private list */
- list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
-}
-
-static inline void ext4_journal_callback_add(handle_t *handle,
- void (*func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce,
- int rc),
- struct ext4_journal_cb_entry *jce)
-{
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- /* Add the jce to transaction's private list */
- jce->jce_func = func;
- spin_lock(&sbi->s_md_lock);
- _ext4_journal_callback_add(handle, jce);
- spin_unlock(&sbi->s_md_lock);
-}
-
-
-/**
- * ext4_journal_callback_del: delete a registered callback
- * @handle: active journal transaction handle on which callback was registered
- * @jce: registered journal callback entry to unregister
- * Return true if object was successfully removed
- */
-static inline bool ext4_journal_callback_try_del(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- bool deleted;
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- spin_lock(&sbi->s_md_lock);
- deleted = !list_empty(&jce->jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- return deleted;
-}
-
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
@@ -234,38 +147,39 @@ int ext4_expand_extra_isize(struct inode *inode,
* Wrapper functions with which ext4 calls into JBD.
*/
int __ext4_journal_get_write_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type);
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
int __ext4_journal_get_create_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type);
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb);
-
-#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
+#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \
+ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \
+ (bh), (trigger_type))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
__ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
(bh), (block_nr))
-#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
+#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \
+ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \
+ (bh), (trigger_type))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
-#define ext4_handle_dirty_super(handle, sb) \
- __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks,
- int revoke_creds);
+handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
+ unsigned int line, int type, int blocks,
+ int rsv_blocks, int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -305,7 +219,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\
ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
@@ -325,7 +239,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode,
int blocks, int rsv_blocks,
int revoke_creds)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks,
rsv_blocks, revoke_creds);
}
@@ -405,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
revoke_creds, 0);
}
-static inline int ext4_journal_blocks_per_page(struct inode *inode)
+static inline int ext4_journal_blocks_per_folio(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
- return jbd2_journal_blocks_per_page(inode);
+ return jbd2_journal_blocks_per_folio(inode);
return 0;
}
@@ -493,7 +407,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
- * i_mutex for direct I/O reads. This only works for extent-based
+ * i_rwsem for direct I/O reads. This only works for extent-based
* files, and it doesn't work if data journaling is enabled, since the
* dioread_nolock code uses b_private to pass information back to the
* I/O completion handler, and this conflicts with the jbd's use of
@@ -515,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 1;
}
+/*
+ * Pass journal explicitly as it may not be cached in the sbi->s_journal in some
+ * cases
+ */
+static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal)
+{
+ int err = 0;
+
+ /*
+ * At this point only two things can be operating on the journal.
+ * JBD2 thread performing transaction commit and s_sb_upd_work
+ * issuing sb update through the journal. Once we set
+ * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not
+ * queue s_sb_upd_work and ext4_force_commit() makes sure any
+ * ext4_handle_error() calls from the running transaction commit are
+ * finished. Hence no new s_sb_upd_work can be queued after we
+ * flush it here.
+ */
+ ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
+
+ ext4_force_commit(sbi->s_sb);
+ flush_work(&sbi->s_sb_upd_work);
+
+ err = jbd2_journal_destroy(journal);
+ sbi->s_journal = NULL;
+
+ return err;
+}
+
#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d088ff1e902..2cf5759ba689 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -27,8 +27,8 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
-#include <linux/backing-dev.h>
#include <linux/iomap.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh,
EXT4_EXTENT_TAIL_OFFSET(eh));
return cpu_to_le32(csum);
}
@@ -63,7 +62,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
@@ -77,19 +76,18 @@ static void ext4_extent_block_csum_set(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
et->et_checksum = ext4_extent_block_csum(inode, eh);
}
-static int ext4_split_extent_at(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- ext4_lblk_t split,
- int split_flag,
- int flags);
+static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag, int flags);
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
@@ -97,7 +95,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode);
@@ -106,6 +104,31 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
return 0;
}
+static inline void ext4_ext_path_brelse(struct ext4_ext_path *path)
+{
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+}
+
+static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+ int depth, i;
+
+ if (IS_ERR_OR_NULL(path))
+ return;
+ depth = path->p_depth;
+ for (i = 0; i <= depth; i++, path++)
+ ext4_ext_path_brelse(path);
+}
+
+void ext4_free_ext_path(struct ext4_ext_path *path)
+{
+ if (IS_ERR_OR_NULL(path))
+ return;
+ ext4_ext_drop_refs(path);
+ kfree(path);
+}
+
/*
* Make sure 'handle' has at least 'check_cred' credits. If not, restart
* transaction with 'restart_cred' credits. The function drops i_data_sem
@@ -136,14 +159,25 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
+ int err = 0;
+
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, path->p_bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ path->p_bh, EXT4_JTR_NONE);
+ /*
+ * The extent buffer's verified bit will be set again in
+ * __ext4_ext_dirty(). We could leave an inconsistent
+ * buffer if the extents updating procudure break off du
+ * to some error happens, force to check it again.
+ */
+ if (!err)
+ clear_buffer_verified(path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
- return 0;
+ return err;
}
/*
@@ -164,6 +198,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
+ /* Extents updating done, re-set verified flag */
+ if (!err)
+ set_buffer_verified(path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -290,19 +327,18 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
return size;
}
-static inline int
+static inline struct ext4_ext_path *
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
- struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+ struct ext4_ext_path *path, ext4_lblk_t lblk,
int nofail)
{
- struct ext4_ext_path *path = *ppath;
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
- int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
+ int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
- return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
+ return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
flags);
}
@@ -340,7 +376,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
*/
if (lblock + len <= lblock)
return 0;
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
+ return ext4_inode_block_valid(inode, block, len);
}
static int ext4_valid_extent_idx(struct inode *inode,
@@ -348,14 +384,18 @@ static int ext4_valid_extent_idx(struct inode *inode,
{
ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
+ return ext4_inode_block_valid(inode, block, 1);
}
static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
- ext4_fsblk_t *pblk, int depth)
+ ext4_lblk_t lblk, ext4_fsblk_t *pblk,
+ int depth)
{
unsigned short entries;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t cur = 0;
+
if (eh->eh_entries == 0)
return 1;
@@ -364,31 +404,51 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
- int len = 0;
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext->ee_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- len = ext4_ext_get_actual_len(ext);
- if ((lblock <= prev) && prev) {
+ if (lblock < cur) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
+ cur = lblock + ext4_ext_get_actual_len(ext);
ext++;
entries--;
- prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the parent index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext_idx->ei_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
+
+ /* Check for overlapping index extents */
+ lblock = le32_to_cpu(ext_idx->ei_block);
+ if (lblock < cur) {
+ *pblk = ext4_idx_pblock(ext_idx);
+ return 0;
+ }
ext_idx++;
entries--;
+ cur = lblock + 1;
}
}
return 1;
@@ -396,7 +456,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth, ext4_fsblk_t pblk)
+ int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
@@ -422,7 +482,11 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
+ if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
+ error_msg = "eh_entries is 0 but eh_depth is > 0";
+ goto corrupted;
+ }
+ if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -452,7 +516,7 @@ corrupted:
}
#define ext4_ext_check(inode, eh, depth, pblk) \
- __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
int ext4_ext_check_inode(struct inode *inode)
{
@@ -485,36 +549,34 @@ static void ext4_cache_extents(struct inode *inode,
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
- struct inode *inode, ext4_fsblk_t pblk, int depth,
- int flags)
+ struct inode *inode, struct ext4_extent_idx *idx,
+ int depth, int flags)
{
struct buffer_head *bh;
int err;
gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
+ ext4_fsblk_t pblk;
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
+ pblk = ext4_idx_pblock(idx);
bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
- err = bh_submit_read(bh);
+ err = ext4_read_bh(bh, 0, NULL, false);
if (err < 0)
goto errout;
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- if (!ext4_has_feature_journal(inode->i_sb) ||
- (inode->i_ino !=
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
- if (err)
- goto errout;
- }
+ err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
+ depth, pblk, le32_to_cpu(idx->ei_block));
+ if (err)
+ goto errout;
set_buffer_verified(bh);
/*
* If this is a leaf block, cache all of its entries
@@ -530,8 +592,8 @@ errout:
}
-#define read_extent_tree_block(inode, pblk, depth, flags) \
- __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
(depth), (flags))
/*
@@ -548,6 +610,8 @@ int ext4_ext_precache(struct inode *inode)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
+ ext4_check_map_extents_env(inode);
+
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
@@ -576,13 +640,11 @@ int ext4_ext_precache(struct inode *inode)
*/
if ((i == depth) ||
path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
continue;
}
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx++),
+ bh = read_extent_tree_block(inode, path[i].p_idx++,
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
@@ -597,8 +659,7 @@ int ext4_ext_precache(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
up_read(&ei->i_data_sem);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -632,7 +693,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
struct ext4_extent *ex;
int i;
- if (!path)
+ if (IS_ERR_OR_NULL(path))
return;
eh = path[depth].p_hdr;
@@ -685,21 +746,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
#define ext4_ext_show_move(inode, path, newblock, level)
#endif
-void ext4_ext_drop_refs(struct ext4_ext_path *path)
-{
- int depth, i;
-
- if (!path)
- return;
- depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++) {
- if (path->p_bh) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
- }
-}
-
/*
* ext4_ext_binsearch_idx:
* binary search for the closest index of the given block
@@ -719,13 +765,14 @@ ext4_ext_binsearch_idx(struct inode *inode,
r = EXT_LAST_INDEX(eh);
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
+ r, le32_to_cpu(r->ei_block));
+
if (block < le32_to_cpu(m->ei_block))
r = m - 1;
else
l = m + 1;
- ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
- le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
- r, le32_to_cpu(r->ei_block));
}
path->p_idx = l - 1;
@@ -787,13 +834,14 @@ ext4_ext_binsearch(struct inode *inode,
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
+ r, le32_to_cpu(r->ee_block));
+
if (block < le32_to_cpu(m->ee_block))
r = m - 1;
else
l = m + 1;
- ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
- le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
- r, le32_to_cpu(r->ee_block));
}
path->p_ext = l - 1;
@@ -831,16 +879,16 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
+ eh->eh_generation = 0;
ext4_mark_inode_dirty(handle, inode);
}
struct ext4_ext_path *
ext4_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path **orig_path, int flags)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
- struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
short int depth, i, ppos = 0;
int ret;
gfp_t gfp_flags = GFP_NOFS;
@@ -861,7 +909,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
ext4_ext_drop_refs(path);
if (depth > path[0].p_maxdepth) {
kfree(path);
- *orig_path = path = NULL;
+ path = NULL;
}
}
if (!path) {
@@ -888,8 +936,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
- flags);
+ bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
@@ -916,10 +963,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
return path;
err:
- ext4_ext_drop_refs(path);
- kfree(path);
- if (orig_path)
- *orig_path = NULL;
+ ext4_free_ext_path(path);
return ERR_PTR(ret);
}
@@ -967,6 +1011,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
ix = curp->p_idx;
}
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EFSCORRUPTED;
+ }
+
len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
BUG_ON(len < 0);
if (len > 0) {
@@ -976,11 +1025,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
}
- if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
- EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
- return -EFSCORRUPTED;
- }
-
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -1087,7 +1131,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto cleanup;
@@ -1096,6 +1141,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
+ neh->eh_generation = 0;
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -1164,7 +1210,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto cleanup;
@@ -1173,6 +1220,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
+ neh->eh_generation = 0;
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
ext4_idx_store_pblock(fidx, oldblock);
@@ -1289,7 +1337,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
return -ENOMEM;
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err) {
unlock_buffer(bh);
goto out;
@@ -1312,6 +1361,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
neh->eh_magic = EXT4_EXT_MAGIC;
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -1346,15 +1396,15 @@ out:
* finds empty index and adds new leaf.
* if no free index is found, then it requests in-depth growing.
*/
-static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- unsigned int mb_flags,
- unsigned int gb_flags,
- struct ext4_ext_path **ppath,
- struct ext4_extent *newext)
+static struct ext4_ext_path *
+ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+ unsigned int mb_flags, unsigned int gb_flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_ext_path *curp;
int depth, i, err = 0;
+ ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block);
repeat:
i = depth = ext_depth(inode);
@@ -1373,42 +1423,38 @@ repeat:
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
- goto out;
+ goto errout;
/* refill path */
- path = ext4_find_extent(inode,
- (ext4_lblk_t)le32_to_cpu(newext->ee_block),
- ppath, gb_flags);
- if (IS_ERR(path))
- err = PTR_ERR(path);
- } else {
- /* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, mb_flags);
- if (err)
- goto out;
+ path = ext4_find_extent(inode, ee_block, path, gb_flags);
+ return path;
+ }
- /* refill path */
- path = ext4_find_extent(inode,
- (ext4_lblk_t)le32_to_cpu(newext->ee_block),
- ppath, gb_flags);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
+ /* tree is full, time to grow in depth */
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags);
+ if (err)
+ goto errout;
- /*
- * only first (depth 0 -> 1) produces free space;
- * in all other cases we have to split the grown tree
- */
- depth = ext_depth(inode);
- if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
- /* now we need to split */
- goto repeat;
- }
+ /* refill path */
+ path = ext4_find_extent(inode, ee_block, path, gb_flags);
+ if (IS_ERR(path))
+ return path;
+
+ /*
+ * only first (depth 0 -> 1) produces free space;
+ * in all other cases we have to split the grown tree
+ */
+ depth = ext_depth(inode);
+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
+ /* now we need to split */
+ goto repeat;
}
-out:
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -1455,8 +1501,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
- EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
- le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
depth);
return -EFSCORRUPTED;
}
@@ -1477,22 +1522,21 @@ static int ext4_ext_search_left(struct inode *inode,
}
/*
- * search the closest allocated block to the right for *logical
- * and returns it at @logical + it's physical address at @phys
- * if *logical is the largest allocated block, the function
- * returns 0 at @phys
- * return value contains 0 (success) or error code
+ * Search the closest allocated block to the right for *logical
+ * and returns it at @logical + it's physical address at @phys.
+ * If not exists, return 0 and @phys is set to 0. We will return
+ * 1 which means we found an allocated block and ret_ex is valid.
+ * Or return a (< 0) error code.
*/
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent **ret_ex)
+ struct ext4_extent *ret_ex, int flags)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
- ext4_fsblk_t block;
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
@@ -1559,20 +1603,18 @@ got_index:
* follow it and find the closest allocated
* block to the right */
ix++;
- block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth,
+ flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
- block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1580,10 +1622,11 @@ got_index:
found_extent:
*logical = le32_to_cpu(ex->ee_block);
*phys = ext4_ext_pblock(ex);
- *ret_ex = ex;
+ if (ret_ex)
+ *ret_ex = *ex;
if (bh)
put_bh(bh);
- return 0;
+ return 1;
}
/*
@@ -1704,12 +1747,23 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
break;
err = ext4_ext_get_access(handle, inode, path + k);
if (err)
- break;
+ goto clean;
path[k].p_idx->ei_block = border;
err = ext4_ext_dirty(handle, inode, path + k);
if (err)
- break;
+ goto clean;
}
+ return 0;
+
+clean:
+ /*
+ * The path[k].p_bh is either unmodified or with no verified bit
+ * set (see ext4_ext_get_access()). So just clear the verified bit
+ * of the successfully modified extents buffers, which will force
+ * these extents to be checked to avoid using inconsistent data.
+ */
+ while (++k < depth)
+ clear_buffer_verified(path[k].p_bh);
return err;
}
@@ -1831,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
path[0].p_hdr->eh_max = cpu_to_le16(max_root);
- brelse(path[1].p_bh);
+ ext4_ext_path_brelse(path + 1);
ext4_free_blocks(handle, inode, NULL, blk, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}
@@ -1915,20 +1969,19 @@ out:
/*
* ext4_ext_insert_extent:
- * tries to merge requsted extent into the existing extent or
+ * tries to merge requested extent into the existing extent or
* inserts requested extent as new one into the tree,
* creating new leaf in the no-space case.
*/
-int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_extent *newext, int gb_flags)
+struct ext4_ext_path *
+ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int gb_flags)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
- struct ext4_ext_path *npath = NULL;
- int depth, len, err;
+ int depth, len, err = 0;
ext4_lblk_t next;
int mb_flags = 0, unwritten;
@@ -1936,18 +1989,20 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
/* try to insert block into found extent and return */
- if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) {
/*
* Try to see whether we should rather test the extent on
@@ -1981,13 +2036,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
- return err;
+ goto errout;
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
@@ -2007,7 +2061,7 @@ prepend:
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
- return err;
+ goto errout;
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_block = newext->ee_block;
@@ -2016,7 +2070,6 @@ prepend:
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
@@ -2033,21 +2086,26 @@ prepend:
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
next = ext4_ext_next_leaf_block(path);
if (next != EXT_MAX_BLOCKS) {
+ struct ext4_ext_path *npath;
+
ext_debug(inode, "next leaf block - %u\n", next);
- BUG_ON(npath != NULL);
npath = ext4_find_extent(inode, next, NULL, gb_flags);
- if (IS_ERR(npath))
- return PTR_ERR(npath);
+ if (IS_ERR(npath)) {
+ err = PTR_ERR(npath);
+ goto errout;
+ }
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
ext_debug(inode, "next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
+ ext4_free_ext_path(path);
path = npath;
goto has_space;
}
ext_debug(inode, "next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+ ext4_free_ext_path(npath);
}
/*
@@ -2056,10 +2114,10 @@ prepend:
*/
if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
mb_flags |= EXT4_MB_USE_RESERVED;
- err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
- ppath, newext);
- if (err)
- goto cleanup;
+ path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
@@ -2068,7 +2126,7 @@ has_space:
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto cleanup;
+ goto errout;
if (!nearex) {
/* there is no extent in this leaf, create first one */
@@ -2123,21 +2181,23 @@ has_space:
merge:
/* try to merge extents */
- if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
ext4_ext_try_to_merge(handle, inode, path, nearex);
-
/* time to correct all indexes above */
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
- goto cleanup;
+ goto errout;
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (err)
+ goto errout;
-cleanup:
- ext4_ext_drop_refs(npath);
- kfree(npath);
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
static int ext4_fill_es_cache_info(struct inode *inode,
@@ -2153,7 +2213,7 @@ static int ext4_fill_es_cache_info(struct inode *inode,
while (block <= end) {
next = 0;
flags = 0;
- if (!ext4_es_lookup_extent(inode, block, &next, &es))
+ if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL))
break;
if (ext4_es_is_unwritten(&es))
flags |= FIEMAP_EXTENT_UNWRITTEN;
@@ -2187,7 +2247,7 @@ static int ext4_fill_es_cache_info(struct inode *inode,
/*
- * ext4_ext_determine_hole - determine hole around given block
+ * ext4_ext_find_hole - find hole around given block according to the given path
* @inode: inode we lookup in
* @path: path in extent tree to @lblk
* @lblk: pointer to logical block around which we want to determine hole
@@ -2199,9 +2259,9 @@ static int ext4_fill_es_cache_info(struct inode *inode,
* The function returns the length of a hole starting at @lblk. We update @lblk
* to the beginning of the hole if we managed to find it.
*/
-static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
- struct ext4_ext_path *path,
- ext4_lblk_t *lblk)
+static ext4_lblk_t ext4_ext_find_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *lblk)
{
int depth = ext_depth(inode);
struct ext4_extent *ex;
@@ -2229,30 +2289,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
}
/*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
- */
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
- ext4_lblk_t hole_len)
-{
- struct extent_status es;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
- hole_start + hole_len - 1, &es);
- if (es.es_len) {
- /* There's delayed extent containing lblock? */
- if (es.es_lblk <= hole_start)
- return;
- hole_len = min(es.es_lblk - hole_start, hole_len);
- }
- ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
- ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
- EXTENT_STATUS_HOLE);
-}
-
-/*
* ext4_ext_rm_idx:
* removes index from the index block.
*/
@@ -2261,27 +2297,26 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
{
int err;
ext4_fsblk_t leaf;
+ int k = depth - 1;
/* free index block */
- depth--;
- path = path + depth;
- leaf = ext4_idx_pblock(path->p_idx);
- if (unlikely(path->p_hdr->eh_entries == 0)) {
- EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+ leaf = ext4_idx_pblock(path[k].p_idx);
+ if (unlikely(path[k].p_hdr->eh_entries == 0)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k);
return -EFSCORRUPTED;
}
- err = ext4_ext_get_access(handle, inode, path);
+ err = ext4_ext_get_access(handle, inode, path + k);
if (err)
return err;
- if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
- int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+ if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) {
+ int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx;
len *= sizeof(struct ext4_extent_idx);
- memmove(path->p_idx, path->p_idx + 1, len);
+ memmove(path[k].p_idx, path[k].p_idx + 1, len);
}
- le16_add_cpu(&path->p_hdr->eh_entries, -1);
- err = ext4_ext_dirty(handle, inode, path);
+ le16_add_cpu(&path[k].p_hdr->eh_entries, -1);
+ err = ext4_ext_dirty(handle, inode, path + k);
if (err)
return err;
ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
@@ -2290,18 +2325,29 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
- while (--depth >= 0) {
- if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+ while (--k >= 0) {
+ if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr))
break;
- path--;
- err = ext4_ext_get_access(handle, inode, path);
+ err = ext4_ext_get_access(handle, inode, path + k);
if (err)
- break;
- path->p_idx->ei_block = (path+1)->p_idx->ei_block;
- err = ext4_ext_dirty(handle, inode, path);
+ goto clean;
+ path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block;
+ err = ext4_ext_dirty(handle, inode, path + k);
if (err)
- break;
+ goto clean;
}
+ return 0;
+
+clean:
+ /*
+ * The path[k].p_bh is either unmodified or with no verified bit
+ * set (see ext4_ext_get_access()). So just clear the verified bit
+ * of the successfully modified extents buffers, which will force
+ * these extents to be checked to avoid using inconsistent data.
+ */
+ while (++k < depth)
+ clear_buffer_verified(path[k].p_bh);
+
return err;
}
@@ -2352,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents;
else
- index = depth * 3;
+ index = (EXT4_MAX_EXTENT_DEPTH * 3) +
+ DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0));
return index;
}
@@ -2593,9 +2641,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unwritten, ex_ee_len);
path[depth].p_ext = ex;
- a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block+ex_ee_len - 1 < end ?
- ex_ee_block+ex_ee_len - 1 : end;
+ a = max(ex_ee_block, start);
+ b = min(ex_ee_block + ex_ee_len - 1, end);
ext_debug(inode, " border %u:%u\n", a, b);
@@ -2778,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL;
partial.pclu = 0;
partial.lblk = 0;
@@ -2808,8 +2856,7 @@ again:
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL,
- EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
+ path = ext4_find_extent(inode, end, NULL, flags);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2844,7 +2891,7 @@ again:
* in use to avoid freeing it when removing blocks.
*/
if (sbi->s_cluster_ratio > 1) {
- pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
@@ -2855,11 +2902,12 @@ again:
* fail removing space due to ENOSPC so try to use
* reserved block if that happens.
*/
- err = ext4_force_split_extent_at(handle, inode, &path,
- end + 1, 1);
- if (err < 0)
+ path = ext4_force_split_extent_at(handle, inode, path,
+ end + 1, 1);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
goto out;
-
+ }
} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
partial.state == initial) {
/*
@@ -2874,8 +2922,8 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- &ex);
- if (err)
+ NULL, flags);
+ if (err < 0)
goto out;
if (pblk) {
partial.pclu = EXT4_B2C(sbi, pblk);
@@ -2917,8 +2965,7 @@ again:
err = ext4_ext_rm_leaf(handle, inode, path,
&partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
continue;
}
@@ -2950,9 +2997,8 @@ again:
ext_debug(inode, "move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ bh = read_extent_tree_block(inode, path[i].p_idx,
+ depth - i - 1, flags);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -2980,8 +3026,7 @@ again:
err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
ext_debug(inode, "return to level %d\n", i);
}
@@ -3022,8 +3067,7 @@ again:
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
path = NULL;
if (err == -EAGAIN)
goto again;
@@ -3083,7 +3127,7 @@ void ext4_ext_release(struct super_block *sb)
#endif
}
-static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
{
ext4_lblk_t ee_block;
ext4_fsblk_t ee_pblock;
@@ -3094,10 +3138,10 @@ static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
ee_pblock = ext4_ext_pblock(ex);
if (ee_len == 0)
- return 0;
+ return;
- return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN);
+ ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+ EXTENT_STATUS_WRITTEN, false);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -3125,22 +3169,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
*
*
* Splits extent [a, b] into two extents [a, @split) and [@split, b], states
- * of which are deterimined by split_flag.
+ * of which are determined by split_flag.
*
* There are two cases:
* a> the extent are splitted into two extent.
* b> split is not needed, and just mark the extent.
*
- * return 0 on success.
+ * Return an extent path pointer on success, or an error pointer on failure.
*/
-static int ext4_split_extent_at(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- ext4_lblk_t split,
- int split_flag,
- int flags)
+static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag, int flags)
{
- struct ext4_ext_path *path = *ppath;
ext4_fsblk_t newblock;
ext4_lblk_t ee_block;
struct ext4_extent *ex, newex, orig_ex, zero_ex;
@@ -3182,7 +3224,7 @@ static int ext4_split_extent_at(handle_t *handle,
else
ext4_ext_mark_initialized(ex);
- if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
ext4_ext_try_to_merge(handle, inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3210,8 +3252,32 @@ static int ext4_split_extent_at(handle_t *handle,
if (split_flag & EXT4_EXT_MARK_UNWRIT2)
ext4_ext_mark_unwritten(ex2);
- err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
- if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (!IS_ERR(path))
+ goto out;
+
+ err = PTR_ERR(path);
+ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
+ return path;
+
+ /*
+ * Get a new path to try to zeroout or fix the extent length.
+ * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent()
+ * will not return -ENOMEM, otherwise -ENOMEM will cause a
+ * retry in do_writepages(), and a WARN_ON may be triggered
+ * in ext4_da_update_reserve_space() due to an incorrect
+ * ee_len causing the i_reserved_data_blocks exception.
+ */
+ path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL);
+ if (IS_ERR(path)) {
+ EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
+ split, PTR_ERR(path));
+ return path;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
if (split_flag & EXT4_EXT_DATA_VALID1) {
err = ext4_ext_zeroout(inode, ex2);
@@ -3237,25 +3303,22 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_pblock(&orig_ex));
}
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- if (err)
- goto fix_extent_len;
-
- /* update extent status tree */
- err = ext4_zeroout_es(inode, &zero_ex);
-
- goto out;
- } else if (err)
- goto fix_extent_len;
-
-out:
- ext4_ext_show_leaf(inode, path);
- return err;
+ if (!err) {
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (!err)
+ /* update extent status tree */
+ ext4_zeroout_es(inode, &zero_ex);
+ /* If we failed at this point, we don't know in which
+ * state the extent tree exactly is so don't try to fix
+ * length of the original extent as it may do even more
+ * damage.
+ */
+ goto out;
+ }
+ }
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
@@ -3264,11 +3327,17 @@ fix_extent_len:
* and err is a non-zero error code.
*/
ext4_ext_dirty(handle, inode, path + path->p_depth);
- return err;
+out:
+ if (err) {
+ ext4_free_ext_path(path);
+ path = ERR_PTR(err);
+ }
+ ext4_ext_show_leaf(inode, path);
+ return path;
}
/*
- * ext4_split_extents() splits an extent and mark extent which is covered
+ * ext4_split_extent() splits an extent and mark extent which is covered
* by @map as split_flags indicates
*
* It may result in splitting the extent into multiple extents (up to three)
@@ -3278,21 +3347,18 @@ fix_extent_len:
* c> Splits in three extents: Somone is splitting in middle of the extent
*
*/
-static int ext4_split_extent(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_map_blocks *map,
- int split_flag,
- int flags)
+static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag, int flags,
+ unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
unsigned int ee_len, depth;
- int err = 0;
int unwritten;
int split_flag1, flags1;
- int allocated = map->m_len;
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3302,35 +3368,33 @@ static int ext4_split_extent(handle_t *handle,
if (map->m_lblk + map->m_len < ee_block + ee_len) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
- flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (unwritten)
split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
if (split_flag & EXT4_EXT_DATA_VALID2)
split_flag1 |= EXT4_EXT_DATA_VALID1;
- err = ext4_split_extent_at(handle, inode, ppath,
+ path = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
- if (err)
- goto out;
- } else {
- allocated = ee_len - (map->m_lblk - ee_block);
- }
- /*
- * Update path is required because previous ext4_split_extent_at() may
- * result in split of original leaf or extent zeroout.
- */
- path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
- if (IS_ERR(path))
- return PTR_ERR(path);
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- if (!ex) {
- EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
- (unsigned long) map->m_lblk);
- return -EFSCORRUPTED;
+ if (IS_ERR(path))
+ return path;
+ /*
+ * Update path is required because previous ext4_split_extent_at
+ * may result in split of original leaf or extent zeroout.
+ */
+ path = ext4_find_extent(inode, map->m_lblk, path, flags);
+ if (IS_ERR(path))
+ return path;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ ext4_free_ext_path(path);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
}
- unwritten = ext4_ext_is_unwritten(ex);
- split_flag1 = 0;
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
@@ -3339,15 +3403,20 @@ static int ext4_split_extent(handle_t *handle,
split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
EXT4_EXT_MARK_UNWRIT2);
}
- err = ext4_split_extent_at(handle, inode, ppath,
+ path = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
- if (err)
- goto out;
+ if (IS_ERR(path))
+ return path;
}
+ if (allocated) {
+ if (map->m_lblk + map->m_len > ee_block + ee_len)
+ *allocated = ee_len - (map->m_lblk - ee_block);
+ else
+ *allocated = map->m_len;
+ }
ext4_ext_show_leaf(inode, path);
-out:
- return err ? err : allocated;
+ return path;
}
/*
@@ -3370,13 +3439,11 @@ out:
* that are allocated and initialized.
* It is guaranteed to be >= map->m_len.
*/
-static int ext4_ext_convert_to_initialized(handle_t *handle,
- struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
- int flags)
+static struct ext4_ext_path *
+ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, struct ext4_ext_path *path,
+ int flags, unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
@@ -3384,9 +3451,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_extent *ex, *abut_ex;
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth, map_len = map->m_len;
- int allocated = 0, max_zeroout = 0;
int err = 0;
int split_flag = EXT4_EXT_DATA_VALID2;
+ unsigned int max_zeroout = 0;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)map->m_lblk, map_len);
@@ -3426,6 +3493,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* - L2: we only attempt to merge with an extent stored in the
* same extent tree node.
*/
+ *allocated = 0;
if ((map->m_lblk == ee_block) &&
/* See if we can merge left */
(map_len < ee_len) && /*L1*/
@@ -3455,7 +3523,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, abut_ex);
@@ -3470,7 +3538,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = map_len;
+ *allocated = map_len;
}
} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
(map_len < ee_len) && /*L1*/
@@ -3501,7 +3569,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, abut_ex);
@@ -3516,18 +3584,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_len = cpu_to_le16(next_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = map_len;
+ *allocated = map_len;
}
}
- if (allocated) {
+ if (*allocated) {
/* Mark the block containing both extents as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
+ if (err)
+ goto errout;
goto out;
} else
- allocated = ee_len - (map->m_lblk - ee_block);
+ *allocated = ee_len - (map->m_lblk - ee_block);
WARN_ON(map->m_lblk < ee_block);
/*
@@ -3554,21 +3624,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_lblk = map->m_lblk;
split_map.m_len = map->m_len;
- if (max_zeroout && (allocated > split_map.m_len)) {
- if (allocated <= max_zeroout) {
+ if (max_zeroout && (*allocated > split_map.m_len)) {
+ if (*allocated <= max_zeroout) {
/* case 3 or 5 */
zero_ex1.ee_block =
cpu_to_le32(split_map.m_lblk +
split_map.m_len);
zero_ex1.ee_len =
- cpu_to_le16(allocated - split_map.m_len);
+ cpu_to_le16(*allocated - split_map.m_len);
ext4_ext_store_pblock(&zero_ex1,
ext4_ext_pblock(ex) + split_map.m_lblk +
split_map.m_len - ee_block);
err = ext4_ext_zeroout(inode, &zero_ex1);
if (err)
- goto out;
- split_map.m_len = allocated;
+ goto fallback;
+ split_map.m_len = *allocated;
}
if (split_map.m_lblk - ee_block + split_map.m_len <
max_zeroout) {
@@ -3581,27 +3651,29 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_pblock(ex));
err = ext4_ext_zeroout(inode, &zero_ex2);
if (err)
- goto out;
+ goto fallback;
}
split_map.m_len += split_map.m_lblk - ee_block;
split_map.m_lblk = ee_block;
- allocated = map->m_len;
+ *allocated = map->m_len;
}
}
- err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
- flags);
- if (err > 0)
- err = 0;
+fallback:
+ path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
+ flags, NULL);
+ if (IS_ERR(path))
+ return path;
out:
/* If we have gotten a failure, don't zero out status tree */
- if (!err) {
- err = ext4_zeroout_es(inode, &zero_ex1);
- if (!err)
- err = ext4_zeroout_es(inode, &zero_ex2);
- }
- return err ? err : allocated;
+ ext4_zeroout_es(inode, &zero_ex1);
+ ext4_zeroout_es(inode, &zero_ex2);
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -3626,15 +3698,16 @@ out:
* being filled will be convert to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
*
- * Returns the size of unwritten extent to be written on success.
+ * The size of unwritten extent to be written is passed to the caller via the
+ * allocated pointer. Return an extent path pointer on success, or an error
+ * pointer on failure.
*/
-static int ext4_split_convert_extents(handle_t *handle,
+static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
- int flags)
+ struct ext4_ext_path *path,
+ int flags, unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
ext4_lblk_t eof_block;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
@@ -3648,10 +3721,6 @@ static int ext4_split_convert_extents(handle_t *handle,
>> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
- /*
- * It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
- */
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
@@ -3662,20 +3731,24 @@ static int ext4_split_convert_extents(handle_t *handle,
split_flag |= EXT4_EXT_DATA_VALID1;
/* Convert to initialized */
} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully inside i_size or new_size.
+ */
split_flag |= ee_block + ee_len <= eof_block ?
EXT4_EXT_MAY_ZEROOUT : 0;
split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
}
- flags |= EXT4_GET_BLOCKS_PRE_IO;
- return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
+ flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
+ return ext4_split_extent(handle, inode, path, map, split_flag, flags,
+ allocated);
}
-static int ext4_convert_unwritten_extents_endio(handle_t *handle,
- struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath)
+static struct ext4_ext_path *
+ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3703,20 +3776,21 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
- err = ext4_split_convert_extents(handle, inode, map, ppath,
- EXT4_GET_BLOCKS_CONVERT);
- if (err < 0)
- return err;
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT, NULL);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
+
+ path = ext4_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
@@ -3727,18 +3801,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-out:
+ if (err)
+ goto errout;
+
ext4_ext_show_leaf(inode, path);
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
-static int
+static struct ext4_ext_path *
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
+ struct ext4_ext_path *path,
unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3761,25 +3840,27 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
(unsigned long long)ee_block, ee_len);
if (ee_block != map->m_lblk || ee_len > map->m_len) {
- err = ext4_split_convert_extents(handle, inode, map, ppath,
- EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
- if (err < 0)
- return err;
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
+
+ path = ext4_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- return err;
+ goto errout;
/* first mark the extent as unwritten */
ext4_ext_mark_unwritten(ex);
@@ -3791,7 +3872,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (err)
- return err;
+ goto errout;
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3800,22 +3881,24 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
- return 0;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
-static int
+static struct ext4_ext_path *
ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath, int flags,
- unsigned int allocated, ext4_fsblk_t newblock)
+ struct ext4_ext_path *path, int flags,
+ unsigned int *allocated, ext4_fsblk_t newblock)
{
- struct ext4_ext_path __maybe_unused *path = *ppath;
- int ret = 0;
int err = 0;
ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
(unsigned long long)map->m_lblk, map->m_len, flags,
- allocated);
+ *allocated);
ext4_ext_show_leaf(inode, path);
/*
@@ -3825,36 +3908,34 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
- allocated, newblock);
+ *allocated, newblock);
/* get_block() before submitting IO, split the extent */
- if (flags & EXT4_GET_BLOCKS_PRE_IO) {
- ret = ext4_split_convert_extents(handle, inode, map, ppath,
- flags | EXT4_GET_BLOCKS_CONVERT);
- if (ret < 0) {
- err = ret;
- goto out2;
- }
+ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) {
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ flags | EXT4_GET_BLOCKS_CONVERT, allocated);
+ if (IS_ERR(path))
+ return path;
/*
- * shouldn't get a 0 return when splitting an extent unless
+ * shouldn't get a 0 allocated when splitting an extent unless
* m_len is 0 (bug) or extent has been corrupted
*/
- if (unlikely(ret == 0)) {
+ if (unlikely(*allocated == 0)) {
EXT4_ERROR_INODE(inode,
- "unexpected ret == 0, m_len = %u",
+ "unexpected allocated == 0, m_len = %u",
map->m_len);
err = -EFSCORRUPTED;
- goto out2;
+ goto errout;
}
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
- err = ext4_convert_unwritten_extents_endio(handle, inode, map,
- ppath);
- if (err < 0)
- goto out2;
+ path = ext4_convert_unwritten_extents_endio(handle, inode,
+ map, path);
+ if (IS_ERR(path))
+ return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
goto map_out;
}
@@ -3886,36 +3967,37 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
* For buffered writes, at writepage time, etc. Convert a
* discovered unwritten extent to written.
*/
- ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
- if (ret < 0) {
- err = ret;
- goto out2;
- }
+ path = ext4_ext_convert_to_initialized(handle, inode, map, path,
+ flags, allocated);
+ if (IS_ERR(path))
+ return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
/*
- * shouldn't get a 0 return when converting an unwritten extent
+ * shouldn't get a 0 allocated when converting an unwritten extent
* unless m_len is 0 (bug) or extent has been corrupted
*/
- if (unlikely(ret == 0)) {
- EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
+ if (unlikely(*allocated == 0)) {
+ EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u",
map->m_len);
err = -EFSCORRUPTED;
- goto out2;
+ goto errout;
}
out:
- allocated = ret;
map->m_flags |= EXT4_MAP_NEW;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
out1:
map->m_pblk = newblock;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
+ if (*allocated > map->m_len)
+ *allocated = map->m_len;
+ map->m_len = *allocated;
ext4_ext_show_leaf(inode, path);
-out2:
- return err ? err : allocated;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -4020,6 +4102,73 @@ static int get_implied_cluster_alloc(struct super_block *sb,
return 0;
}
+/*
+ * Determine hole length around the given logical block, first try to
+ * locate and expand the hole from the given @path, and then adjust it
+ * if it's partially or completely converted to delayed extents, insert
+ * it into the extent cache tree if it's indeed a hole, finally return
+ * the length of the determined extent.
+ */
+static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t lblk)
+{
+ ext4_lblk_t hole_start, len;
+ struct extent_status es;
+
+ hole_start = lblk;
+ len = ext4_ext_find_hole(inode, path, &hole_start);
+again:
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
+ hole_start + len - 1, &es);
+ if (!es.es_len)
+ goto insert_hole;
+
+ /*
+ * There's a delalloc extent in the hole, handle it if the delalloc
+ * extent is in front of, behind and straddle the queried range.
+ */
+ if (lblk >= es.es_lblk + es.es_len) {
+ /*
+ * The delalloc extent is in front of the queried range,
+ * find again from the queried start block.
+ */
+ len -= lblk - hole_start;
+ hole_start = lblk;
+ goto again;
+ } else if (in_range(lblk, es.es_lblk, es.es_len)) {
+ /*
+ * The delalloc extent containing lblk, it must have been
+ * added after ext4_map_blocks() checked the extent status
+ * tree so we are not holding i_rwsem and delalloc info is
+ * only stabilized by i_data_sem we are going to release
+ * soon. Don't modify the extent status tree and report
+ * extent as a hole, just adjust the length to the delalloc
+ * extent's after lblk.
+ */
+ len = es.es_lblk + es.es_len - lblk;
+ return len;
+ } else {
+ /*
+ * The delalloc extent is partially or completely behind
+ * the queried range, update hole length until the
+ * beginning of the delalloc extent.
+ */
+ len = min(es.es_lblk - hole_start, len);
+ }
+
+insert_hole:
+ /* Put just found gap into cache to speed up subsequent requests */
+ ext_debug(inode, " -> %u:%u\n", hole_start, len);
+ ext4_es_insert_extent(inode, hole_start, len, ~0,
+ EXTENT_STATUS_HOLE, false);
+
+ /* Update hole_len to reflect hole size after lblk */
+ if (hole_start != lblk)
+ len -= lblk - hole_start;
+
+ return len;
+}
/*
* Block allocation/map/preallocation routine for extents based files
@@ -4027,10 +4176,10 @@ static int get_implied_cluster_alloc(struct super_block *sb,
*
* Need to be called with
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*
- * return > 0, number of of blocks already mapped/allocated
- * if create == 0 and these are pre-allocated blocks
+ * return > 0, number of blocks already mapped/allocated
+ * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
* buffer head is unmapped
* otherwise blocks are mapped
*
@@ -4043,10 +4192,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent newex, *ex, *ex2;
+ struct ext4_extent newex, *ex, ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0, pblk;
- int err = 0, depth, ret;
+ int err = 0, depth;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
@@ -4056,10 +4205,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
- path = NULL;
goto out;
}
@@ -4108,8 +4256,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- err = convert_initialized_extent(handle,
- inode, map, &path, &allocated);
+ path = convert_initialized_extent(handle,
+ inode, map, path, &allocated);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
goto out;
} else if (!ext4_ext_is_unwritten(ex)) {
map->m_flags |= EXT4_MAP_MAPPED;
@@ -4121,38 +4271,26 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
goto out;
}
- ret = ext4_ext_handle_unwritten_extents(
- handle, inode, map, &path, flags,
- allocated, newblock);
- if (ret < 0)
- err = ret;
- else
- allocated = ret;
+ path = ext4_ext_handle_unwritten_extents(
+ handle, inode, map, path, flags,
+ &allocated, newblock);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
goto out;
}
}
/*
* requested block isn't allocated yet;
- * we couldn't try to create block if create flag is zero
+ * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- ext4_lblk_t hole_start, hole_len;
+ ext4_lblk_t len;
- hole_start = map->m_lblk;
- hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
- /*
- * put just found gap into cache to speed up
- * subsequent requests
- */
- ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+ len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);
- /* Update hole_len to reflect hole size after map->m_lblk */
- if (hole_start != map->m_lblk)
- hole_len -= map->m_lblk - hole_start;
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, hole_len);
-
+ map->m_len = min_t(unsigned int, map->m_len, len);
goto out;
}
@@ -4179,17 +4317,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- ex2 = NULL;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
- if (err)
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
+ &ex2, flags);
+ if (err < 0)
goto out;
/* Check if the extent after searching to the right implies a
* cluster we can use. */
- if ((sbi->s_cluster_ratio > 1) && ex2 &&
- get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ if ((sbi->s_cluster_ratio > 1) && err &&
+ get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
+ err = 0;
goto got_allocated_blocks;
}
@@ -4262,8 +4401,9 @@ got_allocated_blocks:
map->m_flags |= EXT4_MAP_UNWRITTEN;
}
- err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
- if (err) {
+ path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
if (allocated_clusters) {
int fb_flags = 0;
@@ -4283,43 +4423,6 @@ got_allocated_blocks:
}
/*
- * Reduce the reserved cluster count to reflect successful deferred
- * allocation of delayed allocated clusters or direct allocation of
- * clusters discovered to be delayed allocated. Once allocated, a
- * cluster is not included in the reserved count.
- */
- if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- /*
- * When allocating delayed allocated clusters, simply
- * reduce the reserved cluster count and claim quota
- */
- ext4_da_update_reserve_space(inode, allocated_clusters,
- 1);
- } else {
- ext4_lblk_t lblk, len;
- unsigned int n;
-
- /*
- * When allocating non-delayed allocated clusters
- * (from fallocate, filemap, DIO, or clusters
- * allocated when delalloc has been disabled by
- * ext4_nonda_switch), reduce the reserved cluster
- * count by the number of allocated clusters that
- * have previously been delayed allocated. Quota
- * has been claimed by ext4_mb_new_blocks() above,
- * so release the quota reservations made for any
- * previously delayed allocated clusters.
- */
- lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
- len = allocated_clusters << sbi->s_cluster_bits;
- n = ext4_es_delayed_clu(inode, lblk, len);
- if (n > 0)
- ext4_da_update_reserve_space(inode, (int) n, 0);
- }
- }
-
- /*
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an unwritten extent.
*/
@@ -4333,10 +4436,22 @@ got_allocated_blocks:
map->m_len = ar.len;
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
-
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ /*
+ * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag.
+ * So we know that the depth used here is correct, since there was no
+ * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set.
+ * If tomorrow we start using this QUERY flag with CREATE, then we will
+ * need to re-calculate the depth as it might have changed due to block
+ * allocation.
+ */
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) {
+ WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE);
+ if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
+ map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF;
+ }
+
+ ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
@@ -4363,21 +4478,12 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
-retry:
- err = ext4_es_remove_extent(inode, last_block,
- EXT_MAX_BLOCKS - last_block);
- if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry;
- }
- if (err)
- return err;
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
+
retry_remove_space:
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(GFP_ATOMIC);
goto retry_remove_space;
}
return err;
@@ -4389,13 +4495,14 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
{
struct inode *inode = file_inode(file);
handle_t *handle;
- int ret = 0;
- int ret2 = 0, ret3 = 0;
+ int ret = 0, ret2 = 0, ret3 = 0;
int retries = 0;
int depth = 0;
struct ext4_map_blocks map;
unsigned int credits;
- loff_t epos;
+ loff_t epos, old_size = i_size_read(inode);
+ unsigned int blkbits = inode->i_blkbits;
+ bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4409,13 +4516,24 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
/*
+ * Do the actual write zero during a running journal transaction
+ * costs a lot. First allocate an unwritten extent and then
+ * convert it to written after zeroing it out.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ flags &= ~EXT4_GET_BLOCKS_ZERO;
+ flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+ alloc_zero = true;
+ }
+
+ /*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
depth = ext_depth(inode);
retry:
- while (ret >= 0 && len) {
+ while (len) {
/*
* Recalculate credits when extent tree depth changes.
*/
@@ -4437,18 +4555,26 @@ retry:
inode->i_ino, map.m_lblk,
map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
+ ext4_journal_stop(handle);
break;
}
- map.m_lblk += ret;
- map.m_len = len = len - ret;
- epos = (loff_t)map.m_lblk << inode->i_blkbits;
- inode->i_ctime = current_time(inode);
+ /*
+ * allow a full retry cycle for any remaining allocations
+ */
+ retries = 0;
+ epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
+ inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
- inode->i_mtime = inode->i_ctime;
+ inode_set_mtime_to_ts(inode,
+ inode_get_ctime(inode));
+ if (epos > old_size) {
+ pagecache_isize_extended(inode, old_size, epos);
+ ext4_zero_partial_blocks(handle, inode,
+ old_size, epos - old_size);
+ }
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4456,135 +4582,99 @@ retry:
ret2 = ret3 ? ret3 : ret2;
if (unlikely(ret2))
break;
+
+ if (alloc_zero &&
+ (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+ ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret2))
+ ret2 = ext4_convert_unwritten_extents(NULL,
+ inode, (loff_t)map.m_lblk << blkbits,
+ (loff_t)map.m_len << blkbits);
+ if (ret2)
+ break;
+ }
+
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries)) {
- ret = 0;
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
- }
return ret > 0 ? ret2 : ret;
}
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
handle_t *handle = NULL;
- unsigned int max_blocks;
loff_t new_size = 0;
- int ret = 0;
- int flags;
- int credits;
- int partial_begin, partial_end;
- loff_t start, end;
- ext4_lblk_t lblk;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
+ unsigned int blocksize = i_blocksize(inode);
unsigned int blkbits = inode->i_blkbits;
+ int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
- /* Call ext4_force_commit to flush all data in case of data=journal. */
- if (ext4_should_journal_data(inode)) {
- ret = ext4_force_commit(inode->i_sb);
- if (ret)
- return ret;
- }
-
- /*
- * Round up offset. This is not fallocate, we neet to zero out
- * blocks, so convert interior block aligned part of the range to
- * unwritten and possibly manually zero out unaligned parts of the
- * range.
- */
- start = round_up(offset, 1 << blkbits);
- end = round_down((offset + len), 1 << blkbits);
-
- if (start < offset || end > offset + len)
- return -EINVAL;
- partial_begin = offset & ((1 << blkbits) - 1);
- partial_end = (offset + len) & ((1 << blkbits) - 1);
-
- lblk = start >> blkbits;
- max_blocks = (end >> blkbits);
- if (max_blocks < lblk)
- max_blocks = 0;
- else
- max_blocks -= lblk;
-
- inode_lock(inode);
-
- /*
- * Indirect files do not support unwritten extents
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
+ /* Indirect files do not support unwritten extents */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
- goto out_mutex;
+ return ret;
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- inode_dio_wait(inode);
-
/* Preallocate the range including the unaligned edges */
- if (partial_begin || partial_end) {
- ret = ext4_alloc_file_blocks(file,
- round_down(offset, 1 << blkbits) >> blkbits,
- (round_up((offset + len), 1 << blkbits) -
- round_down(offset, 1 << blkbits)) >> blkbits,
- new_size, flags);
- if (ret)
- goto out_mutex;
+ if (!IS_ALIGNED(offset | end, blocksize)) {
+ ext4_lblk_t alloc_lblk = offset >> blkbits;
+ ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
+ ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
+ new_size, flags);
+ if (ret)
+ return ret;
}
- /* Zero range excluding the unaligned edges */
- if (max_blocks > 0) {
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
- EXT4_EX_NOCACHE);
-
- /*
- * Prevent page faults from reinstantiating pages we have
- * released from page cache.
- */
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret)
+ return ret;
- ret = ext4_break_layouts(inode);
- if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
- goto out_mutex;
- }
+ /* Now release the pages and zero block aligned part of pages */
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
- ret = ext4_update_disksize_before_punch(inode, offset, len);
- if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
- goto out_mutex;
- }
- /* Now release the pages and zero block aligned part of pages */
- truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ /* Zero range excluding the unaligned edges */
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> blkbits;
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t zero_blks = end_lblk - start_lblk;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
- flags);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ if (mode & FALLOC_FL_WRITE_ZEROES)
+ flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+ else
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
+ ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
+ new_size, flags);
if (ret)
- goto out_mutex;
+ return ret;
}
- if (!partial_begin && !partial_end)
- goto out_mutex;
+ /* Finish zeroing out if it doesn't contain partial block */
+ if (IS_ALIGNED(offset | end, blocksize))
+ return ret;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4597,28 +4687,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_mutex;
+ return ret;
}
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret)
+ goto out_handle;
+
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- /* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(handle, inode, offset, len);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
-out_mutex:
- inode_unlock(inode);
+ return ret;
+}
+
+static long ext4_do_fallocate(struct file *file, loff_t offset,
+ loff_t len, int mode)
+{
+ struct inode *inode = file_inode(file);
+ loff_t end = offset + len;
+ loff_t new_size = 0;
+ ext4_lblk_t start_lblk, len_lblk;
+ int ret;
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
+
+ /* We only support preallocation for extent-based files only. */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ if (ret)
+ goto out;
+
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
+ }
+out:
+ trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
return ret;
}
@@ -4632,12 +4763,8 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- loff_t new_size = 0;
- unsigned int max_blocks;
- int ret = 0;
- int flags;
- ext4_lblk_t lblk;
- unsigned int blkbits = inode->i_blkbits;
+ struct address_space *mapping = file->f_mapping;
+ int ret;
/*
* Encrypted inodes can't handle collapse range or insert
@@ -4648,69 +4775,158 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the underlying device does not
+ * enable the unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+ return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_INSERT_RANGE))
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(inode, offset, len);
-
+ inode_lock(inode);
ret = ext4_convert_inline_data(inode);
if (ret)
- return ret;
-
- if (mode & FALLOC_FL_COLLAPSE_RANGE)
- return ext4_collapse_range(inode, offset, len);
-
- if (mode & FALLOC_FL_INSERT_RANGE)
- return ext4_insert_range(inode, offset, len);
+ goto out_inode_lock;
- if (mode & FALLOC_FL_ZERO_RANGE)
- return ext4_zero_range(file, offset, len, mode);
-
- trace_ext4_fallocate_enter(inode, offset, len, mode);
- lblk = offset >> blkbits;
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
- max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+ ret = file_modified(file);
+ if (ret)
+ goto out_inode_lock;
- inode_lock(inode);
+ if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
+ ret = ext4_do_fallocate(file, offset, len, mode);
+ goto out_inode_lock;
+ }
/*
- * We only support preallocation for extent-based files only
+ * Follow-up operations will drop page cache, hold invalidate lock
+ * to prevent page faults from reinstantiating pages we have
+ * released from page cache.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ filemap_invalidate_lock(mapping);
+
+ ret = ext4_break_layouts(inode);
+ if (ret)
+ goto out_invalidate_lock;
+
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
+ ret = ext4_punch_hole(file, offset, len);
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ ret = ext4_collapse_range(file, offset, len);
+ break;
+ case FALLOC_FL_INSERT_RANGE:
+ ret = ext4_insert_range(file, offset, len);
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
+ ret = ext4_zero_range(file, offset, len, mode);
+ break;
+ default:
ret = -EOPNOTSUPP;
- goto out;
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
- ret = inode_newsize_ok(inode, new_size);
- if (ret)
- goto out;
+out_invalidate_lock:
+ filemap_invalidate_unlock(mapping);
+out_inode_lock:
+ inode_unlock(inode);
+ return ret;
+}
+
+/*
+ * This function converts a range of blocks to written extents. The caller of
+ * this function will pass the start offset and the size. all unwritten extents
+ * within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end io call back function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversion should
+ * happen atomically in a single fs journal transaction. We mainly only allocate
+ * unwritten extents either on a hole on a pre-exiting unwritten extent range in
+ * ext4_map_blocks_atomic_write(). The only case where we can have multiple
+ * unwritten extents in a range [offset, offset+len) is when there is a split
+ * unwritten extent between two leaf nodes which was cached in extent status
+ * cache during ext4_iomap_alloc() time. That will allow
+ * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going
+ * into the slow path. That means we might need a loop for conversion of this
+ * unwritten extent split across leaf block within a single journal transaction.
+ * Split extents across leaf nodes is a rare case, but let's still handle that
+ * to meet the requirements of multi-fsblock atomic writes.
+ *
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
+{
+ unsigned int max_blocks;
+ int ret = 0, ret2 = 0, ret3 = 0;
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
+ int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE;
+
+ map.m_lblk = offset >> blkbits;
+ max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
+
+ if (!handle) {
+ /*
+ * TODO: An optimization can be added later by having an extent
+ * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that
+ * it can tell if the extent in the cache is a split extent.
+ * But for now let's assume pextents as 2 always.
+ */
+ credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- inode_dio_wait(inode);
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+ }
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
- if (ret)
- goto out;
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret != max_blocks)
+ ext4_msg(inode->i_sb, KERN_INFO,
+ "inode #%lu: block %u: len %u: "
+ "split block mapping found for atomic write, "
+ "ret = %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ if (ret <= 0)
+ break;
+ }
- if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
}
-out:
- inode_unlock(inode);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
- return ret;
+
+ if (ret <= 0 || ret2)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "returned %d or %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret, ret2);
+
+ return ret > 0 ? ret2 : ret;
}
/*
@@ -4752,8 +4968,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
break;
}
}
+ /*
+ * Do not cache any unrelated extents, as it does not hold the
+ * i_rwsem or invalidate_lock, which could corrupt the extent
+ * status tree.
+ */
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+ EXT4_EX_NOCACHE);
if (ret <= 0)
ext4_warning(inode->i_sb,
"inode #%lu: block %u: len %u: "
@@ -4775,7 +4997,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
- int ret, err = 0;
+ int ret = 0, err = 0;
struct ext4_io_end_vec *io_end_vec;
/*
@@ -4864,12 +5086,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
@@ -4889,10 +5106,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
int error = 0;
+ inode_lock_shared(inode);
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
- return error;
+ goto unlock;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
@@ -4903,15 +5121,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
error = ext4_fiemap_check_ranges(inode, start, &len);
if (error)
- return error;
+ goto unlock;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
- return iomap_fiemap(inode, fieinfo, start, len,
- &ext4_iomap_xattr_ops);
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_report_ops);
}
-
- return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
+unlock:
+ inode_unlock_shared(inode);
+ return error;
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -4932,7 +5154,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ inode_lock_shared(inode);
error = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
if (error)
return error;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
@@ -4960,36 +5184,6 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
-{
- int credits, err;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- /*
- * Check if need to extend journal credits
- * 3 for leaf, sb, and inode plus 2 (bmap and group
- * descriptor) for each block group; assume two block
- * groups
- */
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
- if (err < 0)
- return err;
-
- err = ext4_ext_get_access(handle, inode, path);
- return err;
-}
-
-/*
* ext4_ext_shift_path_extents:
* Shift the extents of a path structure lying between path[depth].p_ext
* and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
@@ -5003,6 +5197,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
bool update = false;
+ int credits, restart_credits;
depth = path->p_depth;
while (depth >= 0) {
@@ -5012,13 +5207,26 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ /* leaf + sb + inode */
+ credits = 3;
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+ update = true;
+ /* extent tree + sb + inode */
+ credits = depth + 2;
+ }
- err = ext4_access_path(handle, inode, path + depth);
- if (err)
+ restart_credits = ext4_chunk_trans_extent(inode, 0);
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ restart_credits, 0);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
- if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
- update = true;
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
while (ex_start <= ex_last) {
if (SHIFT == SHIFT_LEFT) {
@@ -5049,7 +5257,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
}
/* Update index too */
- err = ext4_access_path(handle, inode, path + depth);
+ err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
@@ -5088,6 +5296,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
int ret = 0, depth;
struct ext4_extent *extent;
ext4_lblk_t stop, *iterator, ex_start, ex_end;
+ ext4_lblk_t tmp = EXT_MAX_BLOCKS;
/* Let path point to the last extent */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
@@ -5108,7 +5317,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* won't be shifted beyond EXT_MAX_BLOCKS.
*/
if (SHIFT == SHIFT_LEFT) {
- path = ext4_find_extent(inode, start - 1, &path,
+ path = ext4_find_extent(inode, start - 1, path,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -5141,18 +5350,23 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* till we reach stop. In case of right shift, iterator points to stop
* and it is decreased till we reach start.
*/
+again:
+ ret = 0;
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
iterator = &stop;
+ if (tmp != EXT_MAX_BLOCKS)
+ *iterator = tmp;
+
/*
* Its safe to start updating extents. Start and stop are unsigned, so
* in case of right shift if extent with 0 block is reached, iterator
* becomes NULL to indicate the end of the loop.
*/
while (iterator && start <= stop) {
- path = ext4_find_extent(inode, *iterator, &path,
+ path = ext4_find_extent(inode, *iterator, path,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -5174,30 +5388,40 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
}
}
+ tmp = *iterator;
if (SHIFT == SHIFT_LEFT) {
extent = EXT_LAST_EXTENT(path[depth].p_hdr);
*iterator = le32_to_cpu(extent->ee_block) +
ext4_ext_get_actual_len(extent);
} else {
extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
- if (le32_to_cpu(extent->ee_block) > 0)
+ if (le32_to_cpu(extent->ee_block) > start)
*iterator = le32_to_cpu(extent->ee_block) - 1;
- else
- /* Beginning is reached, end of the loop */
+ else if (le32_to_cpu(extent->ee_block) == start)
iterator = NULL;
- /* Update path extent in case we need to stop */
- while (le32_to_cpu(extent->ee_block) < start)
+ else {
+ extent = EXT_LAST_EXTENT(path[depth].p_hdr);
+ while (le32_to_cpu(extent->ee_block) >= start)
+ extent--;
+
+ if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
+ break;
+
extent++;
+ iterator = NULL;
+ }
path[depth].p_ext = extent;
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
handle, SHIFT);
+ /* iterator can be NULL which means we should break */
+ if (ret == -EAGAIN)
+ goto again;
if (ret)
break;
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -5206,120 +5430,79 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t punch_start, punch_stop;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
- loff_t new_size, ioffset;
+ loff_t start, new_size;
int ret;
- /*
- * We need to test this early because xfstests assumes that a
- * collapse range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support collapse range.
- */
+ trace_ext4_collapse_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Collapse range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_collapse_range(inode, offset, len);
-
- punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- /* Call ext4_force_commit to flush all data in case of data=journal. */
- if (ext4_should_journal_data(inode)) {
- ret = ext4_force_commit(inode->i_sb);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- down_write(&EXT4_I(inode)->i_mmap_sem);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_mmap;
+ if (end >= inode->i_size)
+ return -EINVAL;
/*
+ * Write tail of the last page before removed range and data that
+ * will be shifted since they will get removed from the page cache
+ * below. We are also protected from pages becoming dirty by
+ * i_rwsem and invalidate_lock.
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
- /*
- * Write tail of the last page before removed range since it will get
- * removed from the page cache below.
- */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
- if (ret)
- goto out_mmap;
- /*
- * Write data that will be shifted to preserve them when discarding
- * page cache below. We are also protected from pages becoming dirty
- * by i_mmap_sem.
- */
- ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
- LLONG_MAX);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, offset);
+ if (!ret)
+ ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ return ret;
+
+ truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+
+ start_lblk = offset >> inode->i_blkbits;
+ end_lblk = (offset + len) >> inode->i_blkbits;
+
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
- ret = ext4_es_remove_extent(inode, punch_start,
- EXT_MAX_BLOCKS - punch_start);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
-
- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
ext4_discard_preallocations(inode);
- ret = ext4_ext_shift_extents(inode, handle, punch_stop,
- punch_stop - punch_start, SHIFT_LEFT);
+ ret = ext4_ext_shift_extents(inode, handle, end_lblk,
+ end_lblk - start_lblk, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
new_size = inode->i_size - len;
@@ -5327,18 +5510,16 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize = new_size;
up_write(&EXT4_I(inode)->i_data_sem);
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = current_time(inode);
ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ goto out_handle;
+
ext4_update_inode_fsync_trans(handle, inode, 1);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -5350,107 +5531,73 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
- ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+ ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
- int ret = 0, depth, split_flag = 0;
- loff_t ioffset;
+ int ret, depth, split_flag = 0;
+ loff_t start;
- /*
- * We need to test this early because xfstests assumes that an
- * insert range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support insert range.
- */
+ trace_ext4_insert_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Insert range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_insert_range(inode, offset, len);
-
- offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
-
- /* Call ext4_force_commit to flush all data in case of data=journal */
- if (ext4_should_journal_data(inode)) {
- ret = ext4_force_commit(inode->i_sb);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Check whether the maximum file size would be exceeded */
- if (len > inode->i_sb->s_maxbytes - inode->i_size) {
- ret = -EFBIG;
- goto out_mutex;
- }
-
/* Offset must be less than i_size */
- if (offset >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
+ if (offset >= inode->i_size)
+ return -EINVAL;
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size)
+ return -EFBIG;
/*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
+ * Write out all dirty pages. Need to round down to align start offset
+ * to page size boundary for page size > block size.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
-
- ret = ext4_break_layouts(inode);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
- goto out_mmap;
+ return ret;
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode->i_ctime = current_time(inode);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
- goto out_stop;
+ goto out_handle;
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = len >> inode->i_blkbits;
+
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ path = ext4_find_extent(inode, start_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ ret = PTR_ERR(path);
+ goto out_handle;
}
depth = ext_depth(inode);
@@ -5460,59 +5607,47 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
ee_len = ext4_ext_get_actual_len(extent);
/*
- * If offset_lblk is not the starting block of extent, split
- * the extent @offset_lblk
+ * If start_lblk is not the starting block of extent, split
+ * the extent @start_lblk
*/
- if ((offset_lblk > ee_start_lblk) &&
- (offset_lblk < (ee_start_lblk + ee_len))) {
+ if ((start_lblk > ee_start_lblk) &&
+ (start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
- ret = ext4_split_extent_at(handle, inode, &path,
- offset_lblk, split_flag,
+ path = ext4_split_extent_at(handle, inode, path,
+ start_lblk, split_flag,
EXT4_EX_NOCACHE |
- EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_SPLIT_NOMERGE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
- ext4_ext_drop_refs(path);
- kfree(path);
- if (ret < 0) {
+ if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ ret = PTR_ERR(path);
+ goto out_handle;
}
- } else {
- ext4_ext_drop_refs(path);
- kfree(path);
}
- ret = ext4_es_remove_extent(inode, offset_lblk,
- EXT_MAX_BLOCKS - offset_lblk);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_free_ext_path(path);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
- * if offset_lblk lies in a hole which is at start of file, use
+ * if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
- len_lblk, SHIFT_RIGHT);
-
+ max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -5531,7 +5666,7 @@ out_mutex:
* stuff such as page-cache locking consistency, bh mapping consistency or
* extent's data copying must be performed by caller.
* Locking:
- * i_mutex is held for both inodes
+ * i_rwsem is held for both inodes
* i_data_sem is locked for write for both inodes
* Assumptions:
* All pages from requested range are locked for both inodes
@@ -5550,12 +5685,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
BUG_ON(!inode_is_locked(inode1));
BUG_ON(!inode_is_locked(inode2));
- *erp = ext4_es_remove_extent(inode1, lblk1, count);
- if (unlikely(*erp))
- return 0;
- *erp = ext4_es_remove_extent(inode2, lblk2, count);
- if (unlikely(*erp))
- return 0;
+ ext4_es_remove_extent(inode1, lblk1, count);
+ ext4_es_remove_extent(inode2, lblk2, count);
while (count) {
struct ext4_extent *ex1, *ex2, tmp_ex;
@@ -5563,25 +5694,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
int e1_len, e2_len, len;
int split = 0;
- path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+ path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE);
if (IS_ERR(path1)) {
*erp = PTR_ERR(path1);
- path1 = NULL;
- finish:
- count = 0;
- goto repeat;
+ goto errout;
}
- path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+ path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE);
if (IS_ERR(path2)) {
*erp = PTR_ERR(path2);
- path2 = NULL;
- goto finish;
+ goto errout;
}
ex1 = path1[path1->p_depth].p_ext;
ex2 = path2[path2->p_depth].p_ext;
- /* Do we have somthing to swap ? */
+ /* Do we have something to swap ? */
if (unlikely(!ex2 || !ex1))
- goto finish;
+ goto errout;
e1_blk = le32_to_cpu(ex1->ee_block);
e2_blk = le32_to_cpu(ex2->ee_block);
@@ -5603,7 +5730,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
next2 = e2_blk;
/* Do we have something to swap */
if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
- goto finish;
+ goto errout;
/* Move to the rightest boundary */
len = next1 - lblk1;
if (len < next2 - lblk2)
@@ -5613,28 +5740,32 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
lblk1 += len;
lblk2 += len;
count -= len;
- goto repeat;
+ continue;
}
/* Prepare left boundary */
if (e1_blk < lblk1) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1, 0);
- if (unlikely(*erp))
- goto finish;
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
+ goto errout;
+ }
}
if (e2_blk < lblk2) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2, 0);
- if (unlikely(*erp))
- goto finish;
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
+ goto errout;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
if (split)
- goto repeat;
+ continue;
/* Prepare right boundary */
len = count;
@@ -5645,30 +5776,34 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
if (len != e1_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1 + len, 0);
- if (unlikely(*erp))
- goto finish;
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1 + len, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
+ goto errout;
+ }
}
if (len != e2_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2 + len, 0);
- if (*erp)
- goto finish;
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2 + len, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
+ goto errout;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
if (split)
- goto repeat;
+ continue;
BUG_ON(e2_len != e1_len);
*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
/* Both extents are fully inside boundaries. Swap it now */
tmp_ex = *ex1;
@@ -5686,7 +5821,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
*erp = ext4_ext_dirty(handle, inode2, path2 +
path2->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
*erp = ext4_ext_dirty(handle, inode1, path1 +
path1->p_depth);
/*
@@ -5696,19 +5831,17 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
* aborted anyway.
*/
if (unlikely(*erp))
- goto finish;
+ goto errout;
+
lblk1 += len;
lblk2 += len;
replaced_count += len;
count -= len;
-
- repeat:
- ext4_ext_drop_refs(path1);
- kfree(path1);
- ext4_ext_drop_refs(path2);
- kfree(path2);
- path1 = path2 = NULL;
}
+
+errout:
+ ext4_free_ext_path(path1);
+ ext4_free_ext_path(path2);
return replaced_count;
}
@@ -5732,13 +5865,19 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
struct ext4_extent *extent;
ext4_lblk_t first_lblk, first_lclu, last_lclu;
+ /*
+ * if data can be stored inline, the logical cluster isn't
+ * mapped - no physical clusters have been allocated, and the
+ * file has no extents
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
+ ext4_has_inline_data(inode))
+ return 0;
+
/* search for the extent closest to the first block in the cluster */
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- path = NULL;
- goto out;
- }
+ if (IS_ERR(path))
+ return PTR_ERR(path);
depth = ext_depth(inode);
@@ -5785,8 +5924,266 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return err ? err : mapped;
}
+
+/*
+ * Updates physical block address and unwritten status of extent
+ * starting at lblk start and of len. If such an extent doesn't exist,
+ * this function splits the extent tree appropriately to create an
+ * extent like this. This function is called in the fast commit
+ * replay path. Returns 0 on success and error on failure.
+ */
+int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk)
+{
+ struct ext4_ext_path *path;
+ struct ext4_extent *ex;
+ int ret;
+
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (le32_to_cpu(ex->ee_block) != start ||
+ ext4_ext_get_actual_len(ex) != len) {
+ /* We need to split this extent to match our extent first */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ path = ext4_force_split_extent_at(NULL, inode, path, start, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ goto out;
+ }
+
+ path = ext4_find_extent(inode, start, path, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ex = path[path->p_depth].p_ext;
+ WARN_ON(le32_to_cpu(ex->ee_block) != start);
+
+ if (ext4_ext_get_actual_len(ex) != len) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ path = ext4_force_split_extent_at(NULL, inode, path,
+ start + len, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ goto out;
+ }
+
+ path = ext4_find_extent(inode, start, path, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ }
+ }
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_store_pblock(ex, pblk);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+out:
+ ext4_free_ext_path(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return ret;
+}
+
+/* Try to shrink the extent tree */
+void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t old_cur, cur = 0;
+
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ return;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_free_ext_path(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return;
+ }
+ old_cur = cur;
+ cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ if (cur <= old_cur)
+ cur = old_cur + 1;
+ ext4_ext_try_to_merge(NULL, inode, path, ex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_mark_inode_dirty(NULL, inode);
+ ext4_free_ext_path(path);
+ }
+}
+
+/* Check if *cur is a hole and if it is, skip it */
+static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
+{
+ int ret;
+ struct ext4_map_blocks map;
+
+ map.m_lblk = *cur;
+ map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret != 0)
+ return 0;
+ *cur = *cur + map.m_len;
+ return 0;
+}
+
+/* Count number of blocks used by this inode and update i_blocks */
+int ext4_ext_replay_set_iblocks(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL, *path2 = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int numblks = 0, i, ret = 0;
+ ext4_fsblk_t cmp1, cmp2;
+ struct ext4_map_blocks map;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex)
+ goto out;
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+
+ /* Count the number of data blocks */
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0)
+ numblks += ret;
+ cur = cur + map.m_len;
+ }
+
+ /*
+ * Count the number of extent tree blocks. We do it by looking up
+ * two successive extents and determining the difference between
+ * their paths. When path is different for 2 successive extents
+ * we compare the blocks in the path at each level and increment
+ * iblocks by total number of differences found.
+ */
+ cur = 0;
+ ret = skip_hole(inode, &cur);
+ if (ret < 0)
+ goto out;
+ path = ext4_find_extent(inode, cur, path, 0);
+ if (IS_ERR(path))
+ goto out;
+ numblks += path->p_depth;
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, path, 0);
+ if (IS_ERR(path))
+ break;
+ ex = path[path->p_depth].p_ext;
+ if (!ex)
+ goto cleanup;
+
+ cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex));
+ ret = skip_hole(inode, &cur);
+ if (ret < 0)
+ break;
+
+ path2 = ext4_find_extent(inode, cur, path2, 0);
+ if (IS_ERR(path2))
+ break;
+
+ for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
+ cmp1 = cmp2 = 0;
+ if (i <= path->p_depth)
+ cmp1 = path[i].p_bh ?
+ path[i].p_bh->b_blocknr : 0;
+ if (i <= path2->p_depth)
+ cmp2 = path2[i].p_bh ?
+ path2[i].p_bh->b_blocknr : 0;
+ if (cmp1 != cmp2 && cmp2 != 0)
+ numblks++;
+ }
+ }
+
+out:
+ inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
+ ext4_mark_inode_dirty(NULL, inode);
+cleanup:
+ ext4_free_ext_path(path);
+ ext4_free_ext_path(path2);
+ return 0;
+}
+
+int ext4_ext_clear_bb(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int j, ret = 0;
+ struct ext4_map_blocks map;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ return 0;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex)
+ goto out;
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, path, 0);
+ if (!IS_ERR(path)) {
+ for (j = 0; j < path->p_depth; j++) {
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, false);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ 0, path[j].p_block, 1, 1);
+ }
+ } else {
+ path = NULL;
+ }
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ map.m_lblk, map.m_pblk, map.m_len, 1);
+ }
+ cur = cur + map.m_len;
+ }
+
+out:
+ ext4_free_ext_path(path);
+ return 0;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e75171535375..e04fbf10fe4f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trusts that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adheres to the following rules when createing,
+ * modifying and removing extents.
+ *
+ * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ * to be modified are large enough to exceed the range that a single
+ * i_data_sem can process (as ext4_datasem_ensure_credits() may drop
+ * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
+ * does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ * that do not hold those locks but only holds a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should at
+ * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
+ * specified querying range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
@@ -144,20 +175,21 @@
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved);
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc);
int __init ext4_init_es(void)
{
- ext4_es_cachep = kmem_cache_create("ext4_extent_status",
- sizeof(struct extent_status),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
if (ext4_es_cachep == NULL)
return -ENOMEM;
return 0;
@@ -203,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
return es->es_lblk + es->es_len - 1;
}
+static inline void ext4_es_inc_seq(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
+}
+
/*
* search through the tree for an delayed extent with a given offset. If
* it can't be found, try to find next extent.
@@ -269,14 +308,12 @@ static void __es_find_extent_range(struct inode *inode,
/* see if the extent has been cached */
es->es_lblk = es->es_len = es->es_pblk = 0;
- if (tree->cache_es) {
- es1 = tree->cache_es;
- if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u) %llu %x\n",
- lblk, es1->es_lblk, es1->es_len,
- ext4_es_pblock(es1), ext4_es_status(es1));
- goto out;
- }
+ es1 = READ_ONCE(tree->cache_es);
+ if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %x\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
+ goto out;
}
es1 = __es_tree_search(&tree->root, lblk);
@@ -295,7 +332,7 @@ out:
}
if (es1 && matching_fn(es1)) {
- tree->cache_es = es1;
+ WRITE_ONCE(tree->cache_es, es1);
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
@@ -311,6 +348,11 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
trace_ext4_es_find_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
@@ -361,6 +403,9 @@ bool ext4_es_scan_range(struct inode *inode,
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_range(inode, matching_fn, lblk, end);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -404,6 +449,9 @@ bool ext4_es_scan_clu(struct inode *inode,
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_clu(inode, matching_fn, lblk);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -441,22 +489,49 @@ static void ext4_es_list_del(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
}
-static struct extent_status *
-ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
- ext4_fsblk_t pblk)
+static inline struct pending_reservation *__alloc_pending(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static inline void __free_pending(struct pending_reservation *pr)
+{
+ kmem_cache_free(ext4_pending_cachep, pr);
+}
+
+/*
+ * Returns true if we cannot fail to allocate memory for this extent_status
+ * entry and cannot reclaim it until its status changes.
+ */
+static inline bool ext4_es_must_keep(struct extent_status *es)
+{
+ /* fiemap, bigalloc, and seek_data/hole need to use it. */
+ if (ext4_es_is_delayed(es))
+ return true;
+
+ return false;
+}
+
+static inline struct extent_status *__es_alloc_extent(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
+ ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
- struct extent_status *es;
- es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
- if (es == NULL)
- return NULL;
es->es_lblk = lblk;
es->es_len = len;
es->es_pblk = pblk;
- /*
- * We don't count delayed extent because we never try to reclaim them
- */
- if (!ext4_es_is_delayed(es)) {
+ /* We never try to reclaim a must kept extent, so we don't count it. */
+ if (!ext4_es_must_keep(es)) {
if (!EXT4_I(inode)->i_es_shk_nr++)
ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
@@ -465,8 +540,11 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
EXT4_I(inode)->i_es_all_nr++;
percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+}
- return es;
+static inline void __es_free_extent(struct extent_status *es)
+{
+ kmem_cache_free(ext4_es_cachep, es);
}
static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
@@ -474,8 +552,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
EXT4_I(inode)->i_es_all_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
- /* Decrease the shrink counter when this es is not delayed */
- if (!ext4_es_is_delayed(es)) {
+ /* Decrease the shrink counter when we can reclaim the extent. */
+ if (!ext4_es_must_keep(es)) {
BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
if (!--EXT4_I(inode)->i_es_shk_nr)
ext4_es_list_del(inode);
@@ -483,7 +561,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
s_es_stats.es_stats_shk_cnt);
}
- kmem_cache_free(ext4_es_cachep, es);
+ __es_free_extent(es);
}
/*
@@ -518,8 +596,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_is_hole(es1))
return 1;
- /* we need to check delayed extent is without unwritten status */
- if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+ /* we need to check delayed extent */
+ if (ext4_es_is_delayed(es1))
return 1;
return 0;
@@ -658,8 +736,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
static void ext4_es_insert_extent_ind_check(struct inode *inode,
@@ -745,7 +822,8 @@ static inline void ext4_es_insert_extent_check(struct inode *inode,
}
#endif
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
@@ -785,10 +863,15 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
}
}
- es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
- newes->es_pblk);
+ if (prealloc)
+ es = prealloc;
+ else
+ es = __es_alloc_extent(false);
if (!es)
return -ENOMEM;
+ ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
+
rb_link_node(&es->rb_node, parent, p);
rb_insert_color(&es->rb_node, &tree->root);
@@ -800,64 +883,118 @@ out:
/*
* ext4_es_insert_extent() adds information to an inode's extent
* status tree.
- *
- * Return 0 on success, error code on failure.
*/
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status)
+void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status, bool delalloc_reserve_used)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
- int err = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
+ int resv_used = 0, pending = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
+ bool revise_pending = false;
+
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
- es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
- lblk, len, pblk, status, inode->i_ino);
+ es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);
if (!len)
- return 0;
+ return;
BUG_ON(end < lblk);
-
- if ((status & EXTENT_STATUS_DELAYED) &&
- (status & EXTENT_STATUS_WRITTEN)) {
- ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
- " delayed and written which can potentially "
- " cause data loss.", lblk, len);
- WARN_ON(1);
- }
+ WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
- trace_ext4_es_insert_extent(inode, &newes);
ext4_es_insert_extent_check(inode, &newes);
+ revise_pending = sbi->s_cluster_ratio > 1 &&
+ test_opt(inode->i_sb, DELALLOC) &&
+ (status & (EXTENT_STATUS_WRITTEN |
+ EXTENT_STATUS_UNWRITTEN));
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, NULL);
- if (err != 0)
+
+ err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
+ if (err1 != 0)
goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
- err = 0;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
- if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
- (status & EXTENT_STATUS_WRITTEN ||
- status & EXTENT_STATUS_UNWRITTEN))
- __revise_pending(inode, lblk, len);
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
+ err2 = 0;
+ if (err2 != 0)
+ goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
+ if (revise_pending) {
+ err3 = __revise_pending(inode, lblk, len, &pr);
+ if (err3 < 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ pending = err3;
+ }
+ /*
+ * TODO: For cache on-disk extents, there is no need to increment
+ * the sequence counter, this requires future optimization.
+ */
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ /*
+ * Reduce the reserved cluster count to reflect successful deferred
+ * allocation of delayed allocated clusters or direct allocation of
+ * clusters discovered to be delayed allocated. Once allocated, a
+ * cluster is not included in the reserved count.
+ *
+ * When direct allocating (from fallocate, filemap, DIO, or clusters
+ * allocated when delalloc has been disabled by ext4_nonda_switch())
+ * an extent either 1) contains delayed blocks but start with
+ * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed
+ * allocated blocks which belong to delayed allocated clusters when
+ * bigalloc feature is enabled, quota has already been claimed by
+ * ext4_mb_new_blocks(), so release the quota reservations made for
+ * any previously delayed allocated clusters instead of claim them
+ * again.
+ */
+ resv_used += pending;
+ if (resv_used)
+ ext4_da_update_reserve_space(inode, resv_used,
+ delalloc_reserve_used);
- ext4_es_print_tree(inode);
+ if (err1 || err2 || err3 < 0)
+ goto retry;
- return err;
+ trace_ext4_es_insert_extent(inode, &newes);
+ ext4_es_print_tree(inode);
+ return;
}
/*
@@ -873,6 +1010,9 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
@@ -887,7 +1027,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
if (!es || es->es_lblk > end)
- __es_insert_extent(inode, &newes);
+ __es_insert_extent(inode, &newes, NULL);
write_unlock(&EXT4_I(inode)->i_es_lock);
}
@@ -899,8 +1039,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
* Return: 1 on found, 0 on not
*/
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t *next_lblk,
- struct extent_status *es)
+ ext4_lblk_t *next_lblk, struct extent_status *es,
+ u64 *pseq)
{
struct ext4_es_tree *tree;
struct ext4_es_stats *stats;
@@ -908,6 +1048,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct rb_node *node;
int found = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
trace_ext4_es_lookup_extent_enter(inode, lblk);
es_debug("lookup extent in block %u\n", lblk);
@@ -916,14 +1059,12 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
/* find extent in cache firstly */
es->es_lblk = es->es_len = es->es_pblk = 0;
- if (tree->cache_es) {
- es1 = tree->cache_es;
- if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u)\n",
- lblk, es1->es_lblk, es1->es_len);
- found = 1;
- goto out;
- }
+ es1 = READ_ONCE(tree->cache_es);
+ if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
}
node = tree->root.rb_node;
@@ -958,6 +1099,8 @@ out:
} else
*next_lblk = 0;
}
+ if (pseq)
+ *pseq = EXT4_I(inode)->i_es_seq;
} else {
percpu_counter_inc(&stats->es_stats_cache_misses);
}
@@ -969,7 +1112,7 @@ out:
}
struct rsvd_count {
- int ndelonly;
+ int ndelayed;
bool first_do_lblk_found;
ext4_lblk_t first_do_lblk;
ext4_lblk_t last_do_lblk;
@@ -995,10 +1138,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node;
- rc->ndelonly = 0;
+ rc->ndelayed = 0;
/*
- * for bigalloc, note the first delonly block in the range has not
+ * for bigalloc, note the first delayed block in the range has not
* been found, record the extent containing the block to the left of
* the region to be removed, if any, and note that there's no partial
* cluster to track
@@ -1018,9 +1161,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
}
/*
- * count_rsvd - count the clusters containing delayed and not unwritten
- * (delonly) blocks in a range within an extent and add to
- * the running tally in rsvd_count
+ * count_rsvd - count the clusters containing delayed blocks in a range
+ * within an extent and add to the running tally in rsvd_count
*
* @inode - file containing extent
* @lblk - first block in range
@@ -1037,13 +1179,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t i, end, nclu;
- if (!ext4_es_is_delonly(es))
+ if (!ext4_es_is_delayed(es))
return;
WARN_ON(len <= 0);
if (sbi->s_cluster_ratio == 1) {
- rc->ndelonly += (int) len;
+ rc->ndelayed += (int) len;
return;
}
@@ -1053,7 +1195,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
end = lblk + (ext4_lblk_t) len - 1;
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
- /* record the first block of the first delonly extent seen */
+ /* record the first block of the first delayed extent seen */
if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
@@ -1067,7 +1209,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
* doesn't start with it, count it and stop tracking
*/
if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
}
@@ -1077,7 +1219,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/
if (EXT4_LBLK_COFF(sbi, i) != 0) {
if (end >= EXT4_LBLK_CFILL(sbi, i)) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1;
}
@@ -1085,11 +1227,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
/*
* if the current cluster starts on a cluster boundary, count the
- * number of whole delonly clusters in the extent
+ * number of whole delayed clusters in the extent
*/
if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits;
- rc->ndelonly += nclu;
+ rc->ndelayed += nclu;
i += nclu << sbi->s_cluster_bits;
}
@@ -1149,10 +1291,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root,
* @rc - pointer to reserved count data
*
* The number of reservations to be released is equal to the number of
- * clusters containing delayed and not unwritten (delonly) blocks within
- * the range, minus the number of clusters still containing delonly blocks
- * at the ends of the range, and minus the number of pending reservations
- * within the range.
+ * clusters containing delayed blocks within the range, minus the number of
+ * clusters still containing delayed blocks at the ends of the range, and
+ * minus the number of pending reservations within the range.
*/
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct extent_status *right_es,
@@ -1163,33 +1304,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu;
- bool left_delonly, right_delonly, count_pending;
+ bool left_delayed, right_delayed, count_pending;
struct extent_status *es;
if (sbi->s_cluster_ratio > 1) {
/* count any remaining partial cluster */
if (rc->partial)
- rc->ndelonly++;
+ rc->ndelayed++;
- if (rc->ndelonly == 0)
+ if (rc->ndelayed == 0)
return 0;
first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
/*
- * decrease the delonly count by the number of clusters at the
- * ends of the range that still contain delonly blocks -
+ * decrease the delayed count by the number of clusters at the
+ * ends of the range that still contain delayed blocks -
* these clusters still need to be reserved
*/
- left_delonly = right_delonly = false;
+ left_delayed = right_delayed = false;
es = rc->left_es;
while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- left_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ left_delayed = true;
break;
}
node = rb_prev(&es->rb_node);
@@ -1197,7 +1338,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
break;
es = rb_entry(node, struct extent_status, rb_node);
}
- if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (right_es && (!left_delayed || first_lclu != last_lclu)) {
if (end < ext4_es_end(right_es)) {
es = right_es;
} else {
@@ -1207,9 +1348,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- right_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ right_delayed = true;
break;
}
node = rb_next(&es->rb_node);
@@ -1223,21 +1364,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* Determine the block range that should be searched for
* pending reservations, if any. Clusters on the ends of the
- * original removed range containing delonly blocks are
+ * original removed range containing delayed blocks are
* excluded. They've already been accounted for and it's not
* possible to determine if an associated pending reservation
* should be released with the information available in the
* extents status tree.
*/
if (first_lclu == last_lclu) {
- if (left_delonly | right_delonly)
+ if (left_delayed | right_delayed)
count_pending = false;
else
count_pending = true;
} else {
- if (left_delonly)
+ if (left_delayed)
first_lclu++;
- if (right_delonly)
+ if (right_delayed)
last_lclu--;
if (first_lclu <= last_lclu)
count_pending = true;
@@ -1248,16 +1389,16 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* a pending reservation found between first_lclu and last_lclu
* represents an allocated cluster that contained at least one
- * delonly block, so the delonly total must be reduced by one
+ * delayed block, so the delayed total must be reduced by one
* for each pending reservation found and released
*/
if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu);
while (pr && pr->lclu <= last_lclu) {
- rc->ndelonly--;
+ rc->ndelayed--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
if (!node)
break;
pr = rb_entry(node, struct pending_reservation,
@@ -1265,7 +1406,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
}
}
- return rc->ndelonly;
+ return rc->ndelayed;
}
@@ -1276,6 +1417,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* @lblk - first block in range
* @end - last block in range
* @reserved - number of cluster reservations released
+ * @prealloc - pre-allocated es to avoid memory allocation failures
*
* If @reserved is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
@@ -1283,7 +1425,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved)
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -1291,14 +1434,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err;
+ int err = 0;
bool count_reserved = true;
struct rsvd_count rc;
if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
count_reserved = false;
-retry:
- err = 0;
es = __es_tree_search(&tree->root, lblk);
if (!es)
@@ -1332,14 +1473,13 @@ retry:
orig_es.es_len - len2;
ext4_es_store_pblock_status(&newes, block,
ext4_es_status(&orig_es));
- err = __es_insert_extent(inode, &newes);
+ err = __es_insert_extent(inode, &newes, prealloc);
if (err) {
+ if (!ext4_es_must_keep(&newes))
+ return 0;
+
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
- if ((err == -ENOMEM) &&
- __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
goto out;
}
} else {
@@ -1352,9 +1492,9 @@ retry:
}
}
if (count_reserved)
- count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
- &orig_es, &rc);
- goto out;
+ count_rsvd(inode, orig_es.es_lblk + len1,
+ orig_es.es_len - len1 - len2, &orig_es, &rc);
+ goto out_get_reserved;
}
if (len1 > 0) {
@@ -1396,6 +1536,7 @@ retry:
}
}
+out_get_reserved:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
out:
@@ -1410,36 +1551,55 @@ out:
* @len - number of blocks to remove
*
* Reduces block/cluster reservation count and for bigalloc cancels pending
- * reservations as needed. Returns 0 on success, error code on failure.
+ * reservations as needed.
*/
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
ext4_lblk_t end;
int err = 0;
int reserved = 0;
+ struct extent_status *es = NULL;
+
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
- trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
if (!len)
- return err;
+ return;
end = lblk + len - 1;
BUG_ON(end < lblk);
+retry:
+ if (err && !es)
+ es = __es_alloc_extent(true);
/*
* ext4_clear_inode() depends on us taking i_es_lock unconditionally
* so that we are sure __es_shrink() is done with the inode before it
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, &reserved);
+ err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ if (err)
+ goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es) {
+ if (!es->es_len)
+ __es_free_extent(es);
+ es = NULL;
+ }
+ ext4_es_inc_seq(inode);
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err)
+ goto retry;
+
+ trace_ext4_es_remove_extent(inode, lblk, len);
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return err;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -1536,7 +1696,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
unsigned long nr;
struct ext4_sb_info *sbi;
- sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ sbi = shrink->private_data;
nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
@@ -1545,19 +1705,16 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
static unsigned long ext4_es_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
+ struct ext4_sb_info *sbi = shrink->private_data;
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
- if (!nr_to_scan)
- return ret;
-
nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
+ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
return nr_shrunk;
}
@@ -1632,12 +1789,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err3;
- sbi->s_es_shrinker.scan_objects = ext4_es_scan;
- sbi->s_es_shrinker.count_objects = ext4_es_count;
- sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker);
- if (err)
+ sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
+ if (!sbi->s_es_shrinker) {
+ err = -ENOMEM;
goto err4;
+ }
+
+ sbi->s_es_shrinker->scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker->count_objects = ext4_es_count;
+ sbi->s_es_shrinker->private_data = sbi;
+
+ shrinker_register(sbi->s_es_shrinker);
return 0;
err4:
@@ -1657,7 +1819,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
- unregister_shrinker(&sbi->s_es_shrinker);
+ shrinker_free(sbi->s_es_shrinker);
}
/*
@@ -1688,11 +1850,8 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
(*nr_to_scan)--;
node = rb_next(&es->rb_node);
- /*
- * We can't reclaim delayed extent from status tree because
- * fiemap, bigallic, and seek_data/hole need to use it.
- */
- if (ext4_es_is_delayed(es))
+
+ if (ext4_es_must_keep(es))
goto next;
if (ext4_es_is_referenced(es)) {
ext4_es_clear_referenced(es);
@@ -1756,7 +1915,7 @@ void ext4_clear_inode_es(struct inode *inode)
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
node = rb_next(node);
- if (!ext4_es_is_delayed(es)) {
+ if (!ext4_es_must_keep(es)) {
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
}
@@ -1788,9 +1947,7 @@ static void ext4_print_pending_tree(struct inode *inode)
int __init ext4_init_pending(void)
{
- ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
- sizeof(struct pending_reservation),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
if (ext4_pending_cachep == NULL)
return -ENOMEM;
return 0;
@@ -1843,11 +2000,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
*
* @inode - file containing the cluster
* @lblk - logical block in the cluster to be added
+ * @prealloc - preallocated pending entry
*
- * Returns 0 on successful insertion and -ENOMEM on failure. If the
+ * Returns 1 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
-static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
@@ -1873,15 +2032,21 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
}
}
- pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
- if (pr == NULL) {
- ret = -ENOMEM;
- goto out;
+ if (likely(*prealloc == NULL)) {
+ pr = __alloc_pending(false);
+ if (!pr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ pr = *prealloc;
+ *prealloc = NULL;
}
pr->lclu = lclu;
rb_link_node(&pr->rb_node, parent, p);
rb_insert_color(&pr->rb_node, &tree->root);
+ ret = 1;
out:
return ret;
@@ -1906,7 +2071,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
if (pr != NULL) {
tree = &EXT4_I(inode)->i_pending_tree;
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
}
}
@@ -1952,144 +2117,112 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- * tree, adding a pending reservation where
- * needed
+ * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
+ * status tree, adding a pending reservation
+ * where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
- * @allocated - indicates whether a physical cluster has been allocated for
- * the logical cluster that contains the block
- *
- * Returns 0 on success, negative error code on failure.
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ * @lclu_allocated/end_allocated - indicates whether a physical cluster has
+ * been allocated for the logical cluster
+ * that contains the start/end block. Note that
+ * end_allocated should always be set to false
+ * if the start and the end block are in the
+ * same cluster
*/
-int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status newes;
- int err = 0;
+ ext4_lblk_t end = lblk + len - 1;
+ int err1 = 0, err2 = 0, err3 = 0;
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
+ struct pending_reservation *pr1 = NULL;
+ struct pending_reservation *pr2 = NULL;
- es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
- lblk, inode->i_ino);
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+ if (!len)
+ return;
+
+ WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
+ end_allocated);
newes.es_lblk = lblk;
- newes.es_len = 1;
+ newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
ext4_es_insert_extent_check(inode, &newes);
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
+ if (err1 || err2 || err3 < 0) {
+ if (lclu_allocated && !pr1)
+ pr1 = __alloc_pending(true);
+ if (end_allocated && !pr2)
+ pr2 = __alloc_pending(true);
+ }
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, lblk, NULL);
- if (err != 0)
- goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err != 0)
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ if (err1 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
- if (allocated)
- __insert_pending(inode, lblk);
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 != 0)
+ goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
+ if (lclu_allocated) {
+ err3 = __insert_pending(inode, lblk, &pr1);
+ if (err3 < 0)
+ goto error;
+ if (pr1) {
+ __free_pending(pr1);
+ pr1 = NULL;
+ }
+ }
+ if (end_allocated) {
+ err3 = __insert_pending(inode, end, &pr2);
+ if (err3 < 0)
+ goto error;
+ if (pr2) {
+ __free_pending(pr2);
+ pr2 = NULL;
+ }
+ }
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err1 || err2 || err3 < 0)
+ goto retry;
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_print_tree(inode);
ext4_print_pending_tree(inode);
-
- return err;
-}
-
-/*
- * __es_delayed_clu - count number of clusters containing blocks that
- * are delayed only
- *
- * @inode - file containing block range
- * @start - logical block defining start of range
- * @end - logical block defining end of range
- *
- * Returns the number of clusters containing only delayed (not delayed
- * and unwritten) blocks in the range specified by @start and @end. Any
- * cluster or part of a cluster within the range and containing a delayed
- * and not unwritten block within the range is counted as a whole cluster.
- */
-static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
-{
- struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
- struct extent_status *es;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct rb_node *node;
- ext4_lblk_t first_lclu, last_lclu;
- unsigned long long last_counted_lclu;
- unsigned int n = 0;
-
- /* guaranteed to be unequal to any ext4_lblk_t value */
- last_counted_lclu = ~0ULL;
-
- es = __es_tree_search(&tree->root, start);
-
- while (es && (es->es_lblk <= end)) {
- if (ext4_es_is_delonly(es)) {
- if (es->es_lblk <= start)
- first_lclu = EXT4_B2C(sbi, start);
- else
- first_lclu = EXT4_B2C(sbi, es->es_lblk);
-
- if (ext4_es_end(es) >= end)
- last_lclu = EXT4_B2C(sbi, end);
- else
- last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
-
- if (first_lclu == last_counted_lclu)
- n += last_lclu - first_lclu;
- else
- n += last_lclu - first_lclu + 1;
- last_counted_lclu = last_lclu;
- }
- node = rb_next(&es->rb_node);
- if (!node)
- break;
- es = rb_entry(node, struct extent_status, rb_node);
- }
-
- return n;
-}
-
-/*
- * ext4_es_delayed_clu - count number of clusters containing blocks that
- * are both delayed and unwritten
- *
- * @inode - file containing block range
- * @lblk - logical block defining start of range
- * @len - number of blocks in range
- *
- * Locking for external use of __es_delayed_clu().
- */
-unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- ext4_lblk_t end;
- unsigned int n;
-
- if (len == 0)
- return 0;
-
- end = lblk + len - 1;
- WARN_ON(end < lblk);
-
- read_lock(&ei->i_es_lock);
-
- n = __es_delayed_clu(inode, lblk, end);
-
- read_unlock(&ei->i_es_lock);
-
- return n;
+ return;
}
/*
@@ -2102,21 +2235,27 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
* @inode - file containing the range
* @lblk - logical block defining the start of range
* @len - length of range in blocks
+ * @prealloc - preallocated pending entry
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
- * status. Must be called while holding i_es_lock.
+ * status. Must be called while holding i_es_lock. Returns number of new
+ * inserts pending cluster on insert pendings, returns 0 on remove pendings,
+ * return -ENOMEM on failure.
*/
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int pendings = 0;
+ int ret = 0;
if (len == 0)
- return;
+ return 0;
/*
* Two cases - block range within single cluster and block range
@@ -2134,39 +2273,53 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
- __insert_pending(inode, first);
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode,
- &ext4_es_is_delonly,
+ &ext4_es_is_delayed,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
+ } else
__remove_pending(inode, last);
}
} else {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
- if (f_del)
- __insert_pending(inode, first);
- else
+ if (f_del) {
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
+ } else
__remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
- l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ l_del = __es_scan_range(inode, &ext4_es_is_delayed,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
+ } else
__remove_pending(inode, last);
}
+out:
+ return (ret < 0) ? ret : pendings;
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4ec30a798260..f3396cf32b44 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -42,6 +42,10 @@ enum {
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
+/*
+ * Besides EXTENT_STATUS_REFERENCED, all these extent type masks
+ * are exclusive, only one type can be set at a time.
+ */
#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
@@ -51,7 +55,9 @@ enum {
#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
- EXTENT_STATUS_HOLE) << ES_SHIFT)
+ EXTENT_STATUS_HOLE))
+
+#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1)))
struct ext4_sb_info;
struct ext4_extent;
@@ -127,21 +133,22 @@ extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
-extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status);
+extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status,
+ bool delalloc_reserve_used);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
-extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
extern void ext4_es_find_extent_range(struct inode *inode,
int (*match_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t *next_lblk,
- struct extent_status *es);
+ struct extent_status *es, u64 *pseq);
extern bool ext4_es_scan_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end);
@@ -156,7 +163,7 @@ static inline unsigned int ext4_es_status(struct extent_status *es)
static inline unsigned int ext4_es_type(struct extent_status *es)
{
- return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
+ return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK;
}
static inline int ext4_es_is_written(struct extent_status *es)
@@ -184,11 +191,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es)
return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
}
-static inline int ext4_es_is_delonly(struct extent_status *es)
-{
- return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
-}
-
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -224,17 +226,12 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
es->es_pblk = block;
}
-static inline void ext4_es_store_status(struct extent_status *es,
- unsigned int status)
-{
- es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
- (es->es_pblk & ~ES_MASK);
-}
-
static inline void ext4_es_store_pblock_status(struct extent_status *es,
ext4_fsblk_t pb,
unsigned int status)
{
+ WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK));
+
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
(pb & ~ES_MASK);
}
@@ -249,10 +246,9 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
-extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated);
extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
new file mode 100644
index 000000000000..fa66b08de999
--- /dev/null
+++ b/fs/ext4/fast_commit.c
@@ -0,0 +1,2343 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * fs/ext4/fast_commit.c
+ *
+ * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+ *
+ * Ext4 fast commits routines.
+ */
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "mballoc.h"
+
+#include <linux/lockdep.h>
+/*
+ * Ext4 Fast Commits
+ * -----------------
+ *
+ * Ext4 fast commits implement fine grained journalling for Ext4.
+ *
+ * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
+ * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
+ * TLV during the recovery phase. For the scenarios for which we currently
+ * don't have replay code, fast commit falls back to full commits.
+ * Fast commits record delta in one of the following three categories.
+ *
+ * (A) Directory entry updates:
+ *
+ * - EXT4_FC_TAG_UNLINK - records directory entry unlink
+ * - EXT4_FC_TAG_LINK - records directory entry link
+ * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
+ *
+ * (B) File specific data range updates:
+ *
+ * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
+ * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
+ *
+ * (C) Inode metadata (mtime / ctime etc):
+ *
+ * - EXT4_FC_TAG_INODE - record the inode that should be replayed
+ * during recovery. Note that iblocks field is
+ * not replayed and instead derived during
+ * replay.
+ * Commit Operation
+ * ----------------
+ * With fast commits, we maintain all the directory entry operations in the
+ * order in which they are issued in an in-memory queue. This queue is flushed
+ * to disk during the commit operation. We also maintain a list of inodes
+ * that need to be committed during a fast commit in another in memory queue of
+ * inodes. During the commit operation, we commit in the following order:
+ *
+ * [1] Prepare all the inodes to write out their data by setting
+ * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
+ * deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ * state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ * all the exsiting handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
+ * section for more details).
+ *
+ * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
+ *
+ * Fast Commit Ineligibility
+ * -------------------------
+ *
+ * Not all operations are supported by fast commits today (e.g extended
+ * attributes). Fast commit ineligibility is marked by calling
+ * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
+ * to full commit.
+ *
+ * Atomicity of commits
+ * --------------------
+ * In order to guarantee atomicity during the commit operation, fast commit
+ * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
+ * tag contains CRC of the contents and TID of the transaction after which
+ * this fast commit should be applied. Recovery code replays fast commit
+ * logs only if there's at least 1 valid tail present. For every fast commit
+ * operation, there is 1 tail. This means, we may end up with multiple tails
+ * in the fast commit space. Here's an example:
+ *
+ * - Create a new file A and remove existing file B
+ * - fsync()
+ * - Append contents to file A
+ * - Truncate file A
+ * - fsync()
+ *
+ * The fast commit space at the end of above operations would look like this:
+ * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
+ * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
+ *
+ * Replay code should thus check for all the valid tails in the FC area.
+ *
+ * Fast Commit Replay Idempotence
+ * ------------------------------
+ *
+ * Fast commits tags are idempotent in nature provided the recovery code follows
+ * certain rules. The guiding principle that the commit path follows while
+ * committing is that it stores the result of a particular operation instead of
+ * storing the procedure.
+ *
+ * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
+ * was associated with inode 10. During fast commit, instead of storing this
+ * operation as a procedure "rename a to b", we store the resulting file system
+ * state as a "series" of outcomes:
+ *
+ * - Link dirent b to inode 10
+ * - Unlink dirent a
+ * - Inode <10> with valid refcount
+ *
+ * Now when recovery code runs, it needs "enforce" this state on the file
+ * system. This is what guarantees idempotence of fast commit replay.
+ *
+ * Let's take an example of a procedure that is not idempotent and see how fast
+ * commits make it idempotent. Consider following sequence of operations:
+ *
+ * rm A; mv B A; read A
+ * (x) (y) (z)
+ *
+ * (x), (y) and (z) are the points at which we can crash. If we store this
+ * sequence of operations as is then the replay is not idempotent. Let's say
+ * while in replay, we crash at (z). During the second replay, file A (which was
+ * actually created as a result of "mv B A" operation) would get deleted. Thus,
+ * file named A would be absent when we try to read A. So, this sequence of
+ * operations is not idempotent. However, as mentioned above, instead of storing
+ * the procedure fast commits store the outcome of each procedure. Thus the fast
+ * commit log for above procedure would be as follows:
+ *
+ * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
+ * inode 11 before the replay)
+ *
+ * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
+ * (w) (x) (y) (z)
+ *
+ * If we crash at (z), we will have file A linked to inode 11. During the second
+ * replay, we will remove file A (inode 11). But we will create it back and make
+ * it point to inode 11. We won't find B, so we'll just skip that step. At this
+ * point, the refcount for inode 11 is not reliable, but that gets fixed by the
+ * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
+ * similarly. Thus, by converting a non-idempotent procedure into a series of
+ * idempotent outcomes, fast commits ensured idempotence during the replay.
+ *
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
+ * TODOs
+ * -----
+ *
+ * 0) Fast commit replay path hardening: Fast commit replay code should use
+ * journal handles to make sure all the updates it does during the replay
+ * path are atomic. With that if we crash during fast commit replay, after
+ * trying to do recovery again, we will find a file system where fast commit
+ * area is invalid (because new full commit would be found). In order to deal
+ * with that, fast commit replay code should ensure that the "FC_REPLAY"
+ * superblock state is persisted before starting the replay, so that after
+ * the crash, fast commit recovery code can look at that flag and perform
+ * fast commit recovery even if that area is invalidated by later full
+ * commits.
+ *
+ * 1) Handle more ineligible cases.
+ *
+ * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
+ * status tree. This would get rid of the need to call ext4_fc_track_inode()
+ * before acquiring i_data_sem. To do that we would need to ensure that
+ * modified extents from the extent status tree are not evicted from memory.
+ */
+
+#include <trace/events/ext4.h>
+static struct kmem_cache *ext4_fc_dentry_cachep;
+
+static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+ BUFFER_TRACE(bh, "");
+ if (uptodate) {
+ ext4_debug("%s: Block %lld up-to-date",
+ __func__, bh->b_blocknr);
+ set_buffer_uptodate(bh);
+ } else {
+ ext4_debug("%s: Block %lld not up-to-date",
+ __func__, bh->b_blocknr);
+ clear_buffer_uptodate(bh);
+ }
+
+ unlock_buffer(bh);
+}
+
+static inline void ext4_fc_reset_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ei->i_fc_lblk_start = 0;
+ ei->i_fc_lblk_len = 0;
+}
+
+void ext4_fc_init_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_fc_reset_inode(inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ INIT_LIST_HEAD(&ei->i_fc_list);
+ INIT_LIST_HEAD(&ei->i_fc_dilist);
+ init_waitqueue_head(&ei->i_fc_wait);
+}
+
+static bool ext4_fc_disabled(struct super_block *sb)
+{
+ return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
+}
+
+/*
+ * Remove inode from fast commit list. If the inode is being committed
+ * we wait until inode commit is done.
+ */
+void ext4_fc_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_fc_dentry_update *fc_dentry;
+ wait_queue_head_t *wq;
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ mutex_lock(&sbi->s_fc_lock);
+ if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
+ mutex_unlock(&sbi->s_fc_lock);
+ return;
+ }
+
+ /*
+ * Since ext4_fc_del is called from ext4_evict_inode while having a
+ * handle open, there is no need for us to wait here even if a fast
+ * commit is going on. That is because, if this inode is being
+ * committed, ext4_mark_inode_dirty would have waited for inode commit
+ * operation to finish before we come here. So, by the time we come
+ * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+ * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+ * here.
+ *
+ * We may come here without any handles open in the "no_delete" case of
+ * ext4_evict_inode as well. However, if that happens, we first mark the
+ * file system as fast commit ineligible anyway. So, even in that case,
+ * it is okay to remove the inode from the fc list.
+ */
+ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+ && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+ mutex_unlock(&sbi->s_fc_lock);
+ schedule();
+ mutex_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(wq, &wait.wq_entry);
+ }
+ list_del_init(&ei->i_fc_list);
+
+ /*
+ * Since this inode is getting removed, let's also remove all FC
+ * dentry create references, since it is not needed to log it anyways.
+ */
+ if (list_empty(&ei->i_fc_dilist)) {
+ mutex_unlock(&sbi->s_fc_lock);
+ return;
+ }
+
+ fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+ WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
+ list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
+
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ mutex_unlock(&sbi->s_fc_lock);
+
+ release_dentry_name_snapshot(&fc_dentry->fcd_name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+}
+
+/*
+ * Mark file system as fast commit ineligible, and record latest
+ * ineligible transaction tid. This means until the recorded
+ * transaction, commit operation would result in a full jbd2 commit.
+ */
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ tid_t tid;
+ bool has_transaction = true;
+ bool is_ineligible;
+
+ if (ext4_fc_disabled(sb))
+ return;
+
+ if (handle && !IS_ERR(handle))
+ tid = handle->h_transaction->t_tid;
+ else {
+ read_lock(&sbi->s_journal->j_state_lock);
+ if (sbi->s_journal->j_running_transaction)
+ tid = sbi->s_journal->j_running_transaction->t_tid;
+ else
+ has_transaction = false;
+ read_unlock(&sbi->s_journal->j_state_lock);
+ }
+ mutex_lock(&sbi->s_fc_lock);
+ is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
+ sbi->s_fc_ineligible_tid = tid;
+ ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ mutex_unlock(&sbi->s_fc_lock);
+ WARN_ON(reason >= EXT4_FC_REASON_MAX);
+ sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+}
+
+/*
+ * Generic fast commit tracking function. If this is the first time this we are
+ * called after a full commit, we initialize fast commit fields and then call
+ * __fc_track_fn() with update = 0. If we have already been called after a full
+ * commit, we pass update = 1. Based on that, the track function can determine
+ * if it needs to track a field for the first time or if it needs to just
+ * update the previously tracked value.
+ *
+ * If enqueue is set, this function enqueues the inode in fast commit list.
+ */
+static int ext4_fc_track_template(
+ handle_t *handle, struct inode *inode,
+ int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
+ void *args, int enqueue)
+{
+ bool update = false;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ tid_t tid = 0;
+ int ret;
+
+ tid = handle->h_transaction->t_tid;
+ spin_lock(&ei->i_fc_lock);
+ if (tid == ei->i_sync_tid) {
+ update = true;
+ } else {
+ ext4_fc_reset_inode(inode);
+ ei->i_sync_tid = tid;
+ }
+ ret = __fc_track_fn(handle, inode, args, update);
+ spin_unlock(&ei->i_fc_lock);
+ if (!enqueue)
+ return ret;
+
+ mutex_lock(&sbi->s_fc_lock);
+ if (list_empty(&EXT4_I(inode)->i_fc_list))
+ list_add_tail(&EXT4_I(inode)->i_fc_list,
+ (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
+ &sbi->s_fc_q[FC_Q_STAGING] :
+ &sbi->s_fc_q[FC_Q_MAIN]);
+ mutex_unlock(&sbi->s_fc_lock);
+
+ return ret;
+}
+
+struct __track_dentry_update_args {
+ struct dentry *dentry;
+ int op;
+};
+
+/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
+static int __track_dentry_update(handle_t *handle, struct inode *inode,
+ void *arg, bool update)
+{
+ struct ext4_fc_dentry_update *node;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct __track_dentry_update_args *dentry_update =
+ (struct __track_dentry_update_args *)arg;
+ struct dentry *dentry = dentry_update->dentry;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ spin_unlock(&ei->i_fc_lock);
+
+ if (IS_ENCRYPTED(dir)) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ handle);
+ spin_lock(&ei->i_fc_lock);
+ return -EOPNOTSUPP;
+ }
+
+ node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
+ if (!node) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
+ spin_lock(&ei->i_fc_lock);
+ return -ENOMEM;
+ }
+
+ node->fcd_op = dentry_update->op;
+ node->fcd_parent = dir->i_ino;
+ node->fcd_ino = inode->i_ino;
+ take_dentry_name_snapshot(&node->fcd_name, dentry);
+ INIT_LIST_HEAD(&node->fcd_dilist);
+ INIT_LIST_HEAD(&node->fcd_list);
+ mutex_lock(&sbi->s_fc_lock);
+ if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
+ list_add_tail(&node->fcd_list,
+ &sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ else
+ list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+
+ /*
+ * This helps us keep a track of all fc_dentry updates which is part of
+ * this ext4 inode. So in case the inode is getting unlinked, before
+ * even we get a chance to fsync, we could remove all fc_dentry
+ * references while evicting the inode in ext4_fc_del().
+ * Also with this, we don't need to loop over all the inodes in
+ * sbi->s_fc_q to get the corresponding inode in
+ * ext4_fc_commit_dentry_updates().
+ */
+ if (dentry_update->op == EXT4_FC_TAG_CREAT) {
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+ spin_lock(&ei->i_fc_lock);
+
+ return 0;
+}
+
+void __ext4_fc_track_unlink(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_UNLINK;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
+}
+
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_unlink(handle, inode, dentry);
+}
+
+void __ext4_fc_track_link(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_LINK;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_link(handle, inode, dentry, ret);
+}
+
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_link(handle, inode, dentry);
+}
+
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+ struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_CREAT;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_create(handle, inode, dentry, ret);
+}
+
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_create(handle, inode, dentry);
+}
+
+/* __track_fn for inode tracking */
+static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
+ bool update)
+{
+ if (update)
+ return -EEXIST;
+
+ EXT4_I(inode)->i_fc_lblk_len = 0;
+
+ return 0;
+}
+
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ wait_queue_head_t *wq;
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_should_journal_data(inode)) {
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
+ return;
+ }
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ /*
+ * If we come here, we may sleep while waiting for the inode to
+ * commit. We shouldn't be holding i_data_sem when we go to sleep since
+ * the commit path needs to grab the lock while committing the inode.
+ */
+ lockdep_assert_not_held(&ei->i_data_sem);
+
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ }
+
+ /*
+ * From this point on, this inode will not be committed either
+ * by fast or full commit as long as the handle is open.
+ */
+ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
+ trace_ext4_fc_track_inode(handle, inode, ret);
+}
+
+struct __track_range_args {
+ ext4_lblk_t start, end;
+};
+
+/* __track_fn for tracking data updates */
+static int __track_range(handle_t *handle, struct inode *inode, void *arg,
+ bool update)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_lblk_t oldstart;
+ struct __track_range_args *__arg =
+ (struct __track_range_args *)arg;
+
+ if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
+ ext4_debug("Special inode %ld being modified\n", inode->i_ino);
+ return -ECANCELED;
+ }
+
+ oldstart = ei->i_fc_lblk_start;
+
+ if (update && ei->i_fc_lblk_len > 0) {
+ ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
+ ei->i_fc_lblk_len =
+ max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
+ ei->i_fc_lblk_start + 1;
+ } else {
+ ei->i_fc_lblk_start = __arg->start;
+ ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
+ }
+
+ return 0;
+}
+
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct __track_range_args args;
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ if (ext4_has_inline_data(inode)) {
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
+ handle);
+ return;
+ }
+
+ args.start = start;
+ args.end = end;
+
+ ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
+
+ trace_ext4_fc_track_range(handle, inode, start, end, ret);
+}
+
+static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
+{
+ blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS;
+ struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
+
+ /* Add REQ_FUA | REQ_PREFLUSH only its tail */
+ if (test_opt(sb, BARRIER) && is_tail)
+ write_flags |= REQ_FUA | REQ_PREFLUSH;
+ lock_buffer(bh);
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = ext4_end_buffer_io_sync;
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
+ EXT4_SB(sb)->s_fc_bh = NULL;
+}
+
+/* Ext4 commit path routines */
+
+/*
+ * Allocate len bytes on a fast commit buffer.
+ *
+ * During the commit time this function is used to manage fast commit
+ * block space. We don't split a fast commit log onto different
+ * blocks. So this function makes sure that if there's not enough space
+ * on the current block, the remaining space in the current block is
+ * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
+ * new block is from jbd2 and CRC is updated to reflect the padding
+ * we added.
+ */
+static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
+{
+ struct ext4_fc_tl tl;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh;
+ int bsize = sbi->s_journal->j_blocksize;
+ int ret, off = sbi->s_fc_bytes % bsize;
+ int remaining;
+ u8 *dst;
+
+ /*
+ * If 'len' is too long to fit in any block alongside a PAD tlv, then we
+ * cannot fulfill the request.
+ */
+ if (len > bsize - EXT4_FC_TAG_BASE_LEN)
+ return NULL;
+
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ }
+ dst = sbi->s_fc_bh->b_data + off;
+
+ /*
+ * Allocate the bytes in the current block if we can do so while still
+ * leaving enough space for a PAD tlv.
+ */
+ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
+ if (len <= remaining) {
+ sbi->s_fc_bytes += len;
+ return dst;
+ }
+
+ /*
+ * Else, terminate the current block with a PAD tlv, then allocate a new
+ * block and allocate the bytes at the start of that new block.
+ */
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ tl.fc_len = cpu_to_le16(remaining);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
+ *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
+
+ ext4_fc_submit_bh(sb, false);
+
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ sbi->s_fc_bytes += bsize - off + len;
+ return sbi->s_fc_bh->b_data;
+}
+
+/*
+ * Complete a fast commit by writing tail tag.
+ *
+ * Writing tail tag marks the end of a fast commit. In order to guarantee
+ * atomicity, after writing tail tag, even if there's space remaining
+ * in the block, next commit shouldn't use it. That's why tail tag
+ * has the length as that of the remaining space on the block.
+ */
+static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl tl;
+ struct ext4_fc_tail tail;
+ int off, bsize = sbi->s_journal->j_blocksize;
+ u8 *dst;
+
+ /*
+ * ext4_fc_reserve_space takes care of allocating an extra block if
+ * there's no enough space on this block for accommodating this tail.
+ */
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
+ if (!dst)
+ return -ENOSPC;
+
+ off = sbi->s_fc_bytes % bsize;
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
+ tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
+ sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
+
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
+ memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
+ dst += sizeof(tail.fc_tid);
+ crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
+ dst - (u8 *)sbi->s_fc_bh->b_data);
+ tail.fc_crc = cpu_to_le32(crc);
+ memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
+ dst += sizeof(tail.fc_crc);
+ memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
+
+ ext4_fc_submit_bh(sb, true);
+
+ return 0;
+}
+
+/*
+ * Adds tag, length, value and updates CRC. Returns true if tlv was added.
+ * Returns false if there's not enough space.
+ */
+static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
+ u32 *crc)
+{
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
+ if (!dst)
+ return false;
+
+ tl.fc_tag = cpu_to_le16(tag);
+ tl.fc_len = cpu_to_le16(len);
+
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
+
+ return true;
+}
+
+/* Same as above, but adds dentry tlv. */
+static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
+ struct ext4_fc_dentry_update *fc_dentry)
+{
+ struct ext4_fc_dentry_info fcd;
+ struct ext4_fc_tl tl;
+ int dlen = fc_dentry->fcd_name.name.len;
+ u8 *dst = ext4_fc_reserve_space(sb,
+ EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
+
+ if (!dst)
+ return false;
+
+ fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
+ fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
+ tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
+ tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ memcpy(dst, &fcd, sizeof(fcd));
+ dst += sizeof(fcd);
+ memcpy(dst, fc_dentry->fcd_name.name.name, dlen);
+
+ return true;
+}
+
+/*
+ * Writes inode in the fast commit space under TLV with tag @tag.
+ * Returns 0 on success, error on failure.
+ */
+static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+ int ret;
+ struct ext4_iloc iloc;
+ struct ext4_fc_inode fc_inode;
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ inode_len = EXT4_INODE_SIZE(inode->i_sb);
+ else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+ inode_len += ei->i_extra_isize;
+
+ fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
+ tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+
+ ret = -ECANCELED;
+ dst = ext4_fc_reserve_space(inode->i_sb,
+ EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
+ if (!dst)
+ goto err;
+
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ memcpy(dst, &fc_inode, sizeof(fc_inode));
+ dst += sizeof(fc_inode);
+ memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
+ ret = 0;
+err:
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * Writes updated data ranges for the inode in question. Updates CRC.
+ * Returns 0 on success, error otherwise.
+ */
+static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
+{
+ ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_map_blocks map;
+ struct ext4_fc_add_range fc_ext;
+ struct ext4_fc_del_range lrange;
+ struct ext4_extent *ex;
+ int ret;
+
+ spin_lock(&ei->i_fc_lock);
+ if (ei->i_fc_lblk_len == 0) {
+ spin_unlock(&ei->i_fc_lock);
+ return 0;
+ }
+ old_blk_size = ei->i_fc_lblk_start;
+ new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+ ei->i_fc_lblk_len = 0;
+ spin_unlock(&ei->i_fc_lock);
+
+ cur_lblk_off = old_blk_size;
+ ext4_debug("will try writing %d to %d for inode %ld\n",
+ cur_lblk_off, new_blk_size, inode->i_ino);
+
+ while (cur_lblk_off <= new_blk_size) {
+ map.m_lblk = cur_lblk_off;
+ map.m_len = new_blk_size - cur_lblk_off + 1;
+ ret = ext4_map_blocks(NULL, inode, &map,
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE);
+ if (ret < 0)
+ return -ECANCELED;
+
+ if (map.m_len == 0) {
+ cur_lblk_off++;
+ continue;
+ }
+
+ if (ret == 0) {
+ lrange.fc_ino = cpu_to_le32(inode->i_ino);
+ lrange.fc_lblk = cpu_to_le32(map.m_lblk);
+ lrange.fc_len = cpu_to_le32(map.m_len);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+ sizeof(lrange), (u8 *)&lrange, crc))
+ return -ENOSPC;
+ } else {
+ unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
+ EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
+
+ /* Limit the number of blocks in one extent */
+ map.m_len = min(max, map.m_len);
+
+ fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+ ex = (struct ext4_extent *)&fc_ext.fc_ex;
+ ex->ee_block = cpu_to_le32(map.m_lblk);
+ ex->ee_len = cpu_to_le16(map.m_len);
+ ext4_ext_store_pblock(ex, map.m_pblk);
+ if (map.m_flags & EXT4_MAP_UNWRITTEN)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+ sizeof(fc_ext), (u8 *)&fc_ext, crc))
+ return -ENOSPC;
+ }
+
+ cur_lblk_off += map.m_len;
+ }
+
+ return 0;
+}
+
+
+/* Flushes data of all the inodes in the commit queue. */
+static int ext4_fc_flush_data(journal_t *journal)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ int ret = 0;
+
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ret = jbd2_submit_inode_data(journal, ei->jinode);
+ if (ret)
+ return ret;
+ }
+
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ret = jbd2_wait_inode_data(journal, ei->jinode);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Commit all the directory entry updates */
+static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
+ struct inode *inode;
+ struct ext4_inode_info *ei;
+ int ret;
+
+ if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
+ return 0;
+ list_for_each_entry_safe(fc_dentry, fc_dentry_n,
+ &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+ if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
+ continue;
+ }
+ /*
+ * With fcd_dilist we need not loop in sbi->s_fc_q to get the
+ * corresponding inode. Also, the corresponding inode could have been
+ * deleted, in which case, we don't need to do anything.
+ */
+ if (list_empty(&fc_dentry->fcd_dilist))
+ continue;
+ ei = list_first_entry(&fc_dentry->fcd_dilist,
+ struct ext4_inode_info, i_fc_dilist);
+ inode = &ei->vfs_inode;
+ WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
+
+ /*
+ * We first write the inode and then the create dirent. This
+ * allows the recovery code to create an unnamed inode first
+ * and then link it to a directory entry. This allows us
+ * to use namei.c routines almost as is and simplifies
+ * the recovery code.
+ */
+ ret = ext4_fc_write_inode(inode, crc);
+ if (ret)
+ return ret;
+ ret = ext4_fc_write_inode_data(inode, crc);
+ if (ret)
+ return ret;
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static int ext4_fc_perform_commit(journal_t *journal)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter;
+ struct ext4_fc_head head;
+ struct inode *inode;
+ struct blk_plug plug;
+ int ret = 0;
+ u32 crc = 0;
+
+ /*
+ * Step 1: Mark all inodes on s_fc_q[MAIN] with
+ * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
+ * freed until the data flush is over.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /* Step 2: Flush data for all the eligible inodes. */
+ ret = ext4_fc_flush_data(journal);
+
+ /*
+ * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
+ * any error from step 2. This ensures that waiters waiting on
+ * EXT4_STATE_FC_FLUSHING_DATA can resume.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ }
+
+ /*
+ * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
+ * the waiter checks the bit. Pairs with implicit barrier in
+ * prepare_to_wait() in ext4_fc_del().
+ */
+ smp_mb();
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /*
+ * If we encountered error in Step 2, return it now after clearing
+ * EXT4_STATE_FC_FLUSHING_DATA bit.
+ */
+ if (ret)
+ return ret;
+
+
+ /* Step 4: Mark all inodes as being committed. */
+ jbd2_journal_lock_updates(journal);
+ /*
+ * The journal is now locked. No more handles can start and all the
+ * previous handles are now drained. We now mark the inodes on the
+ * commit queue as being committed.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+ jbd2_journal_unlock_updates(journal);
+
+ /*
+ * Step 5: If file system device is different from journal device,
+ * issue a cache flush before we start writing fast commit blocks.
+ */
+ if (journal->j_fs_dev != journal->j_dev)
+ blkdev_issue_flush(journal->j_fs_dev);
+
+ blk_start_plug(&plug);
+ /* Step 6: Write fast commit blocks to disk. */
+ if (sbi->s_fc_bytes == 0) {
+ /*
+ * Step 6.1: Add a head tag only if this is the first fast
+ * commit in this TID.
+ */
+ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
+ head.fc_tid = cpu_to_le32(
+ sbi->s_journal->j_running_transaction->t_tid);
+ if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
+ (u8 *)&head, &crc)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ /* Step 6.2: Now write all the dentry updates. */
+ mutex_lock(&sbi->s_fc_lock);
+ ret = ext4_fc_commit_dentry_updates(journal, &crc);
+ if (ret)
+ goto out;
+
+ /* Step 6.3: Now write all the changed inodes to disk. */
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ inode = &iter->vfs_inode;
+ if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ continue;
+
+ ret = ext4_fc_write_inode_data(inode, &crc);
+ if (ret)
+ goto out;
+ ret = ext4_fc_write_inode(inode, &crc);
+ if (ret)
+ goto out;
+ }
+ /* Step 6.4: Finally write tail tag to conclude this fast commit. */
+ ret = ext4_fc_write_tail(sb, crc);
+
+out:
+ mutex_unlock(&sbi->s_fc_lock);
+ blk_finish_plug(&plug);
+ return ret;
+}
+
+static void ext4_fc_update_stats(struct super_block *sb, int status,
+ u64 commit_time, int nblks, tid_t commit_tid)
+{
+ struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
+
+ ext4_debug("Fast commit ended with status = %d for tid %u",
+ status, commit_tid);
+ if (status == EXT4_FC_STATUS_OK) {
+ stats->fc_num_commits++;
+ stats->fc_numblks += nblks;
+ if (likely(stats->s_fc_avg_commit_time))
+ stats->s_fc_avg_commit_time =
+ (commit_time +
+ stats->s_fc_avg_commit_time * 3) / 4;
+ else
+ stats->s_fc_avg_commit_time = commit_time;
+ } else if (status == EXT4_FC_STATUS_FAILED ||
+ status == EXT4_FC_STATUS_INELIGIBLE) {
+ if (status == EXT4_FC_STATUS_FAILED)
+ stats->fc_failed_commits++;
+ stats->fc_ineligible_commits++;
+ } else {
+ stats->fc_skipped_commits++;
+ }
+ trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
+}
+
+/*
+ * The main commit entry point. Performs a fast commit for transaction
+ * commit_tid if needed. If it's not possible to perform a fast commit
+ * due to various reasons, we fall back to full commit. Returns 0
+ * on success, error otherwise.
+ */
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int nblks = 0, ret, bsize = journal->j_blocksize;
+ int subtid = atomic_read(&sbi->s_fc_subtid);
+ int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
+ ktime_t start_time, commit_time;
+ int old_ioprio, journal_ioprio;
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+ return jbd2_complete_transaction(journal, commit_tid);
+
+ trace_ext4_fc_commit_start(sb, commit_tid);
+
+ start_time = ktime_get();
+ old_ioprio = get_current_ioprio();
+
+restart_fc:
+ ret = jbd2_fc_begin_commit(journal, commit_tid);
+ if (ret == -EALREADY) {
+ /* There was an ongoing commit, check if we need to restart */
+ if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
+ tid_gt(commit_tid, journal->j_commit_sequence))
+ goto restart_fc;
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
+ commit_tid);
+ return 0;
+ } else if (ret) {
+ /*
+ * Commit couldn't start. Just update stats and perform a
+ * full commit.
+ */
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
+ commit_tid);
+ return jbd2_complete_transaction(journal, commit_tid);
+ }
+
+ /*
+ * After establishing journal barrier via jbd2_fc_begin_commit(), check
+ * if we are fast commit ineligible.
+ */
+ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
+ status = EXT4_FC_STATUS_INELIGIBLE;
+ goto fallback;
+ }
+
+ /*
+ * Now that we know that this thread is going to do a fast commit,
+ * elevate the priority to match that of the journal thread.
+ */
+ if (journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ else
+ journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
+ set_task_ioprio(current, journal_ioprio);
+ fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
+ ret = ext4_fc_perform_commit(journal);
+ if (ret < 0) {
+ status = EXT4_FC_STATUS_FAILED;
+ goto fallback;
+ }
+ nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
+ ret = jbd2_fc_wait_bufs(journal, nblks);
+ if (ret < 0) {
+ status = EXT4_FC_STATUS_FAILED;
+ goto fallback;
+ }
+ atomic_inc(&sbi->s_fc_subtid);
+ ret = jbd2_fc_end_commit(journal);
+ set_task_ioprio(current, old_ioprio);
+ /*
+ * weight the commit time higher than the average time so we
+ * don't react too strongly to vast changes in the commit time
+ */
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
+ return ret;
+
+fallback:
+ set_task_ioprio(current, old_ioprio);
+ ret = jbd2_fc_end_commit_fallback(journal);
+ ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
+ return ret;
+}
+
+/*
+ * Fast commit cleanup routine. This is called after every fast commit and
+ * full commit. full is true if we are called after a full commit.
+ */
+static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ struct ext4_fc_dentry_update *fc_dentry;
+
+ if (full && sbi->s_fc_bh)
+ sbi->s_fc_bh = NULL;
+
+ trace_ext4_fc_cleanup(journal, full, tid);
+ jbd2_fc_release_bufs(journal);
+
+ mutex_lock(&sbi->s_fc_lock);
+ while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+ ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
+ struct ext4_inode_info,
+ i_fc_list);
+ list_del_init(&ei->i_fc_list);
+ ext4_clear_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ if (tid_geq(tid, ei->i_sync_tid)) {
+ ext4_fc_reset_inode(&ei->vfs_inode);
+ } else if (full) {
+ /*
+ * We are called after a full commit, inode has been
+ * modified while the commit was running. Re-enqueue
+ * the inode into STAGING, which will then be splice
+ * back into MAIN. This cannot happen during
+ * fastcommit because the journal is locked all the
+ * time in that case (and tid doesn't increase so
+ * tid check above isn't reliable).
+ */
+ list_add_tail(&ei->i_fc_list,
+ &sbi->s_fc_q[FC_Q_STAGING]);
+ }
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
+ smp_mb();
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
+#else
+ wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
+#endif
+ }
+
+ while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
+ fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
+ struct ext4_fc_dentry_update,
+ fcd_list);
+ list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
+
+ release_dentry_name_snapshot(&fc_dentry->fcd_name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+ }
+
+ list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
+ &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
+ &sbi->s_fc_q[FC_Q_MAIN]);
+
+ if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
+ sbi->s_fc_ineligible_tid = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ }
+
+ if (full)
+ sbi->s_fc_bytes = 0;
+ mutex_unlock(&sbi->s_fc_lock);
+ trace_ext4_fc_stats(sb);
+}
+
+/* Ext4 Replay Path Routines */
+
+/* Helper struct for dentry replay routines */
+struct dentry_info_args {
+ int parent_ino, dname_len, ino, inode_len;
+ char *dname;
+};
+
+/* Same as struct ext4_fc_tl, but uses native endianness fields */
+struct ext4_fc_tl_mem {
+ u16 fc_tag;
+ u16 fc_len;
+};
+
+static inline void tl_to_darg(struct dentry_info_args *darg,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct ext4_fc_dentry_info fcd;
+
+ memcpy(&fcd, val, sizeof(fcd));
+
+ darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
+ darg->ino = le32_to_cpu(fcd.fc_ino);
+ darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
+ darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
+}
+
+static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct ext4_fc_tl tl_disk;
+
+ memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
+ tl->fc_len = le16_to_cpu(tl_disk.fc_len);
+ tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
+}
+
+/* Unlink replay function */
+static int ext4_fc_replay_unlink(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct inode *inode, *old_parent;
+ struct qstr entry;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl, val);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ entry.name = darg.dname;
+ entry.len = darg.dname_len;
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode %d not found", darg.ino);
+ return 0;
+ }
+
+ old_parent = ext4_iget(sb, darg.parent_ino,
+ EXT4_IGET_NORMAL);
+ if (IS_ERR(old_parent)) {
+ ext4_debug("Dir with inode %d not found", darg.parent_ino);
+ iput(inode);
+ return 0;
+ }
+
+ ret = __ext4_unlink(old_parent, &entry, inode, NULL);
+ /* -ENOENT ok coz it might not exist anymore. */
+ if (ret == -ENOENT)
+ ret = 0;
+ iput(old_parent);
+ iput(inode);
+ return ret;
+}
+
+static int ext4_fc_replay_link_internal(struct super_block *sb,
+ struct dentry_info_args *darg,
+ struct inode *inode)
+{
+ struct inode *dir = NULL;
+ struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
+ struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
+ int ret = 0;
+
+ dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(dir)) {
+ ext4_debug("Dir with inode %d not found.", darg->parent_ino);
+ dir = NULL;
+ goto out;
+ }
+
+ dentry_dir = d_obtain_alias(dir);
+ if (IS_ERR(dentry_dir)) {
+ ext4_debug("Failed to obtain dentry");
+ dentry_dir = NULL;
+ goto out;
+ }
+
+ dentry_inode = d_alloc(dentry_dir, &qstr_dname);
+ if (!dentry_inode) {
+ ext4_debug("Inode dentry not created.");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __ext4_link(dir, inode, dentry_inode);
+ /*
+ * It's possible that link already existed since data blocks
+ * for the dir in question got persisted before we crashed OR
+ * we replayed this tag and crashed before the entire replay
+ * could complete.
+ */
+ if (ret && ret != -EEXIST) {
+ ext4_debug("Failed to link\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (dentry_dir) {
+ d_drop(dentry_dir);
+ dput(dentry_dir);
+ } else if (dir) {
+ iput(dir);
+ }
+ if (dentry_inode) {
+ d_drop(dentry_inode);
+ dput(dentry_inode);
+ }
+
+ return ret;
+}
+
+/* Link replay function */
+static int ext4_fc_replay_link(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct inode *inode;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl, val);
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record all the modified inodes during replay. We use this later to setup
+ * block bitmaps correctly.
+ */
+static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
+{
+ struct ext4_fc_replay_state *state;
+ int i;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++)
+ if (state->fc_modified_inodes[i] == ino)
+ return 0;
+ if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
+ int *fc_modified_inodes;
+
+ fc_modified_inodes = krealloc(state->fc_modified_inodes,
+ sizeof(int) * (state->fc_modified_inodes_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_modified_inodes)
+ return -ENOMEM;
+ state->fc_modified_inodes = fc_modified_inodes;
+ state->fc_modified_inodes_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ }
+ state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
+ return 0;
+}
+
+/*
+ * Inode replay function
+ */
+static int ext4_fc_replay_inode(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct ext4_fc_inode fc_inode;
+ struct ext4_inode *raw_inode;
+ struct ext4_inode *raw_fc_inode;
+ struct inode *inode = NULL;
+ struct ext4_iloc iloc;
+ int inode_len, ino, ret, tag = tl->fc_tag;
+ struct ext4_extent_header *eh;
+ size_t off_gen = offsetof(struct ext4_inode, i_generation);
+
+ memcpy(&fc_inode, val, sizeof(fc_inode));
+
+ ino = le32_to_cpu(fc_inode.fc_ino);
+ trace_ext4_fc_replay(sb, tag, ino, 0, 0);
+
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (!IS_ERR(inode)) {
+ ext4_ext_clear_bb(inode);
+ iput(inode);
+ }
+ inode = NULL;
+
+ ret = ext4_fc_record_modified_inode(sb, ino);
+ if (ret)
+ goto out;
+
+ raw_fc_inode = (struct ext4_inode *)
+ (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
+ ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
+ if (ret)
+ goto out;
+
+ inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
+ raw_inode = ext4_raw_inode(&iloc);
+
+ memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
+ memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
+ inode_len - off_gen);
+ if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
+ eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
+ if (eh->eh_magic != EXT4_EXT_MAGIC) {
+ memset(eh, 0, sizeof(*eh));
+ eh->eh_magic = EXT4_EXT_MAGIC;
+ eh->eh_max = cpu_to_le16(
+ (sizeof(raw_inode->i_block) -
+ sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent));
+ }
+ } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
+ memcpy(raw_inode->i_block, raw_fc_inode->i_block,
+ sizeof(raw_inode->i_block));
+ }
+
+ /* Immediately update the inode on disk. */
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ if (ret)
+ goto out;
+ ret = sync_dirty_buffer(iloc.bh);
+ if (ret)
+ goto out;
+ ret = ext4_mark_inode_used(sb, ino);
+ if (ret)
+ goto out;
+
+ /* Given that we just wrote the inode on disk, this SHOULD succeed. */
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode not found.");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Our allocator could have made different decisions than before
+ * crashing. This should be fixed but until then, we calculate
+ * the number of blocks the inode.
+ */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ ext4_ext_replay_set_iblocks(inode);
+
+ inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
+ ext4_reset_inode_seed(inode);
+
+ ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ sync_dirty_buffer(iloc.bh);
+ brelse(iloc.bh);
+out:
+ iput(inode);
+ if (!ret)
+ blkdev_issue_flush(sb->s_bdev);
+
+ return 0;
+}
+
+/*
+ * Dentry create replay function.
+ *
+ * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
+ * inode for which we are trying to create a dentry here, should already have
+ * been replayed before we start here.
+ */
+static int ext4_fc_replay_create(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ int ret = 0;
+ struct inode *inode = NULL;
+ struct inode *dir = NULL;
+ struct dentry_info_args darg;
+
+ tl_to_darg(&darg, tl, val);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ /* This takes care of update group descriptor and other metadata */
+ ret = ext4_mark_inode_used(sb, darg.ino);
+ if (ret)
+ goto out;
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("inode %d not found.", darg.ino);
+ inode = NULL;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ /*
+ * If we are creating a directory, we need to make sure that the
+ * dot and dot dot dirents are setup properly.
+ */
+ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(dir)) {
+ ext4_debug("Dir %d not found.", darg.ino);
+ goto out;
+ }
+ ret = ext4_init_new_dir(NULL, dir, inode);
+ iput(dir);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ if (ret)
+ goto out;
+ set_nlink(inode, 1);
+ ext4_mark_inode_dirty(NULL, inode);
+out:
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record physical disk regions which are in use as per fast commit area,
+ * and used by inodes during replay phase. Our simple replay phase
+ * allocator excludes these regions from allocation.
+ */
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
+{
+ struct ext4_fc_replay_state *state;
+ struct ext4_fc_alloc_region *region;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ /*
+ * during replay phase, the fc_regions_valid may not same as
+ * fc_regions_used, update it when do new additions.
+ */
+ if (replay && state->fc_regions_used != state->fc_regions_valid)
+ state->fc_regions_used = state->fc_regions_valid;
+ if (state->fc_regions_used == state->fc_regions_size) {
+ struct ext4_fc_alloc_region *fc_regions;
+
+ fc_regions = krealloc(state->fc_regions,
+ sizeof(struct ext4_fc_alloc_region) *
+ (state->fc_regions_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_regions)
+ return -ENOMEM;
+ state->fc_regions_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ state->fc_regions = fc_regions;
+ }
+ region = &state->fc_regions[state->fc_regions_used++];
+ region->ino = ino;
+ region->lblk = lblk;
+ region->pblk = pblk;
+ region->len = len;
+
+ if (replay)
+ state->fc_regions_valid++;
+
+ return 0;
+}
+
+/* Replay add range tag */
+static int ext4_fc_replay_add_range(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct ext4_fc_add_range fc_add_ex;
+ struct ext4_extent newex, *ex;
+ struct inode *inode;
+ ext4_lblk_t start, cur;
+ int remaining, len;
+ ext4_fsblk_t start_pblk;
+ struct ext4_map_blocks map;
+ struct ext4_ext_path *path = NULL;
+ int ret;
+
+ memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
+ ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
+ le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex));
+
+ inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
+
+ start = le32_to_cpu(ex->ee_block);
+ start_pblk = ext4_ext_pblock(ex);
+ len = ext4_ext_get_actual_len(ex);
+
+ cur = start;
+ remaining = len;
+ ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+ start, start_pblk, len, ext4_ext_is_unwritten(ex),
+ inode->i_ino);
+
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+ map.m_pblk = 0;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ /* Range is not mapped */
+ path = ext4_find_extent(inode, cur, path, 0);
+ if (IS_ERR(path))
+ goto out;
+ memset(&newex, 0, sizeof(newex));
+ newex.ee_block = cpu_to_le32(cur);
+ ext4_ext_store_pblock(
+ &newex, start_pblk + cur - start);
+ newex.ee_len = cpu_to_le16(map.m_len);
+ if (ext4_ext_is_unwritten(ex))
+ ext4_ext_mark_unwritten(&newex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ path = ext4_ext_insert_extent(NULL, inode,
+ path, &newex, 0);
+ up_write((&EXT4_I(inode)->i_data_sem));
+ if (IS_ERR(path))
+ goto out;
+ goto next;
+ }
+
+ if (start_pblk + cur - start != map.m_pblk) {
+ /*
+ * Logical to physical mapping changed. This can happen
+ * if this range was removed and then reallocated to
+ * map to new physical blocks during a fast commit.
+ */
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex),
+ start_pblk + cur - start);
+ if (ret)
+ goto out;
+ /*
+ * Mark the old blocks as free since they aren't used
+ * anymore. We maintain an array of all the modified
+ * inodes. In case these blocks are still used at either
+ * a different logical range in the same inode or in
+ * some different inode, we will mark them as allocated
+ * at the end of the FC replay using our array of
+ * modified inodes.
+ */
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
+ goto next;
+ }
+
+ /* Range is mapped and needs a state change */
+ ext4_debug("Converting from %ld to %d %lld",
+ map.m_flags & EXT4_MAP_UNWRITTEN,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ if (ret)
+ goto out;
+ /*
+ * We may have split the extent tree while toggling the state.
+ * Try to shrink the extent tree now.
+ */
+ ext4_ext_replay_shrink_inode(inode, start + len);
+next:
+ cur += map.m_len;
+ remaining -= map.m_len;
+ }
+ ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
+ sb->s_blocksize_bits);
+out:
+ ext4_free_ext_path(path);
+ iput(inode);
+ return 0;
+}
+
+/* Replay DEL_RANGE tag */
+static int
+ext4_fc_replay_del_range(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
+{
+ struct inode *inode;
+ struct ext4_fc_del_range lrange;
+ struct ext4_map_blocks map;
+ ext4_lblk_t cur, remaining;
+ int ret;
+
+ memcpy(&lrange, val, sizeof(lrange));
+ cur = le32_to_cpu(lrange.fc_lblk);
+ remaining = le32_to_cpu(lrange.fc_len);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
+ le32_to_cpu(lrange.fc_ino), cur, remaining);
+
+ inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
+
+ ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
+ inode->i_ino, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_len));
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ remaining -= ret;
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
+ } else {
+ remaining -= map.m_len;
+ cur += map.m_len;
+ }
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_lblk) +
+ le32_to_cpu(lrange.fc_len) - 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ ext4_ext_replay_shrink_inode(inode,
+ i_size_read(inode) >> sb->s_blocksize_bits);
+ ext4_mark_inode_dirty(NULL, inode);
+out:
+ iput(inode);
+ return 0;
+}
+
+static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+{
+ struct ext4_fc_replay_state *state;
+ struct inode *inode;
+ struct ext4_ext_path *path = NULL;
+ struct ext4_map_blocks map;
+ int i, ret, j;
+ ext4_lblk_t cur, end;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++) {
+ inode = ext4_iget(sb, state->fc_modified_inodes[i],
+ EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode %d not found.",
+ state->fc_modified_inodes[i]);
+ continue;
+ }
+ cur = 0;
+ end = EXT_MAX_BLOCKS;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
+ iput(inode);
+ continue;
+ }
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, path, 0);
+ if (!IS_ERR(path)) {
+ for (j = 0; j < path->p_depth; j++)
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, true);
+ } else {
+ path = NULL;
+ }
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
+ map.m_len, true);
+ } else {
+ cur = cur + (map.m_len ? map.m_len : 1);
+ }
+ }
+ iput(inode);
+ }
+
+ ext4_free_ext_path(path);
+}
+
+/*
+ * Check if block is in excluded regions for block allocation. The simple
+ * allocator that runs during replay phase is calls this function to see
+ * if it is okay to use a block.
+ */
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
+{
+ int i;
+ struct ext4_fc_replay_state *state;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_regions_valid; i++) {
+ if (state->fc_regions[i].ino == 0 ||
+ state->fc_regions[i].len == 0)
+ continue;
+ if (in_range(blk, state->fc_regions[i].pblk,
+ state->fc_regions[i].len))
+ return true;
+ }
+ return false;
+}
+
+/* Cleanup function called after replay */
+void ext4_fc_replay_cleanup(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ sbi->s_mount_state &= ~EXT4_FC_REPLAY;
+ kfree(sbi->s_fc_replay_state.fc_regions);
+ kfree(sbi->s_fc_replay_state.fc_modified_inodes);
+}
+
+static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
+ int tag, int len)
+{
+ switch (tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ return len == sizeof(struct ext4_fc_add_range);
+ case EXT4_FC_TAG_DEL_RANGE:
+ return len == sizeof(struct ext4_fc_del_range);
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ len -= sizeof(struct ext4_fc_dentry_info);
+ return len >= 1 && len <= EXT4_NAME_LEN;
+ case EXT4_FC_TAG_INODE:
+ len -= sizeof(struct ext4_fc_inode);
+ return len >= EXT4_GOOD_OLD_INODE_SIZE &&
+ len <= sbi->s_inode_size;
+ case EXT4_FC_TAG_PAD:
+ return true; /* padding can have any length */
+ case EXT4_FC_TAG_TAIL:
+ return len >= sizeof(struct ext4_fc_tail);
+ case EXT4_FC_TAG_HEAD:
+ return len == sizeof(struct ext4_fc_head);
+ }
+ return false;
+}
+
+/*
+ * Recovery Scan phase handler
+ *
+ * This function is called during the scan phase and is responsible
+ * for doing following things:
+ * - Make sure the fast commit area has valid tags for replay
+ * - Count number of tags that need to be replayed by the replay handler
+ * - Verify CRC
+ * - Create a list of excluded blocks for allocation during replay phase
+ *
+ * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
+ * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
+ * to indicate that scan has finished and JBD2 can now start replay phase.
+ * It returns a negative error to indicate that there was an error. At the end
+ * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
+ * to indicate the number of tags that need to replayed during the replay phase.
+ */
+static int ext4_fc_replay_scan(journal_t *journal,
+ struct buffer_head *bh, int off,
+ tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_replay_state *state;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_add_range ext;
+ struct ext4_fc_tl_mem tl;
+ struct ext4_fc_tail tail;
+ __u8 *start, *end, *cur, *val;
+ struct ext4_fc_head head;
+ struct ext4_extent *ex;
+
+ state = &sbi->s_fc_replay_state;
+
+ start = (u8 *)bh->b_data;
+ end = start + journal->j_blocksize;
+
+ if (state->fc_replay_expected_off == 0) {
+ state->fc_cur_tag = 0;
+ state->fc_replay_num_tags = 0;
+ state->fc_crc = 0;
+ state->fc_regions = NULL;
+ state->fc_regions_valid = state->fc_regions_used =
+ state->fc_regions_size = 0;
+ /* Check if we can stop early */
+ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
+ != EXT4_FC_TAG_HEAD)
+ return 0;
+ }
+
+ if (off != state->fc_replay_expected_off) {
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+
+ state->fc_replay_expected_off++;
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
+ if (tl.fc_len > end - val ||
+ !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ goto out_err;
+ }
+ ext4_debug("Scan phase, tag:%s, blk %lld\n",
+ tag2str(tl.fc_tag), bh->b_blocknr);
+ switch (tl.fc_tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ memcpy(&ext, val, sizeof(ext));
+ ex = (struct ext4_extent *)&ext.fc_ex;
+ ret = ext4_fc_record_regions(sb,
+ le32_to_cpu(ext.fc_ino),
+ le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
+ ext4_ext_get_actual_len(ex), 0);
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ fallthrough;
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ case EXT4_FC_TAG_PAD:
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
+ break;
+ case EXT4_FC_TAG_TAIL:
+ state->fc_cur_tag++;
+ memcpy(&tail, val, sizeof(tail));
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
+ EXT4_FC_TAG_BASE_LEN +
+ offsetof(struct ext4_fc_tail,
+ fc_crc));
+ if (le32_to_cpu(tail.fc_tid) == expected_tid &&
+ le32_to_cpu(tail.fc_crc) == state->fc_crc) {
+ state->fc_replay_num_tags = state->fc_cur_tag;
+ state->fc_regions_valid =
+ state->fc_regions_used;
+ } else {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -EFSBADCRC;
+ }
+ state->fc_crc = 0;
+ break;
+ case EXT4_FC_TAG_HEAD:
+ memcpy(&head, val, sizeof(head));
+ if (le32_to_cpu(head.fc_features) &
+ ~EXT4_FC_SUPPORTED_FEATURES) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ if (le32_to_cpu(head.fc_tid) != expected_tid) {
+ ret = JBD2_FC_REPLAY_STOP;
+ break;
+ }
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
+ break;
+ default:
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ }
+ if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
+ break;
+ }
+
+out_err:
+ trace_ext4_fc_replay_scan(sb, ret, off);
+ return ret;
+}
+
+/*
+ * Main recovery path entry point.
+ * The meaning of return codes is similar as above.
+ */
+static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
+ enum passtype pass, int off, tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl_mem tl;
+ __u8 *start, *end, *cur, *val;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
+ struct ext4_fc_tail tail;
+
+ if (pass == PASS_SCAN) {
+ state->fc_current_pass = PASS_SCAN;
+ return ext4_fc_replay_scan(journal, bh, off, expected_tid);
+ }
+
+ if (state->fc_current_pass != pass) {
+ state->fc_current_pass = pass;
+ sbi->s_mount_state |= EXT4_FC_REPLAY;
+ }
+ if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
+ ext4_debug("Replay stops\n");
+ ext4_fc_set_bitmaps_and_counters(sb);
+ return 0;
+ }
+
+#ifdef CONFIG_EXT4_DEBUG
+ if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
+ pr_warn("Dropping fc block %d because max_replay set\n", off);
+ return JBD2_FC_REPLAY_STOP;
+ }
+#endif
+
+ start = (u8 *)bh->b_data;
+ end = start + journal->j_blocksize;
+
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
+
+ if (state->fc_replay_num_tags == 0) {
+ ret = JBD2_FC_REPLAY_STOP;
+ ext4_fc_set_bitmaps_and_counters(sb);
+ break;
+ }
+
+ ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
+ state->fc_replay_num_tags--;
+ switch (tl.fc_tag) {
+ case EXT4_FC_TAG_LINK:
+ ret = ext4_fc_replay_link(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_UNLINK:
+ ret = ext4_fc_replay_unlink(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_ADD_RANGE:
+ ret = ext4_fc_replay_add_range(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_CREAT:
+ ret = ext4_fc_replay_create(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_DEL_RANGE:
+ ret = ext4_fc_replay_del_range(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_INODE:
+ ret = ext4_fc_replay_inode(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_PAD:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
+ tl.fc_len, 0);
+ break;
+ case EXT4_FC_TAG_TAIL:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
+ 0, tl.fc_len, 0);
+ memcpy(&tail, val, sizeof(tail));
+ WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
+ break;
+ case EXT4_FC_TAG_HEAD:
+ break;
+ default:
+ trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
+ ret = -ECANCELED;
+ break;
+ }
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ }
+ return ret;
+}
+
+void ext4_fc_init(struct super_block *sb, journal_t *journal)
+{
+ /*
+ * We set replay callback even if fast commit disabled because we may
+ * could still have fast commit blocks that need to be replayed even if
+ * fast commit has now been turned off.
+ */
+ journal->j_fc_replay_callback = ext4_fc_replay;
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+ return;
+ journal->j_fc_cleanup_callback = ext4_fc_cleanup;
+}
+
+static const char * const fc_ineligible_reasons[] = {
+ [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
+ [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
+ [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
+ [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
+ [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
+ [EXT4_FC_REASON_RESIZE] = "Resize",
+ [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
+ [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
+ [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
+ [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
+};
+
+int ext4_fc_info_show(struct seq_file *seq, void *v)
+{
+ struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
+ struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+ int i;
+
+ if (v != SEQ_START_TOKEN)
+ return 0;
+
+ seq_printf(seq,
+ "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
+ stats->fc_num_commits, stats->fc_ineligible_commits,
+ stats->fc_numblks,
+ div_u64(stats->s_fc_avg_commit_time, 1000));
+ seq_puts(seq, "Ineligible reasons:\n");
+ for (i = 0; i < EXT4_FC_REASON_MAX; i++)
+ seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
+ stats->fc_ineligible_reason_count[i]);
+
+ return 0;
+}
+
+int __init ext4_fc_init_dentry_cache(void)
+{
+ ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
+ SLAB_RECLAIM_ACCOUNT);
+
+ if (ext4_fc_dentry_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ext4_fc_destroy_dentry_cache(void)
+{
+ kmem_cache_destroy(ext4_fc_dentry_cachep);
+}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
new file mode 100644
index 000000000000..3bd534e4dbbf
--- /dev/null
+++ b/fs/ext4/fast_commit.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __FAST_COMMIT_H__
+#define __FAST_COMMIT_H__
+
+/*
+ * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and
+ * linux/fs/ext4/fast_commit.h. These file should always be byte identical.
+ */
+
+/* Fast commit tags */
+#define EXT4_FC_TAG_ADD_RANGE 0x0001
+#define EXT4_FC_TAG_DEL_RANGE 0x0002
+#define EXT4_FC_TAG_CREAT 0x0003
+#define EXT4_FC_TAG_LINK 0x0004
+#define EXT4_FC_TAG_UNLINK 0x0005
+#define EXT4_FC_TAG_INODE 0x0006
+#define EXT4_FC_TAG_PAD 0x0007
+#define EXT4_FC_TAG_TAIL 0x0008
+#define EXT4_FC_TAG_HEAD 0x0009
+
+#define EXT4_FC_SUPPORTED_FEATURES 0x0
+
+/* On disk fast commit tlv value structures */
+
+/* Fast commit on disk tag length structure */
+struct ext4_fc_tl {
+ __le16 fc_tag;
+ __le16 fc_len;
+};
+
+/* Value structure for tag EXT4_FC_TAG_HEAD. */
+struct ext4_fc_head {
+ __le32 fc_features;
+ __le32 fc_tid;
+};
+
+/* Value structure for EXT4_FC_TAG_ADD_RANGE. */
+struct ext4_fc_add_range {
+ __le32 fc_ino;
+ __u8 fc_ex[12];
+};
+
+/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. */
+struct ext4_fc_del_range {
+ __le32 fc_ino;
+ __le32 fc_lblk;
+ __le32 fc_len;
+};
+
+/*
+ * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK
+ * and EXT4_FC_TAG_UNLINK.
+ */
+struct ext4_fc_dentry_info {
+ __le32 fc_parent_ino;
+ __le32 fc_ino;
+ __u8 fc_dname[];
+};
+
+/* Value structure for EXT4_FC_TAG_INODE. */
+struct ext4_fc_inode {
+ __le32 fc_ino;
+ __u8 fc_raw_inode[];
+};
+
+/* Value structure for tag EXT4_FC_TAG_TAIL. */
+struct ext4_fc_tail {
+ __le32 fc_tid;
+ __le32 fc_crc;
+};
+
+/* Tag base length */
+#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl))
+
+/*
+ * Fast commit status codes
+ */
+enum {
+ EXT4_FC_STATUS_OK = 0,
+ EXT4_FC_STATUS_INELIGIBLE,
+ EXT4_FC_STATUS_SKIPPED,
+ EXT4_FC_STATUS_FAILED,
+};
+
+/*
+ * Fast commit ineligiblity reasons:
+ */
+enum {
+ EXT4_FC_REASON_XATTR = 0,
+ EXT4_FC_REASON_CROSS_RENAME,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
+ EXT4_FC_REASON_NOMEM,
+ EXT4_FC_REASON_SWAP_BOOT,
+ EXT4_FC_REASON_RESIZE,
+ EXT4_FC_REASON_RENAME_DIR,
+ EXT4_FC_REASON_FALLOC_RANGE,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA,
+ EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ EXT4_FC_REASON_MAX
+};
+
+#ifdef __KERNEL__
+/*
+ * In memory list of dentry updates that are performed on the file
+ * system used by fast commit code.
+ */
+struct ext4_fc_dentry_update {
+ int fcd_op; /* Type of update create / unlink / link */
+ int fcd_parent; /* Parent inode number */
+ int fcd_ino; /* Inode number */
+ struct name_snapshot fcd_name; /* Dirent name */
+ struct list_head fcd_list;
+ struct list_head fcd_dilist;
+};
+
+struct ext4_fc_stats {
+ unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
+ unsigned long fc_num_commits;
+ unsigned long fc_ineligible_commits;
+ unsigned long fc_failed_commits;
+ unsigned long fc_skipped_commits;
+ unsigned long fc_numblks;
+ u64 s_fc_avg_commit_time;
+};
+
+#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4
+
+/*
+ * Physical block regions added to different inodes due to fast commit
+ * recovery. These are set during the SCAN phase. During the replay phase,
+ * our allocator excludes these from its allocation. This ensures that
+ * we don't accidentally allocating a block that is going to be used by
+ * another inode.
+ */
+struct ext4_fc_alloc_region {
+ ext4_lblk_t lblk;
+ ext4_fsblk_t pblk;
+ int ino, len;
+};
+
+/*
+ * Fast commit replay state.
+ */
+struct ext4_fc_replay_state {
+ int fc_replay_num_tags;
+ int fc_replay_expected_off;
+ int fc_current_pass;
+ int fc_cur_tag;
+ int fc_crc;
+ struct ext4_fc_alloc_region *fc_regions;
+ int fc_regions_size, fc_regions_used, fc_regions_valid;
+ int *fc_modified_inodes;
+ int fc_modified_inodes_used, fc_modified_inodes_size;
+};
+
+#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
+#endif
+
+static inline const char *tag2str(__u16 tag)
+{
+ switch (tag) {
+ case EXT4_FC_TAG_LINK:
+ return "ADD_ENTRY";
+ case EXT4_FC_TAG_UNLINK:
+ return "DEL_ENTRY";
+ case EXT4_FC_TAG_ADD_RANGE:
+ return "ADD_RANGE";
+ case EXT4_FC_TAG_CREAT:
+ return "CREAT_DENTRY";
+ case EXT4_FC_TAG_DEL_RANGE:
+ return "DEL_RANGE";
+ case EXT4_FC_TAG_INODE:
+ return "INODE";
+ case EXT4_FC_TAG_PAD:
+ return "PAD";
+ case EXT4_FC_TAG_TAIL:
+ return "TAIL";
+ case EXT4_FC_TAG_HEAD:
+ return "HEAD";
+ default:
+ return "ERROR";
+ }
+}
+
+#endif /* __FAST_COMMIT_H__ */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a01e31a032c..7a8b30932189 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,17 +36,34 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct inode *inode)
+/*
+ * Returns %true if the given DIO request should be attempted with DIO, or
+ * %false if it should fall back to buffered I/O.
+ *
+ * DIO isn't well specified; when it's unsupported (either due to the request
+ * being misaligned, or due to the file not supporting DIO at all), filesystems
+ * either fall back to buffered I/O or return EINVAL. For files that don't use
+ * any special features like encryption or verity, ext4 has traditionally
+ * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
+ * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
+ *
+ * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
+ * traditionally falls back to buffered I/O.
+ *
+ * This function implements the traditional ext4 behavior in all these cases.
+ */
+static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
- if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
- return false;
- if (fsverity_active(inode))
- return false;
- if (ext4_should_journal_data(inode))
- return false;
- if (ext4_has_inline_data(inode))
+ struct inode *inode = file_inode(iocb->ki_filp);
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ if (dio_align == 0)
return false;
- return true;
+
+ if (dio_align == 1)
+ return true;
+
+ return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -61,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_should_use_dio(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -74,8 +91,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -115,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (!iov_iter_count(to))
@@ -131,6 +147,17 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
+static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
/*
* Called when an inode is released. Note that this is different
* from ext4_file_open: open gets called at every open, but release
@@ -145,8 +172,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks)
- {
+ !EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
@@ -187,8 +213,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
return false;
}
-/* Is IO overwriting allocated and initialized blocks? */
-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+/* Is IO overwriting allocated or initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode,
+ loff_t pos, loff_t len, bool *unwritten)
{
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
@@ -202,12 +229,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
blklen = map.m_len;
err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err != blklen)
+ return false;
/*
* 'err==len' means that all of the blocks have been preallocated,
- * regardless of whether they have been initialized or not. To exclude
- * unwritten extents, we need to check m_flags.
+ * regardless of whether they have been initialized or not. We need to
+ * check m_flags to distinguish the unwritten extents.
*/
- return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
+ return true;
}
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
@@ -266,95 +296,48 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- current->backing_dev_info = inode_to_bdi(inode);
- ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
- current->backing_dev_info = NULL;
+ ret = generic_perform_write(iocb, from);
out:
inode_unlock(inode);
- if (likely(ret > 0)) {
- iocb->ki_pos += ret;
- ret = generic_write_sync(iocb, ret);
- }
-
- return ret;
+ if (unlikely(ret <= 0))
+ return ret;
+ return generic_write_sync(iocb, ret);
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t written, size_t count)
+ ssize_t written, ssize_t count)
{
handle_t *handle;
- bool truncate = false;
- u8 blkbits = inode->i_blkbits;
- ext4_lblk_t written_blk, end_blk;
- int ret;
-
- /*
- * Note that EXT4_I(inode)->i_disksize can get extended up to
- * inode->i_size while the I/O was running due to writeback of delalloc
- * blocks. But, the code in ext4_iomap_alloc() is careful to use
- * zeroed/unwritten extents if this is possible; thus we won't leave
- * uninitialized blocks in a file even if we didn't succeed in writing
- * as much as we intended.
- */
- WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
- if (offset + count <= EXT4_I(inode)->i_disksize) {
- /*
- * We need to ensure that the inode is removed from the orphan
- * list if it has been added prematurely, due to writeback of
- * delalloc blocks.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
- }
-
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- }
-
- return written;
- }
-
- if (written < 0)
- goto truncate;
+ lockdep_assert_held_write(&inode->i_rwsem);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- written = PTR_ERR(handle);
- goto truncate;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
if (ext4_update_inode_size(inode, offset + written)) {
- ret = ext4_mark_inode_dirty(handle, inode);
+ int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
- written = ret;
ext4_journal_stop(handle);
- goto truncate;
+ return ret;
}
}
- /*
- * We may need to truncate allocated but not written blocks beyond EOF.
- */
- written_blk = ALIGN(offset + written, 1 << blkbits);
- end_blk = ALIGN(offset + count, 1 << blkbits);
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
-
- /*
- * Remove the inode from the orphan list if it has been extended and
- * everything went OK.
- */
- if (!truncate && inode->i_nlink)
+ if ((written == count) && inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- if (truncate) {
-truncate:
+ return written;
+}
+
+/*
+ * Clean up the inode after DIO or DAX extending write has completed and the
+ * inode size has been updated using ext4_handle_inode_extension().
+ */
+static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
+{
+ lockdep_assert_held_write(&inode->i_rwsem);
+ if (need_trunc) {
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
@@ -363,25 +346,60 @@ truncate:
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
+ return;
}
+ /*
+ * If i_disksize got extended either due to writeback of delalloc
+ * blocks or extending truncate while the DIO was running we could fail
+ * to cleanup the orphan list in ext4_handle_inode_extension(). Do it
+ * now.
+ */
+ if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) {
+ handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- return written;
+ if (IS_ERR(handle)) {
+ /*
+ * The write has successfully completed. Not much to
+ * do with the error here so just cleanup the orphan
+ * list and hope for the best.
+ */
+ ext4_orphan_del(NULL, inode);
+ return;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
}
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
int error, unsigned int flags)
{
- loff_t offset = iocb->ki_pos;
+ loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
+ (iocb->ki_flags & IOCB_ATOMIC))
+ error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
+ size);
+ else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
-
- if (size && flags & IOMAP_DIO_UNWRITTEN)
- return ext4_convert_unwritten_extents(NULL, inode,
- offset, size);
-
- return 0;
+ /*
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended. Also we can race with truncate or write
+ * expanding the file so we have to be a bit careful here.
+ */
+ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
+ pos + size <= i_size_read(inode))
+ return 0;
+ error = ext4_handle_inode_extension(inode, pos, size, size);
+ return error < 0 ? error : 0;
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -399,17 +417,23 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
* - For extending writes case we don't take the shared lock, since it requires
* updating inode i_disksize and/or orphan handling with exclusive lock.
*
- * - shared locking will only be true mostly with overwrites. Otherwise we will
- * switch to exclusive i_rwsem lock.
+ * - shared locking will only be true mostly with overwrites, including
+ * initialized blocks and unwritten blocks. For overwrite unwritten blocks
+ * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
+ * also release exclusive i_rwsem lock.
+ *
+ * - Otherwise we will switch to exclusive i_rwsem lock.
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
- bool *ilock_shared, bool *extend)
+ bool *ilock_shared, bool *extend,
+ bool *unwritten, int *dio_flags)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
loff_t offset;
size_t count;
ssize_t ret;
+ bool overwrite, unaligned_io;
restart:
ret = ext4_generic_write_checks(iocb, from);
@@ -418,22 +442,52 @@ restart:
offset = iocb->ki_pos;
count = ret;
- if (ext4_extending_io(inode, offset, count))
- *extend = true;
+
+ unaligned_io = ext4_unaligned_io(inode, from, offset);
+ *extend = ext4_extending_io(inode, offset, count);
+ overwrite = ext4_overwrite_io(inode, offset, count, unwritten);
+
/*
- * Determine whether the IO operation will overwrite allocated
- * and initialized blocks.
- * We need exclusive i_rwsem for changing security info
- * in file_modified().
+ * Determine whether we need to upgrade to an exclusive lock. This is
+ * required to change security info in file_modified(), for extending
+ * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
+ * extents (as partial block zeroing may be required).
+ *
+ * Note that unaligned writes are allowed under shared lock so long as
+ * they are pure overwrites. Otherwise, concurrent unaligned writes risk
+ * data corruption due to partial block zeroing in the dio layer, and so
+ * the I/O must occur exclusively.
*/
- if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
- !ext4_overwrite_io(inode, offset, count))) {
+ if (*ilock_shared &&
+ ((!IS_NOSEC(inode) || *extend || !overwrite ||
+ (unaligned_io && *unwritten)))) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
inode_unlock_shared(inode);
*ilock_shared = false;
inode_lock(inode);
goto restart;
}
+ /*
+ * Now that locking is settled, determine dio flags and exclusivity
+ * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce
+ * behavior already. The inode lock is already held exclusive if the
+ * write is non-overwrite or extending, so drain all outstanding dio and
+ * set the force wait dio flag.
+ */
+ if (!*ilock_shared && (unaligned_io || *extend)) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (unaligned_io && (!overwrite || *unwritten))
+ inode_dio_wait(inode);
+ *dio_flags = IOMAP_DIO_FORCE_WAIT;
+ }
+
ret = file_modified(file);
if (ret < 0)
goto out;
@@ -455,18 +509,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
- bool extend = false, unaligned_io = false;
+ bool extend = false, unwritten = false;
bool ilock_shared = true;
+ int dio_flags = 0;
/*
- * We initially start with shared inode lock unless it is
- * unaligned IO which needs exclusive lock anyways.
- */
- if (ext4_unaligned_io(inode, from, offset)) {
- unaligned_io = true;
- ilock_shared = false;
- }
- /*
* Quick check here without any i_rwsem lock to see if it is extending
* IO. A more reliable check is done in ext4_dio_write_checks() with
* proper locking in place.
@@ -490,7 +537,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
@@ -498,32 +545,23 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_buffered_write_iter(iocb, from);
}
- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+ /*
+ * Prevent inline data from being created since we are going to allocate
+ * blocks for DIO. We know the inode does not currently have inline data
+ * because ext4_should_use_dio() checked for it, but we have to clear
+ * the state flag before the write checks because a lock cycle could
+ * introduce races with other writers.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
+ ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
+ &unwritten, &dio_flags);
if (ret <= 0)
return ret;
- /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
- if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
- ret = -EAGAIN;
- goto out;
- }
-
offset = iocb->ki_pos;
count = ret;
- /*
- * Unaligned direct IO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned IOs can result in data
- * corruption.
- *
- * So we make sure we don't allow any unaligned IO in flight.
- * For IOs where we need not wait (like unaligned non-AIO DIO),
- * below inode_dio_wait() may anyway become a no-op, since we start
- * with exclusive lock.
- */
- if (unaligned_io)
- inode_dio_wait(inode);
-
if (extend) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
@@ -532,21 +570,27 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
-
ext4_journal_stop(handle);
+ if (ret)
+ goto out;
}
- if (ilock_shared)
+ if (ilock_shared && !unwritten)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io || extend);
-
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ dio_flags, NULL, 0);
+ if (ret == -ENOTBLK)
+ ret = 0;
+ if (extend) {
+ /*
+ * We always perform extending DIO write synchronously so by
+ * now the IO is completed and ext4_handle_inode_extension()
+ * was called. Cleanup the inode in case of error or race with
+ * writeback of delalloc blocks.
+ */
+ WARN_ON_ONCE(ret == -EIOCBQUEUED);
+ ext4_inode_extension_cleanup(inode, ret < 0);
+ }
out:
if (ilock_shared)
@@ -558,6 +602,13 @@ out:
ssize_t err;
loff_t endbyte;
+ /*
+ * There is no support for atomic writes on buffered-io yet,
+ * we should never fallback to buffered-io for DIO atomic
+ * writes.
+ */
+ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
+
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
@@ -627,8 +678,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
- if (extend)
+ if (extend) {
ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
+ }
out:
inode_unlock(inode);
if (ret > 0)
@@ -640,24 +693,38 @@ out:
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
+ int ret;
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
+
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ size_t len = iov_iter_count(from);
+
+ if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
+ len > EXT4_SB(inode->i_sb)->s_awu_max)
+ return -EINVAL;
+
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
-
- return ext4_buffered_write_iter(iocb, from);
+ else
+ return ext4_buffered_write_iter(iocb, from);
}
#ifdef CONFIG_FS_DAX
-static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
int error = 0;
vm_fault_t result;
@@ -673,30 +740,31 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
* read-only.
*
* We check for VM_SHARED rather than vmf->cow_page since the latter is
- * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+ * unset for order != 0 (i.e. only in do_cow_fault); for
* other sizes, dax_iomap_fault will handle splitting / fallback so that
* we eventually come back with a COW page.
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
- pfn_t pfn;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ unsigned long pfn;
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
retry:
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else {
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
}
- result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
+ result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
@@ -705,11 +773,11 @@ retry:
goto retry;
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
- result = dax_finish_sync_fault(vmf, pe_size, pfn);
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ result = dax_finish_sync_fault(vmf, order, pfn);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
} else {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
}
return result;
@@ -717,7 +785,7 @@ retry:
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
- return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
+ return ext4_dax_huge_fault(vmf, 0);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
@@ -731,33 +799,38 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = ext4_filemap_fault,
+ .fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
-static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
{
+ int ret;
+ struct file *file = desc->file;
struct inode *inode = file->f_mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct dax_device *dax_dev = sbi->s_daxdev;
+ struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- if (unlikely(ext4_forced_shutdown(sbi)))
- return -EIO;
+ if (file->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
/*
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, dax_dev))
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
- vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_HUGEPAGE;
+ desc->vm_ops = &ext4_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
} else {
- vma->vm_ops = &ext4_file_vm_ops;
+ desc->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
@@ -771,13 +844,14 @@ static int ext4_sample_last_mounted(struct super_block *sb,
handle_t *handle;
int err;
- if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
+ if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
- if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !sb_start_intwrite_trylock(sb))
return 0;
- sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience
@@ -797,12 +871,15 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (IS_ERR(handle))
goto out;
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto out_journal;
- strlcpy(sbi->s_es->s_last_mounted, cp,
- sizeof(sbi->s_es->s_last_mounted));
- ext4_handle_dirty_super(handle, sb);
+ lock_buffer(sbi->s_sbh);
+ strtomem_pad(sbi->s_es->s_last_mounted, cp, 0);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
out_journal:
ext4_journal_stop(handle);
out:
@@ -810,12 +887,16 @@ out:
return err;
}
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ if (filp->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
if (ret)
@@ -839,7 +920,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
return ret;
}
- filp->f_mode |= FMODE_NOWAIT;
+ if (ext4_inode_can_atomic_write(inode))
+ filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+
+ filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}
@@ -851,12 +935,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
-
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = ext4_get_maxbytes(inode);
switch (whence) {
default:
@@ -885,28 +964,32 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = ext4_file_mmap,
- .mmap_supported_flags = MAP_SYNC,
+ .mmap_prepare = ext4_file_mmap_prepare,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
- .splice_read = generic_file_splice_read,
+ .splice_read = ext4_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_file_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
+ .fileattr_get = ext4_fileattr_get,
+ .fileattr_set = ext4_fileattr_set,
};
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index dbccf46f1770..22fc333244ef 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -74,7 +74,8 @@ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2)
static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info,
struct ext4_fsmap *rec)
{
- return rec->fmr_physical < info->gfi_low.fmr_physical;
+ return rec->fmr_physical + rec->fmr_length <=
+ info->gfi_low.fmr_physical;
}
/*
@@ -108,6 +109,9 @@ static int ext4_getfsmap_helper(struct super_block *sb,
/* Are we just counting mappings? */
if (info->gfi_head->fmh_count == 0) {
+ if (info->gfi_head->fmh_entries == UINT_MAX)
+ return EXT4_QUERY_RANGE_ABORT;
+
if (rec_fsblk > info->gfi_next_fsblk)
info->gfi_head->fmh_entries++;
@@ -182,6 +186,59 @@ static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr)
return fmr->fmr_physical + fmr->fmr_length;
}
+static int ext4_getfsmap_meta_helper(struct super_block *sb,
+ ext4_group_t agno, ext4_grpblk_t start,
+ ext4_grpblk_t len, void *priv)
+{
+ struct ext4_getfsmap_info *info = priv;
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t fsb, fs_start, fs_end;
+ int error;
+
+ fs_start = fsb = (EXT4_C2B(sbi, start) +
+ ext4_group_first_block_no(sb, agno));
+ fs_end = fs_start + EXT4_C2B(sbi, len);
+
+ /*
+ * Return relevant extents from the meta_list. We emit all extents that
+ * partially/fully overlap with the query range
+ */
+ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
+ if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) {
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ if (p->fmr_physical <= fs_end &&
+ p->fmr_physical + p->fmr_length > fs_start) {
+ /* Emit the retained free extent record if present */
+ if (info->gfi_lastfree.fmr_owner) {
+ error = ext4_getfsmap_helper(sb, info,
+ &info->gfi_lastfree);
+ if (error)
+ return error;
+ info->gfi_lastfree.fmr_owner = 0;
+ }
+ error = ext4_getfsmap_helper(sb, info, p);
+ if (error)
+ return error;
+ fsb = p->fmr_physical + p->fmr_length;
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ }
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+
+ return 0;
+}
+
+
/* Transform a blockgroup's free record into a fsmap */
static int ext4_getfsmap_datadev_helper(struct super_block *sb,
ext4_group_t agno, ext4_grpblk_t start,
@@ -277,7 +334,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
/* Fabricate an rmap entry for the external log device. */
irec.fmr_physical = journal->j_blk_offset;
- irec.fmr_length = journal->j_maxlen;
+ irec.fmr_length = journal->j_total_len;
irec.fmr_owner = EXT4_FMR_OWN_LOG;
irec.fmr_flags = 0;
@@ -340,6 +397,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Reserved GDT blocks */
if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+
+ /*
+ * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases,
+ * check for that.
+ */
+ if (!len)
+ return 0;
+
error = ext4_getfsmap_fill(meta_list, fsb, len,
EXT4_FMR_OWN_RESV_GDT);
if (error)
@@ -351,8 +416,8 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Compare two fsmap items. */
static int ext4_getfsmap_compare(void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct ext4_fsmap *fa;
struct ext4_fsmap *fb;
@@ -473,6 +538,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_group_t end_ag;
ext4_grpblk_t first_cluster;
ext4_grpblk_t last_cluster;
+ struct ext4_fsmap irec;
int error = 0;
bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -483,6 +549,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
keys[0].fmr_physical = bofs;
if (keys[1].fmr_physical >= eofs)
keys[1].fmr_physical = eofs - 1;
+ if (keys[1].fmr_physical < keys[0].fmr_physical)
+ return 0;
start_fsb = keys[0].fmr_physical;
end_fsb = keys[1].fmr_physical;
@@ -534,6 +602,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
error = ext4_mballoc_query_range(sb, info->gfi_agno,
EXT4_B2C(sbi, info->gfi_low.fmr_physical),
EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+ ext4_getfsmap_meta_helper,
ext4_getfsmap_datadev_helper, info);
if (error)
goto err;
@@ -553,9 +622,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
goto err;
}
- /* Report any gaps at the end of the bg */
+ /*
+ * The dummy record below will cause ext4_getfsmap_helper() to report
+ * any allocated blocks at the end of the range.
+ */
+ irec.fmr_device = 0;
+ irec.fmr_physical = end_fsb + 1;
+ irec.fmr_length = 0;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info);
+ error = ext4_getfsmap_helper(sb, info, &irec);
if (error)
goto err;
@@ -571,8 +649,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev_file &&
+ fm->fmr_device ==
+ new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev))
return true;
return false;
}
@@ -642,9 +721,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev_file) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->journal_bdev->bd_dev);
+ file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h
index 68c8001fee85..ac642be2302e 100644
--- a/fs/ext4/fsmap.h
+++ b/fs/ext4/fsmap.h
@@ -50,7 +50,7 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */
#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */
-#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */
-#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */
+#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */
+#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */
#endif /* __EXT4_FSMAP_H__ */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1d668c8f131f..e476c6de3074 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -78,21 +79,13 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
-static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
- bool *needs_barrier)
+static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
+ int datasync, bool *needs_barrier)
{
- int ret, err;
-
- ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY_ALL))
- return ret;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- return ret;
-
- err = sync_inode_metadata(inode, 1);
- if (!ret)
- ret = err;
+ struct inode *inode = file->f_inode;
+ int ret;
+ ret = generic_buffers_fsync_noflush(file, start, end, datasync);
if (!ret)
ret = ext4_sync_parent(inode);
if (test_opt(inode->i_sb, BARRIER))
@@ -108,11 +101,18 @@ static int ext4_fsync_journal(struct inode *inode, bool datasync,
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ /*
+ * Fastcommit does not really support fsync on directories or other
+ * special files. Force a full commit.
+ */
+ if (!S_ISREG(inode->i_mode))
+ return ext4_force_commit(inode->i_sb);
+
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
*needs_barrier = true;
- return jbd2_complete_transaction(journal, commit_tid);
+ return ext4_fc_commit(journal, commit_tid);
}
/*
@@ -131,50 +131,40 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
bool needs_barrier = false;
struct inode *inode = file->f_mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
- J_ASSERT(ext4_journal_current_handle() == NULL);
+ ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
- if (sb_rdonly(inode->i_sb)) {
- /* Make sure that we read updated s_mount_flags value */
- smp_rmb();
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
- ret = -EROFS;
+ if (sb_rdonly(inode->i_sb))
+ goto out;
+
+ if (!EXT4_SB(inode->i_sb)->s_journal) {
+ ret = ext4_fsync_nojournal(file, start, end, datasync,
+ &needs_barrier);
+ if (needs_barrier)
+ goto issue_flush;
goto out;
}
ret = file_write_and_wait_range(file, start, end);
if (ret)
- return ret;
+ goto out;
/*
- * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
* Metadata is in the journal, we wait for proper transaction to
* commit here.
- *
- * data=journal:
- * filemap_fdatawrite won't do anything (the buffers are clean).
- * ext4_force_commit will write the file data into the journal and
- * will wait on that.
- * filemap_fdatawait() will encounter a ton of newly-dirtied pages
- * (they were dirtied by commit). But that's OK - the blocks are
- * safe in-journal, which is all fsync() needs to ensure.
*/
- if (!sbi->s_journal)
- ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
- else if (ext4_should_journal_data(inode))
- ret = ext4_force_commit(inode->i_sb);
- else
- ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
+ ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
+issue_flush:
if (needs_barrier) {
- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!ret)
ret = err;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3e133793a5a3..48483cd015d3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -197,7 +197,7 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
* represented, and whether or not the returned hash is 32 bits or 64
* bits. 32 bit hashes will return 0 for the minor hash.
*/
-static int __ext4fs_dirhash(const char *name, int len,
+static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
__u32 hash;
@@ -233,7 +233,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -247,7 +247,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
@@ -259,9 +259,29 @@ static int __ext4fs_dirhash(const char *name, int len,
hash = buf[0];
minor_hash = buf[1];
break;
+ case DX_HASH_SIPHASH:
+ {
+ struct qstr qname = QSTR_INIT(name, len);
+ __u64 combined_hash;
+
+ if (fscrypt_has_encryption_key(dir)) {
+ combined_hash = fscrypt_fname_siphash(dir, &qname);
+ } else {
+ ext4_warning_inode(dir, "Siphash requires key");
+ return -EINVAL;
+ }
+
+ hash = (__u32)(combined_hash >> 32);
+ minor_hash = (__u32)combined_hash;
+ break;
+ }
default:
hinfo->hash = 0;
- return -1;
+ hinfo->minor_hash = 0;
+ ext4_warning(dir->i_sb,
+ "invalid/unsupported hash tree version %u",
+ hinfo->hash_version);
+ return -EINVAL;
}
hash = hash & ~1;
if (hash == (EXT4_HTREE_EOF_32BIT << 1))
@@ -274,14 +294,15 @@ static int __ext4fs_dirhash(const char *name, int len,
int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
-#ifdef CONFIG_UNICODE
- const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding;
+#if IS_ENABLED(CONFIG_UNICODE)
+ const struct unicode_map *um = dir->i_sb->s_encoding;
int r, dlen;
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
- if (len && IS_CASEFOLDED(dir) && um) {
- buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
+ if (len && IS_CASEFOLDED(dir) &&
+ (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
+ buff = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
@@ -291,12 +312,12 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
goto opaque_seq;
}
- r = __ext4fs_dirhash(buff, dlen, hinfo);
+ r = __ext4fs_dirhash(dir, buff, dlen, hinfo);
kfree(buff);
return r;
}
opaque_seq:
#endif
- return __ext4fs_dirhash(name, len, hinfo);
+ return __ext4fs_dirhash(dir, name, len, hinfo);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 54d324e80fe5..b20a1bf866ab 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -82,19 +82,23 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
if (buffer_verified(bh))
return 0;
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+
+ grp = ext4_get_group_info(sb, block_group);
+ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
if (buffer_verified(bh))
goto verified;
blk = ext4_inode_bitmap(sb, desc);
- if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8) ||
+ if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) ||
ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
@@ -189,11 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
- wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
+ ext4_read_bh(bh, REQ_META | REQ_PRIO,
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO));
if (!buffer_uptodate(bh)) {
put_bh(bh);
ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
@@ -250,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
"nonexistent device\n", __func__, __LINE__);
return;
}
- if (atomic_read(&inode->i_count) > 1) {
+ if (icount_read(inode) > 1) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
__func__, __LINE__, inode->i_ino,
- atomic_read(&inode->i_count));
+ icount_read(inode));
return;
}
if (inode->i_nlink) {
@@ -284,19 +286,22 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
- grp = ext4_get_group_info(sb, block_group);
if (IS_ERR(bitmap_bh)) {
fatal = PTR_ERR(bitmap_bh);
bitmap_bh = NULL;
goto error_return;
}
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
- fatal = -EFSCORRUPTED;
- goto error_return;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, block_group);
+ if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ fatal = -EFSCORRUPTED;
+ goto error_return;
+ }
}
BUFFER_TRACE(bitmap_bh, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, bitmap_bh);
+ fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (fatal)
goto error_return;
@@ -304,7 +309,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
gdp = ext4_get_group_desc(sb, block_group, &bh2);
if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, bh2);
+ fatal = ext4_journal_get_write_access(handle, sb, bh2,
+ EXT4_JTR_NONE);
}
ext4_lock_group(sb, block_group);
cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
@@ -318,14 +324,15 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (is_directory) {
count = ext4_used_dirs_count(sb, gdp) - 1;
ext4_used_dirs_set(sb, gdp, count);
- percpu_counter_dec(&sbi->s_dirs_counter);
+ if (percpu_counter_initialized(&sbi->s_dirs_counter))
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
- ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
if (sbi->s_log_groups_per_flex) {
struct flex_groups *fg;
@@ -396,7 +403,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*
* We always try to spread first-level directories.
*
- * If there are blockgroups with both free inodes and free blocks counts
+ * If there are blockgroups with both free inodes and free clusters counts
* not worse than average we return one with smallest directory count.
* Otherwise we simply return a random group.
*
@@ -405,7 +412,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
* It's OK to put directory into a group unless
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks) or
+ * it has too few free clusters left (min_clusters) or
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
@@ -421,7 +428,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei, grp_free;
- ext4_fsblk_t freeb, avefreec;
+ ext4_fsblk_t freec, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
ext4_grpblk_t min_clusters;
@@ -440,9 +447,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
- freeb = EXT4_C2B(sbi,
- percpu_counter_read_positive(&sbi->s_freeclusters_counter));
- avefreec = freeb;
+ freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ avefreec = freec;
do_div(avefreec, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
@@ -456,10 +462,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
hinfo.hash_version = DX_HASH_HALF_MD4;
hinfo.seed = sbi->s_hash_seed;
ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
- grp = hinfo.hash;
+ parent_group = hinfo.hash % ngroups;
} else
- grp = prandom_u32();
- parent_group = (unsigned)grp % ngroups;
+ parent_group = get_random_u32_below(ngroups);
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
get_orlov_stats(sb, g, flex_size, &stats);
@@ -503,11 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
goto fallback;
}
- max_dirs = ndirs / ngroups + inodes_per_group / 16;
+ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
+ if (min_clusters < 0)
+ min_clusters = 0;
/*
* Start looking in the flex group where we last allocated an
@@ -684,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
if (!bh || !buffer_uptodate(bh))
/*
* If the block is not in the buffer cache, then it
- * must have been written out.
+ * must have been written out, or, most unlikely, is
+ * being migrated - false failure should be OK here.
*/
goto out;
@@ -742,6 +750,168 @@ not_found:
return 1;
}
+int ext4_mark_inode_used(struct super_block *sb, int ino)
+{
+ unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+ struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
+ struct ext4_group_desc *gdp;
+ ext4_group_t group;
+ int bit;
+ int err;
+
+ if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+ return -EFSCORRUPTED;
+
+ group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+ if (IS_ERR(inode_bitmap_bh))
+ return PTR_ERR(inode_bitmap_bh);
+
+ if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
+ err = 0;
+ goto out;
+ }
+
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+ if (!gdp) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ext4_set_bit(bit, inode_bitmap_bh->b_data);
+
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ err = sync_dirty_buffer(inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
+ /* We may have to initialize the block bitmap if it isn't already */
+ if (ext4_has_group_desc_csum(sb) &&
+ gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ struct buffer_head *block_bitmap_bh;
+
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(block_bitmap_bh)) {
+ err = PTR_ERR(block_bitmap_bh);
+ goto out;
+ }
+
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
+ sync_dirty_buffer(block_bitmap_bh);
+
+ /* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+ ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
+
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ }
+
+ /* Update the relevant bg descriptor fields */
+ if (ext4_has_group_desc_csum(sb)) {
+ int free;
+
+ ext4_lock_group(sb, group); /* while we modify the bg desc */
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ */
+ if (bit >= free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - bit - 1));
+ } else {
+ ext4_lock_group(sb, group);
+ }
+
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (ext4_has_group_desc_csum(sb)) {
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+
+ ext4_unlock_group(sb, group);
+ err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
+ sync_dirty_buffer(group_desc_bh);
+out:
+ brelse(inode_bitmap_bh);
+ return err;
+}
+
+static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
+ bool encrypt)
+{
+ struct super_block *sb = dir->i_sb;
+ int nblocks = 0;
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never more than 1k.
+ * In practice they are under 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */,
+ NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ return nblocks;
+}
+
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
@@ -752,7 +922,8 @@ not_found:
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
+ handle_t *handle, struct inode *dir,
umode_t mode, const struct qstr *qstr,
__u32 goal, uid_t *owner, __u32 i_flags,
int handle_type, unsigned int line_no,
@@ -771,8 +942,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
- struct ext4_group_info *grp;
- int encrypt = 0;
+ struct ext4_group_info *grp = NULL;
+ bool encrypt = false;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -781,61 +952,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
sb = dir->i_sb;
sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
- return ERR_PTR(-EIO);
-
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
- !(i_flags & EXT4_EA_INODE_FL)) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return ERR_PTR(err);
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-ENOKEY);
- encrypt = 1;
- }
-
- if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
-
- if (IS_ERR(p))
- return ERR_CAST(p);
- if (p) {
- int acl_size = p->a_count * sizeof(ext4_acl_entry);
-
- nblocks += (S_ISDIR(mode) ? 2 : 1) *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, acl_size,
- true /* is_create */);
- posix_acl_release(p);
- }
-#endif
-
-#ifdef CONFIG_SECURITY
- {
- int num_security_xattrs = 1;
-
-#ifdef CONFIG_INTEGRITY
- num_security_xattrs++;
-#endif
- /*
- * We assume that security xattrs are never
- * more than 1k. In practice they are under
- * 128 bytes.
- */
- nblocks += num_security_xattrs *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, 1024,
- true /* is_create */);
- }
-#endif
- if (encrypt)
- nblocks += __ext4_xattr_set_credits(sb,
- NULL /* inode */, NULL /* block_bh */,
- FSCRYPT_SET_CONTEXT_MAX_SIZE,
- true /* is_create */);
- }
+ ret2 = ext4_emergency_state(sb);
+ if (unlikely(ret2))
+ return ERR_PTR(ret2);
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@@ -855,10 +974,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
- inode->i_uid = current_fsuid();
+ inode_fsuid_set(inode, idmap);
inode->i_gid = dir->i_gid;
} else
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(idmap, inode, dir, mode);
if (ext4_has_feature_project(sb) &&
ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
@@ -866,10 +985,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
else
ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+ if (!(i_flags & EXT4_EA_INODE_FL)) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto out;
+ }
+
err = dquot_initialize(inode);
if (err)
goto out;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+ ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
+ if (ret2 < 0) {
+ err = ret2;
+ goto out;
+ }
+ nblocks += ret2;
+ }
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -909,21 +1043,27 @@ got_group:
if (ext4_free_inodes_count(sb, gdp) == 0)
goto next_group;
- grp = ext4_get_group_info(sb, group);
- /* Skip groups with already-known suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
- goto next_group;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ /*
+ * Skip groups with already-known suspicious inode
+ * tables
+ */
+ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ goto next_group;
+ }
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
- IS_ERR(inode_bitmap_bh)) {
+ if (IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
}
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY) &&
+ EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ goto next_group;
-repeat_in_this_group:
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (!ret2)
goto next_group;
@@ -936,10 +1076,10 @@ repeat_in_this_group:
goto next_group;
}
- if (!handle) {
+ if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
- handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks, 0,
+ handle = __ext4_journal_start_sb(NULL, dir->i_sb,
+ line_no, handle_type, nblocks, 0,
ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -948,7 +1088,8 @@ repeat_in_this_group:
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
@@ -972,8 +1113,6 @@ repeat_in_this_group:
if (!ret2)
goto got; /* we grabbed the inode! */
- if (ino < EXT4_INODES_PER_GROUP(sb))
- goto repeat_in_this_group;
next_group:
if (++group == ngroups)
group = 0;
@@ -990,7 +1129,8 @@ got:
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, group_desc_bh);
+ err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
@@ -1007,7 +1147,8 @@ got:
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
- err = ext4_journal_get_write_access(handle, block_bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
+ EXT4_JTR_NONE);
if (err) {
brelse(block_bitmap_bh);
ext4_std_error(sb, err);
@@ -1024,8 +1165,7 @@ got:
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_group_clusters_set(sb, gdp,
ext4_free_clusters_after_init(sb, group, gdp));
- ext4_block_bitmap_csum_set(sb, group, gdp,
- block_bitmap_bh);
+ ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
@@ -1040,9 +1180,19 @@ got:
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_group_info *grp = NULL;
- down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ if (!grp) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+ down_read(&grp->alloc_sem); /*
+ * protect vs itable
+ * lazyinit
+ */
+ }
ext4_lock_group(sb, group); /* while we modify the bg desc */
free = EXT4_INODES_PER_GROUP(sb) -
ext4_itable_unused_count(sb, gdp);
@@ -1058,7 +1208,8 @@ got:
if (ino > free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - ino));
- up_read(&grp->alloc_sem);
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
+ up_read(&grp->alloc_sem);
} else {
ext4_lock_group(sb, group);
}
@@ -1074,8 +1225,7 @@ got:
}
}
if (ext4_has_group_desc_csum(sb)) {
- ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
@@ -1100,8 +1250,8 @@ got:
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- ei->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ ei->i_crtime = inode_get_mtime(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
@@ -1116,7 +1266,7 @@ got:
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
@@ -1131,25 +1281,24 @@ got:
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto out;
}
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
- ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
- if (ext4_has_feature_inline_data(sb))
+ if (ext4_has_feature_inline_data(sb) &&
+ (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1162,7 +1311,7 @@ got:
* prevent its deduplication.
*/
if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
+ err = fscrypt_set_context(inode, handle);
if (err)
goto fail_free_drop;
}
@@ -1185,10 +1334,9 @@ got:
}
}
- if (ext4_handle_valid(handle)) {
- ei->i_sync_tid = handle->h_transaction->t_tid;
- ei->i_datasync_tid = handle->h_transaction->t_tid;
- }
+ ext4_set_inode_mapping_order(inode);
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
@@ -1370,15 +1518,10 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
handle_t *handle;
ext4_fsblk_t blk;
int num, ret = 0, used_blks = 0;
-
- /* This should not happen, but just to be sure check this */
- if (sb_rdonly(sb)) {
- ret = 1;
- goto out;
- }
+ unsigned long used_inos = 0;
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
- if (!gdp)
+ if (!gdp || !grp)
goto out;
/*
@@ -1400,30 +1543,45 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
* used inodes so we need to skip blocks with used inodes in
* inode table.
*/
- if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
- used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)),
- sbi->s_inodes_per_block);
-
- if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) ||
- ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)) <
- EXT4_FIRST_INO(sb)))) {
- ext4_error(sb, "Something is wrong with group %u: "
- "used itable blocks: %d; "
- "itable unused count: %u",
- group, used_blks,
- ext4_itable_unused_count(sb, gdp));
- ret = 1;
- goto err_out;
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
+ used_inos = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);
+
+ /* Bogus inode unused count? */
+ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
+ group, used_blks,
+ ext4_itable_unused_count(sb, gdp));
+ ret = 1;
+ goto err_out;
+ }
+
+ used_inos += group * EXT4_INODES_PER_GROUP(sb);
+ /*
+ * Are there some uninitialized inodes in the inode table
+ * before the first normal inode?
+ */
+ if ((used_blks != sbi->s_itb_per_group) &&
+ (used_inos < EXT4_FIRST_INO(sb))) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "itable unused count: %u; "
+ "itables initialized count: %ld",
+ group, ext4_itable_unused_count(sb, gdp),
+ used_inos);
+ ret = 1;
+ goto err_out;
+ }
}
blk = ext4_inode_table(sb, gdp) + used_blks;
num = sbi->s_itb_per_group - used_blks;
BUFFER_TRACE(group_desc_bh, "get_write_access");
- ret = ext4_journal_get_write_access(handle,
- group_desc_bh);
+ ret = ext4_journal_get_write_access(handle, sb, group_desc_bh,
+ EXT4_JTR_NONE);
if (ret)
goto err_out;
@@ -1441,7 +1599,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (ret < 0)
goto err_out;
if (barrier)
- blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
+ blkdev_issue_flush(sb->s_bdev);
skip_zeroout:
ext4_lock_group(sb, group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index be2b66eb65f7..da76353b3a57 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ unsigned int key;
int ret = -EIO;
*err = 0;
@@ -156,14 +157,20 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_getblk(sb, le32_to_cpu(p->key));
+ key = le32_to_cpu(p->key);
+ if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) {
+ /* the block was out of range */
+ ret = -EFSCORRUPTED;
+ goto failure;
+ }
+ bh = sb_getblk(sb, key);
if (unlikely(!bh)) {
ret = -ENOMEM;
goto failure;
}
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
put_bh(bh);
goto failure;
}
@@ -354,7 +361,8 @@ static int ext4_alloc_branch(handle_t *handle,
}
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, ar->inode->i_sb,
+ bh, EXT4_JTR_NONE);
if (err) {
unlock_buffer(bh);
goto failed;
@@ -429,7 +437,8 @@ static int ext4_splice_branch(handle_t *handle,
*/
if (where->bh) {
BUFFER_TRACE(where->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, where->bh);
+ err = ext4_journal_get_write_access(handle, ar->inode->i_sb,
+ where->bh, EXT4_JTR_NONE);
if (err)
goto err_out;
}
@@ -458,7 +467,7 @@ static int ext4_splice_branch(handle_t *handle,
* the new i_size. But that is not done here - it is done in
* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
*/
- jbd_debug(5, "splicing indirect only\n");
+ ext4_debug("splicing indirect only\n");
BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
if (err)
@@ -470,7 +479,7 @@ static int ext4_splice_branch(handle_t *handle,
err = ext4_mark_inode_dirty(handle, ar->inode);
if (unlikely(err))
goto err_out;
- jbd_debug(5, "splicing direct\n");
+ ext4_debug("splicing direct\n");
}
return err;
@@ -530,12 +539,12 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
- int count = 0;
+ u64 count = 0;
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
- J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
- J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+ ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+ ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
depth = ext4_block_to_path(inode, map->m_lblk, offsets,
&blocks_to_boundary);
@@ -579,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, count);
+ map->m_len = umin(map->m_len, count);
goto cleanup;
}
@@ -593,7 +602,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
if (ext4_has_feature_bigalloc(inode->i_sb)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto out;
}
/* Set up for the direct block allocation */
@@ -641,6 +651,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len;
+
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
@@ -693,7 +704,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode);
@@ -704,7 +715,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
/*
* Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
+ * be able to restart the transaction at a convenient checkpoint to make
* sure we don't overflow the journal.
*
* Try to extend this transaction for the purposes of truncation. If
@@ -727,7 +738,8 @@ static int ext4_ind_truncate_ensure_credits(handle_t *handle,
return ret;
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(ret))
return ret;
}
@@ -858,8 +870,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
else if (ext4_should_journal_data(inode))
flags |= EXT4_FREE_BLOCKS_FORGET;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
- count)) {
+ if (!ext4_inode_block_valid(inode, block_to_free, count)) {
EXT4_ERROR_INODE(inode, "attempt to clear invalid "
"blocks %llu len %lu",
(unsigned long long) block_to_free, count);
@@ -916,7 +927,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) { /* For indirect block */
BUFFER_TRACE(this_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, this_bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ this_bh, EXT4_JTR_NONE);
/* Important: if we can't update the indirect pointers
* to the blocks, we can't free them. */
if (err)
@@ -1004,8 +1016,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!nr)
continue; /* A hole */
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- nr, 1)) {
+ if (!ext4_inode_block_valid(inode, nr, 1)) {
EXT4_ERROR_INODE(inode,
"invalid indirect mapped "
"block %lu (level %d)",
@@ -1014,14 +1025,14 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
/* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ bh = ext4_sb_bread_nofail(inode->i_sb, nr);
/*
* A read failure? Report error and clear slot
* (should be rare).
*/
- if (!bh) {
- ext4_error_inode_block(inode, nr, EIO,
+ if (IS_ERR(bh)) {
+ ext4_error_inode_block(inode, nr, -PTR_ERR(bh),
"Read failure");
continue;
}
@@ -1035,7 +1046,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
brelse(bh);
/*
- * Everything below this this pointer has been
+ * Everything below this pointer has been
* released. Now let this top-of-subtree go.
*
* We want the freeing of this indirect block to be
@@ -1080,7 +1091,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
BUFFER_TRACE(parent_bh, "get_write_access");
if (!ext4_journal_get_write_access(handle,
- parent_bh)){
+ inode->i_sb, parent_bh,
+ EXT4_JTR_NONE)) {
*p = 0;
BUFFER_TRACE(parent_bh,
"call ext4_handle_dirty_metadata");
@@ -1182,21 +1194,21 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
@@ -1436,7 +1448,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
if (++n >= n2)
break;
@@ -1445,7 +1457,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
if (++n >= n2)
break;
@@ -1454,7 +1466,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c3a1ad2db122..1f6bc05593df 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -6,7 +6,9 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/namei.h>
#include <linux/iversion.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -18,6 +20,11 @@
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
+
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ void **fsdata);
+
static int ext4_get_inline_size(struct inode *inode)
{
if (EXT4_I(inode)->i_inline_off)
@@ -32,8 +39,12 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
struct ext4_inode *raw_inode;
+ void *end;
int free, min_offs;
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return 0;
+
min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
EXT4_GOOD_OLD_INODE_SIZE -
EXT4_I(inode)->i_extra_isize -
@@ -52,14 +63,23 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
+ end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
/* Compute min_offs. */
- for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ while (!IS_LAST_ENTRY(entry)) {
+ void *next = EXT4_XATTR_NEXT(entry);
+
+ if (next >= end) {
+ EXT4_ERROR_INODE(inode,
+ "corrupt xattr in inline inode");
+ return 0;
+ }
if (!entry->e_value_inum && entry->e_value_size) {
size_t offs = le16_to_cpu(entry->e_value_offs);
if (offs < min_offs)
min_offs = offs;
}
+ entry = next;
}
free = min_offs -
((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
@@ -154,7 +174,6 @@ int ext4_find_inline_data_nolock(struct inode *inode)
(void *)ext4_raw_inode(&is.iloc));
EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
le32_to_cpu(is.s.here->e_value_size);
- ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
out:
brelse(is.iloc.bh);
@@ -175,8 +194,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer,
BUG_ON(len > EXT4_I(inode)->i_inline_size);
- cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
- len : EXT4_MIN_INLINE_DATA_SIZE;
+ cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE);
raw_inode = ext4_raw_inode(iloc);
memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
@@ -204,7 +222,7 @@ out:
/*
* write the buffer to the inline inode.
* If 'create' is set, we don't need to do the extra copy in the xattr
- * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * value since it is already handled by ext4_xattr_ibody_set.
* That saves us one memcpy.
*/
static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
@@ -215,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
struct ext4_inode *raw_inode;
int cp_len = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
BUG_ON(!EXT4_I(inode)->i_inline_off);
@@ -264,7 +282,8 @@ static int ext4_create_inline_data(handle_t *handle,
return error;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
@@ -276,7 +295,7 @@ static int ext4_create_inline_data(handle_t *handle,
len = 0;
}
- /* Insert the the xttr entry. */
+ /* Insert the xttr entry. */
i.value = value;
i.value_len = len;
@@ -284,9 +303,13 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
goto out;
- BUG_ON(!is.s.not_found);
+ if (!is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
if (error == -ENOSPC)
ext4_clear_inode_state(inode,
@@ -335,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error)
goto out;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "missing inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
@@ -346,19 +373,20 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
value, len);
- if (error == -ENODATA)
+ if (error < 0)
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
- /* Update the xttr entry. */
+ /* Update the xattr entry. */
i.value = value;
i.value_len = len;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -377,7 +405,7 @@ out:
}
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+ loff_t len)
{
int ret, size, no_expand;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -390,7 +418,12 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
return -ENOSPC;
ext4_write_lock_xattr(inode, &no_expand);
-
+ /*
+ * ei->i_inline_size may have changed since the initial check
+ * if other xattrs were added. Recalculate to ensure
+ * ext4_update_inline_data() validates against current capacity.
+ */
+ (void) ext4_find_inline_data_nolock(inode);
if (ei->i_inline_off)
ret = ext4_update_inline_data(handle, inode, len);
else
@@ -418,20 +451,25 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
if (!ei->i_inline_off)
return 0;
+ down_write(&ei->i_data_sem);
+
error = ext4_get_inode_loc(inode, &is.iloc);
- if (error)
+ if (error) {
+ up_write(&ei->i_data_sem);
return error;
+ }
error = ext4_xattr_ibody_find(inode, &i, &is);
if (error)
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -458,19 +496,20 @@ out:
brelse(is.iloc.bh);
if (error == -ENODATA)
error = 0;
+ up_write(&ei->i_data_sem);
return error;
}
-static int ext4_read_inline_page(struct inode *inode, struct page *page)
+static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
{
void *kaddr;
int ret = 0;
size_t len;
struct ext4_iloc iloc;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
BUG_ON(!ext4_has_inline_data(inode));
- BUG_ON(page->index);
+ BUG_ON(folio->index);
if (!EXT4_I(inode)->i_inline_off) {
ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
@@ -483,19 +522,19 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page)
goto out;
len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
- kaddr = kmap_atomic(page);
+ BUG_ON(len > PAGE_SIZE);
+ kaddr = kmap_local_folio(folio, 0);
ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- zero_user_segment(page, len, PAGE_SIZE);
- SetPageUptodate(page);
+ kaddr = folio_zero_tail(folio, len, kaddr + len);
+ kunmap_local(kaddr);
+ folio_mark_uptodate(folio);
brelse(iloc.bh);
out:
return ret;
}
-int ext4_readpage_inline(struct inode *inode, struct page *page)
+int ext4_readpage_inline(struct inode *inode, struct folio *folio)
{
int ret = 0;
@@ -509,27 +548,26 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
* Current inline data can only exist in the 1st page,
* So for all the other pages, just set them uptodate.
*/
- if (!page->index)
- ret = ext4_read_inline_page(inode, page);
- else if (!PageUptodate(page)) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ if (!folio->index)
+ ret = ext4_read_inline_folio(inode, folio);
+ else if (!folio_test_uptodate(folio)) {
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
}
up_read(&EXT4_I(inode)->xattr_sem);
- unlock_page(page);
+ folio_unlock(folio);
return ret >= 0 ? 0 : ret;
}
static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
- struct inode *inode,
- unsigned flags)
+ struct inode *inode)
{
int ret, needed_blocks, no_expand;
handle_t *handle = NULL;
int retries = 0, sem_held = 0;
- struct page *page = NULL;
+ struct folio *folio = NULL;
unsigned from, to;
struct ext4_iloc iloc;
@@ -542,7 +580,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -558,12 +596,11 @@ retry:
/* We cannot recurse into the filesystem as the transaction is already
* started */
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
- if (!page) {
- ret = -ENOMEM;
- goto out;
+ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out_nofolio;
}
ext4_write_lock_xattr(inode, &no_expand);
@@ -576,32 +613,35 @@ retry:
from = 0;
to = ext4_get_inline_size(inode);
- if (!PageUptodate(page)) {
- ret = ext4_read_inline_page(inode, page);
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_read_inline_folio(inode, folio);
if (ret < 0)
goto out;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_destroy_inline_data_nolock(handle, inode);
if (ret)
goto out;
if (ext4_should_dioread_nolock(inode)) {
- ret = __block_write_begin(page, from, to,
- ext4_get_block_unwritten);
+ ret = ext4_block_write_begin(handle, folio, from, to,
+ ext4_get_block_unwritten);
} else
- ret = __block_write_begin(page, from, to, ext4_get_block);
+ ret = ext4_block_write_begin(handle, folio, from, to,
+ ext4_get_block);
+ clear_buffer_new(folio_buffers(folio));
if (!ret && ext4_should_journal_data(inode)) {
- ret = ext4_walk_page_buffers(handle, page_buffers(page),
- from, to, NULL,
- do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, inode,
+ folio_buffers(folio), from, to,
+ NULL, do_journal_get_write_access);
}
if (ret) {
- unlock_page(page);
- put_page(page);
- page = NULL;
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
ext4_orphan_add(handle, inode);
ext4_write_unlock_xattr(inode, &no_expand);
sem_held = 0;
@@ -621,13 +661,14 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
- if (page)
- block_commit_write(page, from, to);
+ if (folio)
+ block_commit_write(folio, from, to);
out:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
+out_nofolio:
if (sem_held)
ext4_write_unlock_xattr(inode, &no_expand);
if (handle)
@@ -637,155 +678,191 @@ out:
}
/*
- * Try to write data in the inode.
- * If the inode has inline data, check whether the new write can be
- * in the inode also. If not, create the page the handle, move the data
- * to the page make it update and let the later codes create extent for it.
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
*/
-int ext4_try_to_write_inline_data(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep)
+int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da)
{
int ret;
handle_t *handle;
- struct page *page;
+ struct folio *folio;
struct ext4_iloc iloc;
-
- if (pos + len > ext4_get_max_inline_size(inode))
- goto convert;
+ int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
- /*
- * The possible write could happen in the inode,
- * so try to reserve the space in inode first.
- */
+retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- handle = NULL;
- goto out;
+ goto out_release_bh;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
- goto out;
+ goto out_stop_journal;
- /* We don't have space in inline inode, so convert it to extent. */
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
- brelse(iloc.bh);
- goto convert;
- }
-
- ret = ext4_journal_get_write_access(handle, iloc.bh);
- if (ret)
- goto out;
+ if (!da) {
+ brelse(iloc.bh);
+ /* Retry inside */
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ }
- flags |= AOP_FLAG_NOFS;
+ ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ goto out_release_bh;
+ }
- page = grab_cache_page_write_begin(mapping, 0, flags);
- if (!page) {
- ret = -ENOMEM;
- goto out;
+ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out_stop_journal;
}
- *pagep = page;
down_read(&EXT4_I(inode)->xattr_sem);
+ /* Someone else had converted it to extent */
if (!ext4_has_inline_data(inode)) {
ret = 0;
- unlock_page(page);
- put_page(page);
- goto out_up_read;
+ goto out_release_folio;
}
- if (!PageUptodate(page)) {
- ret = ext4_read_inline_page(inode, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto out_up_read;
- }
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_read_inline_folio(inode, folio);
+ if (ret < 0)
+ goto out_release_folio;
}
- ret = 1;
- handle = NULL;
-out_up_read:
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE);
+ if (ret)
+ goto out_release_folio;
+ *foliop = folio;
up_read(&EXT4_I(inode)->xattr_sem);
-out:
- if (handle && (ret != 1))
- ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return 1;
+
+out_release_folio:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ folio_unlock(folio);
+ folio_put(folio);
+out_stop_journal:
+ ext4_journal_stop(handle);
+out_release_bh:
brelse(iloc.bh);
return ret;
-convert:
- return ext4_convert_inline_data_to_extent(mapping,
- inode, flags);
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page the handle, move the data
+ * to the page make it update and let the later codes create extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop)
+{
+ if (pos + len > ext4_get_max_inline_size(inode))
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ return ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, NULL, false);
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page)
+ unsigned copied, struct folio *folio)
{
- int ret, no_expand;
+ handle_t *handle = ext4_journal_current_handle();
+ int no_expand;
void *kaddr;
struct ext4_iloc iloc;
+ int ret = 0, ret2;
- if (unlikely(copied < len)) {
- if (!PageUptodate(page)) {
- copied = 0;
- goto out;
- }
- }
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret) {
- ext4_std_error(inode->i_sb, ret);
+ if (unlikely(copied < len) && !folio_test_uptodate(folio))
copied = 0;
- goto out;
- }
- ext4_write_lock_xattr(inode, &no_expand);
- BUG_ON(!ext4_has_inline_data(inode));
+ if (likely(copied)) {
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ folio_unlock(folio);
+ folio_put(folio);
+ ext4_std_error(inode->i_sb, ret);
+ goto out;
+ }
+ ext4_write_lock_xattr(inode, &no_expand);
+ BUG_ON(!ext4_has_inline_data(inode));
- kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
- kunmap_atomic(kaddr);
- SetPageUptodate(page);
- /* clear page dirty so that writepages wouldn't work for us. */
- ClearPageDirty(page);
+ /*
+ * ei->i_inline_off may have changed since
+ * ext4_write_begin() called
+ * ext4_try_to_write_inline_data()
+ */
+ (void) ext4_find_inline_data_nolock(inode);
- ext4_write_unlock_xattr(inode, &no_expand);
- brelse(iloc.bh);
- mark_inode_dirty(inode);
-out:
- return copied;
-}
+ kaddr = kmap_local_folio(folio, 0);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
+ kunmap_local(kaddr);
+ folio_mark_uptodate(folio);
+ /* clear dirty flag so that writepages wouldn't work for us. */
+ folio_clear_dirty(folio);
-struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page)
-{
- int ret, no_expand;
- void *kaddr;
- struct ext4_iloc iloc;
+ ext4_write_unlock_xattr(inode, &no_expand);
+ brelse(iloc.bh);
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret) {
- ext4_std_error(inode->i_sb, ret);
- return NULL;
+ /*
+ * It's important to update i_size while still holding folio
+ * lock: page writeout could otherwise come in and zero
+ * beyond i_size.
+ */
+ ext4_update_inode_size(inode, pos + copied);
}
+ folio_unlock(folio);
+ folio_put(folio);
- ext4_write_lock_xattr(inode, &no_expand);
- kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
- kunmap_atomic(kaddr);
- ext4_write_unlock_xattr(inode, &no_expand);
+ /*
+ * Don't mark the inode dirty under folio lock. First, it unnecessarily
+ * makes the holding time of folio lock longer. Second, it forces lock
+ * ordering of folio lock and transaction start for journaling
+ * filesystems.
+ */
+ if (likely(copied))
+ mark_inode_dirty(inode);
+out:
+ /*
+ * If we didn't copy as much data as expected, we need to trim back
+ * size of xattr containing inline data.
+ */
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ ext4_orphan_add(handle, inode);
- return iloc.bh;
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+ return ret ? ret : copied;
}
/*
@@ -795,19 +872,19 @@ ext4_journalled_write_inline_data(struct inode *inode,
* clear the inode state safely.
* 2. The inode has inline data, then we need to read the data, make it
* update and dirty so that ext4_da_writepages can handle it. We don't
- * need to start the journal since the file's metatdata isn't changed now.
+ * need to start the journal since the file's metadata isn't changed now.
*/
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
struct inode *inode,
- unsigned flags,
void **fsdata)
{
int ret = 0, inline_size;
- struct page *page;
+ struct folio *folio;
- page = grab_cache_page_write_begin(mapping, 0, flags);
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
@@ -817,168 +894,37 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
inline_size = ext4_get_inline_size(inode);
- if (!PageUptodate(page)) {
- ret = ext4_read_inline_page(inode, page);
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_read_inline_folio(inode, folio);
if (ret < 0)
goto out;
}
- ret = __block_write_begin(page, 0, inline_size,
- ext4_da_get_block_prep);
+ ret = ext4_block_write_begin(NULL, folio, 0, inline_size,
+ ext4_da_get_block_prep);
if (ret) {
up_read(&EXT4_I(inode)->xattr_sem);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
ext4_truncate_failed_write(inode);
return ret;
}
- SetPageDirty(page);
- SetPageUptodate(page);
+ clear_buffer_new(folio_buffers(folio));
+ folio_mark_dirty(folio);
+ folio_mark_uptodate(folio);
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
*fsdata = (void *)CONVERT_INLINE_DATA;
out:
up_read(&EXT4_I(inode)->xattr_sem);
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
return ret;
}
-/*
- * Prepare the write for the inline data.
- * If the data can be written into the inode, we just read
- * the page and make it uptodate, and start the journal.
- * Otherwise read the page, makes it dirty so that it can be
- * handle in writepages(the i_disksize update is left to the
- * normal ext4_da_write_end).
- */
-int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata)
-{
- int ret, inline_size;
- handle_t *handle;
- struct page *page;
- struct ext4_iloc iloc;
- int retries = 0;
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret)
- return ret;
-
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
-
- inline_size = ext4_get_max_inline_size(inode);
-
- ret = -ENOSPC;
- if (inline_size >= pos + len) {
- ret = ext4_prepare_inline_data(handle, inode, pos + len);
- if (ret && ret != -ENOSPC)
- goto out_journal;
- }
-
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- flags |= AOP_FLAG_NOFS;
-
- if (ret == -ENOSPC) {
- ext4_journal_stop(handle);
- ret = ext4_da_convert_inline_data_to_extent(mapping,
- inode,
- flags,
- fsdata);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
- goto out;
- }
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
- if (!page) {
- ret = -ENOMEM;
- goto out_journal;
- }
-
- down_read(&EXT4_I(inode)->xattr_sem);
- if (!ext4_has_inline_data(inode)) {
- ret = 0;
- goto out_release_page;
- }
-
- if (!PageUptodate(page)) {
- ret = ext4_read_inline_page(inode, page);
- if (ret < 0)
- goto out_release_page;
- }
- ret = ext4_journal_get_write_access(handle, iloc.bh);
- if (ret)
- goto out_release_page;
-
- up_read(&EXT4_I(inode)->xattr_sem);
- *pagep = page;
- brelse(iloc.bh);
- return 1;
-out_release_page:
- up_read(&EXT4_I(inode)->xattr_sem);
- unlock_page(page);
- put_page(page);
-out_journal:
- ext4_journal_stop(handle);
-out:
- brelse(iloc.bh);
- return ret;
-}
-
-int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page)
-{
- int ret;
-
- ret = ext4_write_inline_data_end(inode, pos, len, copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ret;
- }
- copied = ret;
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
- *
- * But it's important to update i_size while still holding page lock:
- * page writeout could otherwise come in and zero beyond i_size.
- */
- if (pos+copied > inode->i_size)
- i_size_write(inode, pos+copied);
- unlock_page(page);
- put_page(page);
-
- /*
- * Don't mark the inode dirty under page lock. First, it unnecessarily
- * makes the holding time of page lock longer. Second, it forces lock
- * ordering of page lock and transaction start for journaling
- * filesystems.
- */
- mark_inode_dirty(inode);
-
- return copied;
-}
-
#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
void *inline_start, int inline_size)
@@ -1022,16 +968,17 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
int err;
struct ext4_dir_entry_2 *de;
- err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+ err = ext4_find_dest_de(dir, iloc->bh, inline_start,
inline_size, fname, &de);
if (err)
return err;
BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh,
+ EXT4_JTR_NONE);
if (err)
return err;
- ext4_insert_dentry(inode, de, inline_size, fname);
+ ext4_insert_dentry(dir, inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
@@ -1046,7 +993,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
@@ -1068,20 +1015,20 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode,
}
/* Set the final de to cover the whole block. */
-static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+void ext4_update_final_de(void *de_buf, int old_size, int new_size)
{
struct ext4_dir_entry_2 *de, *prev_de;
void *limit;
int de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
if (old_size) {
limit = de_buf + old_size;
do {
prev_de = de;
de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
de_buf += de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
} while (de_buf < limit);
prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
@@ -1100,7 +1047,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
int new_size = get_max_inline_xattr_value_size(dir, iloc);
- if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+ if (new_size - old_size <= ext4_dir_rec_len(1, NULL))
return -ENOSPC;
ret = ext4_update_inline_data(handle, dir,
@@ -1119,55 +1066,19 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc,
void *buf, int inline_size)
{
- ext4_create_inline_data(handle, inode, inline_size);
+ int ret;
+
+ ret = ext4_create_inline_data(handle, inode, inline_size);
+ if (ret) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ return;
+ }
ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
-static int ext4_finish_convert_inline_dir(handle_t *handle,
- struct inode *inode,
- struct buffer_head *dir_block,
- void *buf,
- int inline_size)
-{
- int err, csum_size = 0, header_size = 0;
- struct ext4_dir_entry_2 *de;
- void *target = dir_block->b_data;
-
- /*
- * First create "." and ".." and then copy the dir information
- * back to the block.
- */
- de = (struct ext4_dir_entry_2 *)target;
- de = ext4_init_dot_dotdot(inode, de,
- inode->i_sb->s_blocksize, csum_size,
- le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
- header_size = (void *)de - target;
-
- memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
- inline_size - EXT4_INLINE_DOTDOT_SIZE);
-
- if (ext4_has_metadata_csum(inode->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
- inode->i_size = inode->i_sb->s_blocksize;
- i_size_write(inode, inode->i_sb->s_blocksize);
- EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- ext4_update_final_de(dir_block->b_data,
- inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
- inode->i_sb->s_blocksize - csum_size);
-
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block,
- inode->i_sb->s_blocksize);
- set_buffer_uptodate(dir_block);
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
- if (err)
- return err;
- set_buffer_verified(dir_block);
- return ext4_mark_inode_dirty(handle, inode);
-}
-
static int ext4_convert_inline_data_nolock(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc)
@@ -1223,7 +1134,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
}
lock_buffer(data_bh);
- error = ext4_journal_get_create_access(handle, data_bh);
+ error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh,
+ EXT4_JTR_NONE);
if (error) {
unlock_buffer(data_bh);
error = -EIO;
@@ -1234,14 +1146,23 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
if (!S_ISDIR(inode->i_mode)) {
memcpy(data_bh->b_data, buf, inline_size);
set_buffer_uptodate(data_bh);
+ unlock_buffer(data_bh);
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
- error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
- buf, inline_size);
+ unlock_buffer(data_bh);
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+
+ error = ext4_init_dirblock(handle, inode, data_bh,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode),
+ buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+ if (!error)
+ error = ext4_mark_inode_dirty(handle, inode);
}
- unlock_buffer(data_bh);
out_restore:
if (error)
ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
@@ -1378,20 +1299,20 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
if (pos == 0) {
fake.inode = cpu_to_le32(inode->i_ino);
fake.name_len = 1;
- strcpy(fake.name, ".");
+ memcpy(fake.name, ".", 2);
fake.rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(fake.name_len),
- inline_size);
+ ext4_dir_rec_len(fake.name_len, NULL),
+ inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_OFFSET;
} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
fake.inode = cpu_to_le32(parent_ino);
fake.name_len = 2;
- strcpy(fake.name, "..");
+ memcpy(fake.name, "..", 3);
fake.rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(fake.name_len),
- inline_size);
+ ext4_dir_rec_len(fake.name_len, NULL),
+ inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_SIZE;
@@ -1406,7 +1327,16 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
}
}
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (ext4_hash_in_dirent(dir)) {
+ hinfo->hash = EXT4_DIRENT_HASH(de);
+ hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ }
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1451,6 +1381,7 @@ int ext4_read_inline_dir(struct file *file,
struct ext4_iloc iloc;
void *dir_buf = NULL;
int dotdot_offset, dotdot_size, extra_offset, extra_size;
+ struct dir_private_info *info = file->private_data;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1488,18 +1419,18 @@ int ext4_read_inline_dir(struct file *file,
* So we will use extra_offset and extra_size to indicate them
* during the inline dir iteration.
*/
- dotdot_offset = EXT4_DIR_REC_LEN(1);
- dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ dotdot_offset = ext4_dir_rec_len(1, NULL);
+ dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL);
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
extra_size = extra_offset + inline_size;
/*
- * If the version has changed since the last call to
+ * If the cookie has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the inline
* dir to make sure.
*/
- if (!inode_eq_iversion(inode, file->f_version)) {
+ if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < extra_size && i < offset;) {
/*
* "." is with offset 0 and
@@ -1524,14 +1455,14 @@ int ext4_read_inline_dir(struct file *file,
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len, extra_size)
- < EXT4_DIR_REC_LEN(1))
+ < ext4_dir_rec_len(1, NULL))
break;
i += ext4_rec_len_from_disk(de->rec_len,
extra_size);
}
offset = i;
ctx->pos = offset;
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < extra_size) {
@@ -1568,6 +1499,35 @@ out:
return ret;
}
+void *ext4_read_inline_link(struct inode *inode)
+{
+ struct ext4_iloc iloc;
+ int ret, inline_size;
+ void *link;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = -ENOMEM;
+ inline_size = ext4_get_inline_size(inode);
+ link = kmalloc(inline_size + 1, GFP_NOFS);
+ if (!link)
+ goto out;
+
+ ret = ext4_read_inline_data(inode, link, inline_size, &iloc);
+ if (ret < 0) {
+ kfree(link);
+ goto out;
+ }
+ nd_terminate_link(link, inode->i_size, ret);
+out:
+ if (ret < 0)
+ link = ERR_PTR(ret);
+ brelse(iloc.bh);
+ return link;
+}
+
struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval)
@@ -1626,24 +1586,36 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data)
{
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
int ret;
- struct ext4_iloc iloc;
void *inline_start;
int inline_size;
- if (ext4_get_inode_loc(dir, &iloc))
- return NULL;
+ ret = ext4_get_inode_loc(dir, &is.iloc);
+ if (ret)
+ return ERR_PTR(ret);
down_read(&EXT4_I(dir)->xattr_sem);
+
+ ret = ext4_xattr_ibody_find(dir, &i, &is);
+ if (ret)
+ goto out;
+
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
goto out;
}
- inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block +
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size,
dir, fname, 0, res_dir);
if (ret == 1)
goto out_find;
@@ -1653,20 +1625,23 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
goto out;
- inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc);
inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
- ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size,
dir, fname, 0, res_dir);
if (ret == 1)
goto out_find;
out:
- brelse(iloc.bh);
- iloc.bh = NULL;
+ brelse(is.iloc.bh);
+ if (ret < 0)
+ is.iloc.bh = ERR_PTR(ret);
+ else
+ is.iloc.bh = NULL;
out_find:
up_read(&EXT4_I(dir)->xattr_sem);
- return iloc.bh;
+ return is.iloc.bh;
}
int ext4_delete_inline_entry(handle_t *handle,
@@ -1702,11 +1677,12 @@ int ext4_delete_inline_entry(handle_t *handle,
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ err = ext4_generic_delete_entry(dir, de_del, bh,
inline_start, inline_size, 0);
if (err)
goto out;
@@ -1759,19 +1735,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
- bool ret = true;
+ bool ret = false;
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
EXT4_ERROR_INODE_ERR(dir, -err,
"error %d getting inode %lu block",
err, dir->i_ino);
- return true;
+ return false;
}
down_read(&EXT4_I(dir)->xattr_sem);
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
+ ret = true;
goto out;
}
@@ -1780,7 +1757,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - no `..'",
dir->i_ino);
- ret = true;
goto out;
}
@@ -1799,16 +1775,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
- ret = true;
goto out;
}
if (le32_to_cpu(de->inode)) {
- ret = false;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
}
+ ret = true;
out:
up_read(&EXT4_I(dir)->xattr_sem);
brelse(iloc.bh);
@@ -1873,13 +1848,14 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
};
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
return PTR_ERR(handle);
ext4_write_lock_xattr(inode, &no_expand);
if (!ext4_has_inline_data(inode)) {
+ ext4_write_unlock_xattr(inode, &no_expand);
*has_inline = 0;
ext4_journal_stop(handle);
return 0;
@@ -1897,12 +1873,26 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
EXT4_I(inode)->i_disksize = i_size;
if (i_size < inline_size) {
+ /*
+ * if there's inline data to truncate and this file was
+ * converted to extents after that inline data was written,
+ * the extent status cache must be cleared to avoid leaving
+ * behind stale delayed allocated extent entries
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+
/* Clear the content in the xattr space. */
if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
goto out_error;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode,
+ "missing inline data xattr");
+ err = -EFSCORRUPTED;
+ goto out_error;
+ }
value_len = le32_to_cpu(is.s.here->e_value_size);
value = kmalloc(value_len, GFP_NOFS);
@@ -1919,8 +1909,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
i.value = value;
i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
- err = ext4_xattr_ibody_inline_set(handle, inode,
- &i, &is);
+ err = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (err)
goto out_error;
}
@@ -1947,7 +1936,7 @@ out:
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -1965,9 +1954,21 @@ int ext4_convert_inline_data(struct inode *inode)
if (!ext4_has_inline_data(inode)) {
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
return 0;
+ } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ /*
+ * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is
+ * cleared. This means we are in the middle of moving of
+ * inline data to delay allocated block. Just force writeout
+ * here to finish conversion.
+ */
+ error = filemap_flush(inode->i_mapping);
+ if (error)
+ return error;
+ if (!ext4_has_inline_data(inode))
+ return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
iloc.bh = NULL;
error = ext4_get_inode_loc(inode, &iloc);
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index d62d802c9c12..749af7ad4e09 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -80,6 +80,145 @@ struct timestamp_expectation {
bool lower_bound;
};
+static const struct timestamp_expectation test_data[] = {
+ {
+ .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {0LL, 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 6,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0xFFFFFFFF,
+ .expected = {.tv_sec = 0x300000000LL,
+ .tv_nsec = MAX_NANOSECONDS},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
+ }
+};
+
+static void timestamp_expectation_to_desc(const struct timestamp_expectation *t,
+ char *desc)
+{
+ strscpy(desc, t->test_case_name, KUNIT_PARAM_DESC_SIZE);
+}
+
+KUNIT_ARRAY_PARAM(ext4_inode, test_data, timestamp_expectation_to_desc);
+
static time64_t get_32bit_time(const struct timestamp_expectation * const test)
{
if (test->msb_set) {
@@ -101,166 +240,35 @@ static time64_t get_32bit_time(const struct timestamp_expectation * const test)
*/
static void inode_test_xtimestamp_decoding(struct kunit *test)
{
- const struct timestamp_expectation test_data[] = {
- {
- .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 0,
- .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 0,
- .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 0,
- .expected = {0LL, 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 0,
- .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 1,
- .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 6,
- .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 0xFFFFFFFF,
- .expected = {.tv_sec = 0x300000000LL,
- .tv_nsec = MAX_NANOSECONDS},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 3,
- .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 3,
- .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
- }
- };
-
struct timespec64 timestamp;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(test_data); ++i) {
- timestamp.tv_sec = get_32bit_time(&test_data[i]);
- ext4_decode_extra_time(&timestamp,
- cpu_to_le32(test_data[i].extra_bits));
-
- KUNIT_EXPECT_EQ_MSG(test,
- test_data[i].expected.tv_sec,
- timestamp.tv_sec,
- CASE_NAME_FORMAT,
- test_data[i].test_case_name,
- test_data[i].msb_set,
- test_data[i].lower_bound,
- test_data[i].extra_bits);
- KUNIT_EXPECT_EQ_MSG(test,
- test_data[i].expected.tv_nsec,
- timestamp.tv_nsec,
- CASE_NAME_FORMAT,
- test_data[i].test_case_name,
- test_data[i].msb_set,
- test_data[i].lower_bound,
- test_data[i].extra_bits);
- }
+
+ struct timestamp_expectation *test_param =
+ (struct timestamp_expectation *)(test->param_value);
+
+ timestamp = ext4_decode_extra_time(
+ cpu_to_le32(get_32bit_time(test_param)),
+ cpu_to_le32(test_param->extra_bits));
+
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_sec,
+ timestamp.tv_sec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_nsec,
+ timestamp.tv_nsec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
}
static struct kunit_case ext4_inode_test_cases[] = {
- KUNIT_CASE(inode_test_xtimestamp_decoding),
+ KUNIT_CASE_PARAM(inode_test_xtimestamp_decoding, ext4_inode_gen_params),
{}
};
@@ -271,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = {
kunit_test_suites(&ext4_inode_test_suite);
+MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
MODULE_LICENSE("GPL v2");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 40ec5c7ef0d3..0c466ccbed69 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -20,6 +20,7 @@
*/
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
@@ -30,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@@ -48,32 +50,35 @@
#include <trace/events/ext4.h>
+static void ext4_journalled_zero_new_buffers(handle_t *handle,
+ struct inode *inode,
+ struct folio *folio,
+ unsigned from, unsigned to);
+
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__u16 dummy_csum = 0;
int offset = offsetof(struct ext4_inode, i_checksum_lo);
unsigned int csum_size = sizeof(dummy_csum);
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_GOOD_OLD_INODE_SIZE - offset);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
offset = offsetof(struct ext4_inode, i_checksum_hi);
- csum = ext4_chksum(sbi, csum, (__u8 *)raw +
- EXT4_GOOD_OLD_INODE_SIZE,
+ csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
offset - EXT4_GOOD_OLD_INODE_SIZE);
if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
}
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_INODE_SIZE(inode->i_sb) - offset);
}
@@ -87,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@@ -101,14 +106,14 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
return provided == calculated;
}
-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
- struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
__u32 csum;
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@@ -135,20 +140,13 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
-static int __ext4_journalled_writepage(struct page *page, unsigned int len);
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents);
-
/*
* Test whether an inode is a fast symlink.
* A fast symlink has its symlink data stored in ext4_inode_info->i_data.
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ if (!ext4_has_feature_ea_inode(inode->i_sb)) {
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -175,38 +173,15 @@ void ext4_evict_inode(struct inode *inode)
*/
int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
+ bool freeze_protected = false;
trace_ext4_evict_inode(inode);
+ dax_break_layout_final(inode);
+
+ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+ ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
- /*
- * When journalling data dirty buffers are tracked only in the
- * journal. So although mm thinks everything is clean and
- * ready for reaping the inode might still have some pages to
- * write in the running transaction or waiting to be
- * checkpointed. Thus calling jbd2_journal_invalidatepage()
- * (via truncate_inode_pages()) to discard these buffers can
- * cause data loss. Also even if we did not discard these
- * buffers, we would have no way to find them after the inode
- * is reaped and thus user could see stale data if he tries to
- * read them before the transaction is checkpointed. So be
- * careful and force everything to disk here... We use
- * ei->i_datasync_tid to store the newest transaction
- * containing inode's data.
- *
- * Note that directories do not have this problem because they
- * don't use page cache.
- */
- if (inode->i_ino != EXT4_JOURNAL_INO &&
- ext4_should_journal_data(inode) &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
- inode->i_data.nrpages) {
- journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
-
- jbd2_complete_transaction(journal, commit_tid);
- filemap_write_and_wait(&inode->i_data);
- }
truncate_inode_pages_final(&inode->i_data);
goto no_delete;
@@ -222,19 +197,23 @@ void ext4_evict_inode(struct inode *inode)
/*
* For inodes with journalled data, transaction commit could have
- * dirtied the inode. Flush worker is ignoring it because of I_FREEING
- * flag but we still need to remove the inode from the writeback lists.
+ * dirtied the inode. And for inodes with dioread_nolock, unwritten
+ * extents converting worker could merge extents and also have dirtied
+ * the inode. Flush worker is ignoring it because of I_FREEING flag but
+ * we still need to remove the inode from the writeback lists.
*/
- if (!list_empty_careful(&inode->i_io_list)) {
- WARN_ON_ONCE(!ext4_should_journal_data(inode));
- inode_io_list_del(inode);
- }
+ inode_io_list_del(inode);
/*
* Protect us against freezing - iput() caller didn't have to have any
- * protection against it
+ * protection against it. When we are in a running transaction though,
+ * we are already protected against freezing and we cannot grab further
+ * protection due to lock ordering constraints.
*/
- sb_start_intwrite(inode->i_sb);
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(inode->i_sb);
+ freeze_protected = true;
+ }
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
@@ -253,7 +232,8 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
- sb_end_intwrite(inode->i_sb);
+ if (freeze_protected)
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
@@ -294,7 +274,8 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
- sb_end_intwrite(inode->i_sb);
+ if (freeze_protected)
+ sb_end_intwrite(inode->i_sb);
ext4_xattr_inode_array_free(ea_inode_array);
goto no_delete;
}
@@ -323,10 +304,19 @@ stop_handle:
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
- sb_end_intwrite(inode->i_sb);
+ if (freeze_protected)
+ sb_end_intwrite(inode->i_sb);
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
+ /*
+ * Check out some where else accidentally dirty the evicting inode,
+ * which may probably cause inode use-after-free issues later.
+ */
+ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
+
+ if (!list_empty(&EXT4_I(inode)->i_fc_list))
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
@@ -362,7 +352,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
ei->i_reserved_data_blocks -= used;
percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ spin_unlock(&ei->i_block_reservation_lock);
/* Update quota subsystem for data blocks */
if (quota_claim)
@@ -390,12 +380,12 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
- map->m_len)) {
+
+ if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
"(length %d)", (unsigned long) map->m_lblk,
@@ -420,6 +410,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
return ret;
}
+/*
+ * For generic regular files, when updating the extent tree, Ext4 should
+ * hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against concurrent page faults, as well as reads and writes.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+void ext4_check_map_extents_env(struct inode *inode)
+{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ if (!S_ISREG(inode->i_mode) ||
+ IS_NOQUOTA(inode) || IS_VERITY(inode) ||
+ is_special_ino(inode->i_sb, inode->i_ino) ||
+ (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ ext4_verity_in_progress(inode))
+ return;
+
+ WARN_ON_ONCE(!inode_is_locked(inode) &&
+ !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+}
+#else
+void ext4_check_map_extents_env(struct inode *inode) {}
+#endif
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -466,6 +482,191 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int orig_mlen)
+{
+ struct ext4_map_blocks map2;
+ unsigned int status, status2;
+ int retval;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
+ WARN_ON_ONCE(orig_mlen <= map->m_len);
+
+ /* Prepare map2 for lookup in next leaf block */
+ map2.m_lblk = map->m_lblk + map->m_len;
+ map2.m_len = orig_mlen - map->m_len;
+ map2.m_flags = 0;
+ retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
+
+ if (retval <= 0) {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return map->m_len;
+ }
+
+ if (unlikely(retval != map2.m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map2.m_len);
+ WARN_ON(1);
+ }
+
+ status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ /*
+ * If map2 is contiguous with map, then let's insert it as a single
+ * extent in es cache and return the combined length of both the maps.
+ */
+ if (map->m_pblk + map->m_len == map2.m_pblk &&
+ status == status2) {
+ ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len + map2.m_len, map->m_pblk,
+ status, false);
+ map->m_len += map2.m_len;
+ } else {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ }
+
+ return map->m_len;
+}
+
+static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ unsigned int status;
+ int retval;
+ unsigned int orig_mlen = map->m_len;
+
+ flags &= EXT4_EX_QUERY_FILTER;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
+ else
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+ if (retval < 0)
+ return retval;
+
+ /* A hole? */
+ if (retval == 0)
+ goto out;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * No need to query next in leaf:
+ * - if returned extent is not last in leaf or
+ * - if the last in leaf is the full requested range
+ */
+ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
+ map->m_len == orig_mlen) {
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ } else {
+ retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map,
+ orig_mlen);
+ }
+out:
+ map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ return retval;
+}
+
+static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ struct extent_status es;
+ unsigned int status;
+ int err, retval = 0;
+
+ /*
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE
+ * indicates that the blocks and quotas has already been
+ * checked when the data was copied into the page cache.
+ */
+ if (map->m_flags & EXT4_MAP_DELAYED)
+ flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ /*
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
+ */
+ map->m_flags &= ~EXT4_MAP_FLAGS;
+
+ /*
+ * We need to check for EXT4 here because migrate could have
+ * changed the inode type in between.
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+ /*
+ * We allocated new blocks which will result in i_data's
+ * format changing. Force the migrate to fail by clearing
+ * migrate flags.
+ */
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW)
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+ if (retval <= 0)
+ return retval;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode %lu: "
+ "retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * We have to zeroout blocks before inserting them into extent
+ * status tree. Otherwise someone could look them up there and
+ * use them before they are really zeroed. We also have to
+ * unmap metadata before zeroing as otherwise writeback can
+ * overwrite zeros with stale data from block device.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO &&
+ map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
+ err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk,
+ map->m_len);
+ if (err)
+ return err;
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
+ if (ext4_es_is_written(&es))
+ return retval;
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
+ status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq);
+
+ return retval;
+}
+
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
@@ -478,9 +679,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocated. if
- * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
- * is marked as unwritten. If the create == 1, it will mark @map as mapped.
+ * On success, it returns the number of blocks being mapped or allocated.
+ * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are
+ * pre-allocated and unwritten, the resulting @map is marked as unwritten.
+ * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
* that case, @map is returned as unmapped but we still do fill map->m_len to
@@ -494,6 +696,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
int retval;
int ret = 0;
+ unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -514,8 +717,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
return -EFSCORRUPTED;
+ /*
+ * Callers from the context of data submission are the only exceptions
+ * for regular files that do not hold the i_rwsem or invalidate_lock.
+ * However, caching unrelated ranges is not permitted.
+ */
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
+ else
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -527,6 +740,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
map->m_pblk = 0;
+ map->m_flags |= ext4_es_is_delayed(&es) ?
+ EXT4_MAP_DELAYED : 0;
retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
@@ -535,46 +750,32 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
} else {
BUG();
}
+
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return retval;
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
- goto found;
+ if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
+ orig_mlen == map->m_len)
+ goto found;
+
+ map->m_len = orig_mlen;
}
+ /*
+ * In the query cache no-wait mode, nothing we can do more if we
+ * cannot find extent in the cache.
+ */
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return 0;
/*
* Try to see if we can get the block without requesting a new
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
- }
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
- ret = ext4_es_insert_extent(inode, map->m_lblk,
- map->m_len, map->m_pblk, status);
- if (ret < 0)
- retval = ret;
- }
+ retval = ext4_map_query_blocks(handle, inode, map, flags);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -592,8 +793,7 @@ found:
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns the create = 0
- * with buffer head unmapped.
+ * ext4_ext_map_blocks() returns with buffer head unmapped
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
/*
@@ -604,12 +804,8 @@ found:
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval;
- /*
- * Here we clear m_flags because after allocating an new extent,
- * it will be set again.
- */
- map->m_flags &= ~EXT4_MAP_FLAGS;
+ ext4_fc_track_inode(handle, inode);
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -617,92 +813,15 @@ found:
* with create == 1 flag.
*/
down_write(&EXT4_I(inode)->i_data_sem);
+ retval = ext4_map_create_blocks(handle, inode, map, flags);
+ up_write((&EXT4_I(inode)->i_data_sem));
- /*
- * We need to check for EXT4 here because migrate
- * could have changed the inode type in between
- */
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags);
-
- if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
- /*
- * We allocated new blocks which will result in
- * i_data's format changing. Force the migrate
- * to fail by clearing migrate flags
- */
- ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
- }
-
- /*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now. We don't
- * support fallocate for non extent files. So we can update
- * reserve space here.
- */
- if ((retval > 0) &&
- (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
- ext4_da_update_reserve_space(inode, retval, 1);
- }
-
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- /*
- * We have to zeroout blocks before inserting them into extent
- * status tree. Otherwise someone could look them up there and
- * use them before they are really zeroed. We also have to
- * unmap metadata before zeroing as otherwise writeback can
- * overwrite zeros with stale data from block device.
- */
- if (flags & EXT4_GET_BLOCKS_ZERO &&
- map->m_flags & EXT4_MAP_MAPPED &&
- map->m_flags & EXT4_MAP_NEW) {
- ret = ext4_issue_zeroout(inode, map->m_lblk,
- map->m_pblk, map->m_len);
- if (ret) {
- retval = ret;
- goto out_sem;
- }
- }
-
- /*
- * If the extent has been zeroed out, we don't need to update
- * extent status tree.
- */
- if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
- if (ext4_es_is_written(&es))
- goto out_sem;
- }
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- if (ret < 0) {
- retval = ret;
- goto out_sem;
- }
- }
+ if (retval < 0)
+ ext_debug(inode, "failed with err %d\n", retval);
+ if (retval <= 0)
+ return retval;
-out_sem:
- up_write((&EXT4_I(inode)->i_data_sem));
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ if (map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
@@ -717,9 +836,8 @@ out_sem:
!(flags & EXT4_GET_BLOCKS_ZERO) &&
!ext4_is_quota_file(inode) &&
ext4_should_order_data(inode)) {
- loff_t start_byte =
- (loff_t)map->m_lblk << inode->i_blkbits;
- loff_t length = (loff_t)map->m_len << inode->i_blkbits;
+ loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk);
+ loff_t length = EXT4_LBLK_TO_B(inode, map->m_len);
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
ret = ext4_jbd2_inode_add_wait(handle, inode,
@@ -731,9 +849,8 @@ out_sem:
return ret;
}
}
-
- if (retval < 0)
- ext_debug(inode, "failed with err %d\n", retval);
+ ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk +
+ map->m_len - 1);
return retval;
}
@@ -749,7 +866,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
flags &= EXT4_MAP_FLAGS;
/* Dummy buffer_head? Set non-atomically. */
- if (!bh->b_page) {
+ if (!bh->b_folio) {
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
return;
}
@@ -758,11 +875,30 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
* once we get rid of using bh as a container for mapping information
* to pass to / from get_block functions, this can go away.
*/
+ old_state = READ_ONCE(bh->b_state);
do {
- old_state = READ_ONCE(bh->b_state);
new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
- } while (unlikely(
- cmpxchg(&bh->b_state, old_state, new_state) != old_state));
+ } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
+}
+
+/*
+ * Make sure that the current journal transaction has enough credits to map
+ * one extent. Return -EAGAIN if it cannot extend the current running
+ * transaction.
+ */
+static inline int ext4_journal_ensure_extent_credits(handle_t *handle,
+ struct inode *inode)
+{
+ int credits;
+ int ret;
+
+ /* Called from ext4_da_write_begin() which has no handle started? */
+ if (!handle)
+ return 0;
+
+ credits = ext4_chunk_trans_blocks(inode, 1);
+ ret = __ext4_journal_ensure_credits(handle, credits, credits, 0);
+ return ret <= 0 ? ret : -EAGAIN;
}
static int _ext4_get_block(struct inode *inode, sector_t iblock,
@@ -806,10 +942,22 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
+ int ret = 0;
+
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ ret = _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+
+ /*
+ * If the buffer is marked unwritten, mark it as new to make sure it is
+ * zeroed out correctly in case of partial writes. Otherwise, there is
+ * a chance of stale data getting exposed.
+ */
+ if (ret == 0 && buffer_unwritten(bh_result))
+ set_buffer_new(bh_result);
+
+ return ret;
}
/* Maximum number of blocks we map for direct IO at once. */
@@ -824,9 +972,12 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
struct ext4_map_blocks map;
struct buffer_head *bh;
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
+ bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
int err;
- J_ASSERT(handle != NULL || create == 0);
+ ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || handle != NULL || create == 0);
+ ASSERT(create == 0 || !nowait);
map.m_lblk = block;
map.m_len = 1;
@@ -837,12 +988,23 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (err < 0)
return ERR_PTR(err);
- bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (nowait)
+ return sb_find_get_block(inode->i_sb, map.m_pblk);
+
+ /*
+ * Since bh could introduce extra ref count such as referred by
+ * journal_head etc. Try to avoid using __GFP_MOVABLE here
+ * as it may fail the migration when journal_head remains.
+ */
+ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk,
+ inode->i_sb->s_blocksize);
+
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
- J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ ASSERT(create != 0);
+ ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || (handle != NULL));
/*
* Now that we do not always journal data, we should
@@ -853,7 +1015,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
*/
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(err)) {
unlock_buffer(bh);
goto errout;
@@ -879,18 +1042,20 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int map_flags)
{
struct buffer_head *bh;
+ int ret;
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
}
/* Read a contiguous batch of blocks. */
@@ -911,8 +1076,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
- &bhs[i]);
+ ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
if (!wait)
return 0;
@@ -937,12 +1101,12 @@ out_brelse:
return err;
}
-int ext4_walk_page_buffers(handle_t *handle,
+int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
- int (*fn)(handle_t *handle,
+ int (*fn)(handle_t *handle, struct inode *inode,
struct buffer_head *bh))
{
struct buffer_head *bh;
@@ -961,7 +1125,7 @@ int ext4_walk_page_buffers(handle_t *handle,
*partial = 1;
continue;
}
- err = (*fn)(handle, bh);
+ err = (*fn)(handle, inode, bh);
if (!ret)
ret = err;
}
@@ -969,120 +1133,110 @@ int ext4_walk_page_buffers(handle_t *handle,
}
/*
- * To preserve ordering, it is essential that the hole instantiation and
- * the data write be encapsulated in a single transaction. We cannot
- * close off a transaction and start a new one between the ext4_get_block()
- * and the commit_write(). So doing the jbd2_journal_start at the start of
- * prepare_write() is the right place.
- *
- * Also, this function can nest inside ext4_writepage(). In that case, we
- * *know* that ext4_writepage() has generated enough buffer credits to do the
- * whole page. So we won't block on the journal in that case, which is good,
- * because the caller may be PF_MEMALLOC.
- *
- * By accident, ext4 can be reentered when a transaction is open via
- * quota file writes. If we were to commit the transaction while thus
- * reentered, there can be a deadlock - we would be holding a quota
- * lock, and the commit would never complete if another thread had a
- * transaction open and was blocking on the quota lock - a ranking
- * violation.
- *
- * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
- * will _not_ run commit under these circumstances because handle->h_ref
- * is elevated. We'll still have enough credits for the tiny quotafile
- * write.
+ * Helper for handling dirtying of journalled data. We also mark the folio as
+ * dirty so that writeback code knows about this page (and inode) contains
+ * dirty data. ext4_writepages() then commits appropriate transaction to
+ * make data stable.
*/
-int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- int dirty = buffer_dirty(bh);
- int ret;
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
+ return ext4_handle_dirty_metadata(handle, NULL, bh);
+}
+int do_journal_get_write_access(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
+{
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
- /*
- * __block_write_begin() could have dirtied some buffers. Clean
- * the dirty bit as jbd2_journal_get_write_access() could complain
- * otherwise about fs integrity issues. Setting of the dirty bit
- * by __block_write_begin() isn't a real problem here as we clear
- * the bit before releasing a page lock and thus writeback cannot
- * ever write the buffer.
- */
- if (dirty)
- clear_buffer_dirty(bh);
BUFFER_TRACE(bh, "get write access");
- ret = ext4_journal_get_write_access(handle, bh);
- if (!ret && dirty)
- ret = ext4_handle_dirty_metadata(handle, NULL, bh);
- return ret;
+ return ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
+int ext4_block_write_begin(handle_t *handle, struct folio *folio,
+ loff_t pos, unsigned len,
+ get_block_t *get_block)
{
- unsigned from = pos & (PAGE_SIZE - 1);
+ unsigned int from = offset_in_folio(folio, pos);
unsigned to = from + len;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned block_start, block_end;
sector_t block;
int err = 0;
- unsigned blocksize = inode->i_sb->s_blocksize;
- unsigned bbits;
+ unsigned int blocksize = i_blocksize(inode);
struct buffer_head *bh, *head, *wait[2];
int nr_wait = 0;
int i;
+ bool should_journal_data = ext4_should_journal_data(inode);
- BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
+ WARN_ON_ONCE(blocksize > folio_size(folio));
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
- bbits = ilog2(blocksize);
- block = (sector_t)page->index << (PAGE_SHIFT - bbits);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+ block = EXT4_PG_TO_LBLK(inode, folio->index);
for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start = block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ if (folio_test_uptodate(folio)) {
+ set_buffer_uptodate(bh);
}
continue;
}
- if (buffer_new(bh))
+ if (WARN_ON_ONCE(buffer_new(bh)))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (!err)
+ err = get_block(inode, block, bh, 1);
if (err)
break;
if (buffer_new(bh)) {
- if (PageUptodate(page)) {
- clear_buffer_new(bh);
+ /*
+ * We may be zeroing partial buffers or all new
+ * buffers in case of failure. Prepare JBD2 for
+ * that.
+ */
+ if (should_journal_data)
+ do_journal_get_write_access(handle,
+ inode, bh);
+ if (folio_test_uptodate(folio)) {
+ /*
+ * Unlike __block_write_begin() we leave
+ * dirtying of new uptodate buffers to
+ * ->write_end() time or
+ * folio_zero_new_buffers().
+ */
set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
continue;
}
if (block_end > to || block_start < from)
- zero_user_segments(page, to, block_end,
- block_start, from);
+ folio_zero_segments(folio, to,
+ block_end,
+ block_start, from);
continue;
}
}
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ if (folio_test_uptodate(folio)) {
+ set_buffer_uptodate(bh);
continue;
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ext4_read_bh_lock(bh, 0, false);
wait[nr_wait++] = bh;
}
}
@@ -1095,13 +1249,17 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
err = -EIO;
}
if (unlikely(err)) {
- page_zero_new_buffers(page, from, to);
- } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+ if (should_journal_data)
+ ext4_journalled_zero_new_buffers(handle, inode, folio,
+ from, to);
+ else
+ folio_zero_new_buffers(folio, from, to);
+ } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
- err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
- bh_offset(wait[i]));
+ err2 = fscrypt_decrypt_pagecache_blocks(folio,
+ blocksize, bh_offset(wait[i]));
if (err2) {
clear_buffer_uptodate(wait[i]);
err = err2;
@@ -1111,36 +1269,43 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
return err;
}
-#endif
-static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+/*
+ * To preserve ordering, it is essential that the hole instantiation and
+ * the data write be encapsulated in a single transaction. We cannot
+ * close off a transaction and start a new one between the ext4_get_block()
+ * and the ext4_write_end(). So doing the jbd2_journal_start at the start of
+ * ext4_write_begin() is the right place.
+ */
+static int ext4_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
int ret, needed_blocks;
handle_t *handle;
int retries = 0;
- struct page *page;
+ struct folio *folio;
pgoff_t index;
unsigned from, to;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
- trace_ext4_write_begin(inode, pos, len, flags);
+ trace_ext4_write_begin(inode, pos, len);
/*
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
*/
- needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ needed_blocks = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode)) + 1;
index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
- flags, pagep);
+ foliop);
if (ret < 0)
return ret;
if (ret == 1)
@@ -1148,65 +1313,71 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
}
/*
- * grab_cache_page_write_begin() can take a long time if the
- * system is thrashing due to memory pressure, or if the page
+ * write_begin_get_folio() can take a long time if the
+ * system is thrashing due to memory pressure, or if the folio
* is being written back. So grab it first before we start
* the transaction handle. This also allows us to allocate
- * the page (if needed) without using GFP_NOFS.
+ * the folio (if needed) without using GFP_NOFS.
*/
retry_grab:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
- unlock_page(page);
+ folio = write_begin_get_folio(iocb, mapping, index, len);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
+
+ from = offset_in_folio(folio, pos);
+ to = from + len;
+
+ /*
+ * The same as page allocation, we prealloc buffer heads before
+ * starting the handle.
+ */
+ if (!folio_buffers(folio))
+ create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);
+
+ folio_unlock(folio);
retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
- put_page(page);
+ folio_put(folio);
return PTR_ERR(handle);
}
- lock_page(page);
- if (page->mapping != mapping) {
- /* The page got truncated from under us */
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ /* The folio got truncated from under us */
+ folio_unlock(folio);
+ folio_put(folio);
ext4_journal_stop(handle);
goto retry_grab;
}
- /* In case writeback began while the page was unlocked */
- wait_for_stable_page(page);
+ /* In case writeback began while the folio was unlocked */
+ folio_wait_stable(folio);
-#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
- ret = ext4_block_write_begin(page, pos, len,
+ ret = ext4_block_write_begin(handle, folio, pos, len,
ext4_get_block_unwritten);
else
- ret = ext4_block_write_begin(page, pos, len,
+ ret = ext4_block_write_begin(handle, folio, pos, len,
ext4_get_block);
-#else
- if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(page, pos, len,
- ext4_get_block_unwritten);
- else
- ret = __block_write_begin(page, pos, len, ext4_get_block);
-#endif
if (!ret && ext4_should_journal_data(inode)) {
- ret = ext4_walk_page_buffers(handle, page_buffers(page),
- from, to, NULL,
- do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, inode,
+ folio_buffers(folio), from, to,
+ NULL, do_journal_get_write_access);
}
if (ret) {
bool extended = (pos + len > inode->i_size) &&
!ext4_verity_in_progress(inode);
- unlock_page(page);
+ folio_unlock(folio);
/*
- * __block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold i_rwsem.
*
* Add inode to orphan list in case we crash before
* truncate finishes
@@ -1227,64 +1398,61 @@ retry_journal:
ext4_orphan_del(NULL, inode);
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -EAGAIN ||
+ (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_journal;
- put_page(page);
+ folio_put(folio);
return ret;
}
- *pagep = page;
+ *foliop = folio;
return ret;
}
/* For write_end() in data=journal mode */
-static int write_end_fn(handle_t *handle, struct buffer_head *bh)
+static int write_end_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ ret = ext4_dirty_journalled_data(handle, bh);
clear_buffer_meta(bh);
clear_buffer_prio(bh);
+ clear_buffer_new(bh);
return ret;
}
/*
* We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
+ * `iocb` can be NULL - eg, when called from page_symlink().
*
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * ext4 never places buffers on inode->i_mapping->i_private_list. metadata
* buffers are managed internally.
*/
-static int ext4_write_end(struct file *file,
+static int ext4_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
int ret = 0, ret2;
int i_size_changed = 0;
- int inline_data = ext4_has_inline_data(inode);
bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
- if (inline_data) {
- ret = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto errout;
- }
- copied = ret;
- } else
- copied = block_write_end(file, mapping, pos,
- len, copied, page, fsdata);
+
+ if (ext4_has_inline_data(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
+
+ copied = block_write_end(pos, len, copied, folio);
/*
- * it's important to update i_size while still holding page lock:
+ * it's important to update i_size while still holding folio lock:
* page writeout could otherwise come in and zero beyond i_size.
*
* If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
@@ -1292,18 +1460,20 @@ static int ext4_write_end(struct file *file,
*/
if (!verity)
i_size_changed = ext4_update_inode_size(inode, pos + copied);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
/*
- * Don't mark the inode dirty under page lock. First, it unnecessarily
- * makes the holding time of page lock longer. Second, it forces lock
- * ordering of page lock and transaction start for journaling
+ * Don't mark the inode dirty under folio lock. First, it unnecessarily
+ * makes the holding time of folio lock longer. Second, it forces lock
+ * ordering of folio lock and transaction start for journaling
* filesystems.
*/
- if (i_size_changed || inline_data)
+ if (i_size_changed)
ret = ext4_mark_inode_dirty(handle, inode);
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
@@ -1312,7 +1482,7 @@ static int ext4_write_end(struct file *file,
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-errout:
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1332,32 +1502,33 @@ errout:
}
/*
- * This is a private version of page_zero_new_buffers() which doesn't
+ * This is a private version of folio_zero_new_buffers() which doesn't
* set the buffer to be dirty, since in data=journalled mode we need
- * to call ext4_handle_dirty_metadata() instead.
+ * to call ext4_dirty_journalled_data() instead.
*/
static void ext4_journalled_zero_new_buffers(handle_t *handle,
- struct page *page,
+ struct inode *inode,
+ struct folio *folio,
unsigned from, unsigned to)
{
unsigned int block_start = 0, block_end;
struct buffer_head *head, *bh;
- bh = head = page_buffers(page);
+ bh = head = folio_buffers(folio);
do {
block_end = block_start + bh->b_size;
if (buffer_new(bh)) {
if (block_end > from && block_start < to) {
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
unsigned start, size;
start = max(from, block_start);
size = min(to, block_end) - start;
- zero_user(page, start, size);
- write_end_fn(handle, bh);
+ folio_zero_range(folio, start, size);
}
clear_buffer_new(bh);
+ write_end_fn(handle, inode, bh);
}
}
block_start = block_end;
@@ -1365,10 +1536,10 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
} while (bh != head);
}
-static int ext4_journalled_write_end(struct file *file,
+static int ext4_journalled_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
@@ -1377,7 +1548,6 @@ static int ext4_journalled_write_end(struct file *file,
int partial = 0;
unsigned from, to;
int size_changed = 0;
- int inline_data = ext4_has_inline_data(inode);
bool verity = ext4_verity_in_progress(inode);
trace_ext4_journalled_write_end(inode, pos, len, copied);
@@ -1386,39 +1556,37 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
- if (inline_data) {
- ret = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto errout;
- }
- copied = ret;
- } else if (unlikely(copied < len) && !PageUptodate(page)) {
+ if (ext4_has_inline_data(inode))
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
+
+ if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
copied = 0;
- ext4_journalled_zero_new_buffers(handle, page, from, to);
+ ext4_journalled_zero_new_buffers(handle, inode, folio,
+ from, to);
} else {
if (unlikely(copied < len))
- ext4_journalled_zero_new_buffers(handle, page,
+ ext4_journalled_zero_new_buffers(handle, inode, folio,
from + copied, to);
- ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
- from + copied, &partial,
+ ret = ext4_walk_page_buffers(handle, inode,
+ folio_buffers(folio),
+ from, from + copied, &partial,
write_end_fn);
if (!partial)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
if (!verity)
size_changed = ext4_update_inode_size(inode, pos + copied);
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
- if (size_changed || inline_data) {
+ if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
@@ -1431,7 +1599,6 @@ static int ext4_journalled_write_end(struct file *file,
*/
ext4_orphan_add(handle, inode);
-errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1450,9 +1617,9 @@ errout:
}
/*
- * Reserve space for a single cluster
+ * Reserve space for 'nr_resv' clusters
*/
-static int ext4_da_reserve_space(struct inode *inode)
+static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1463,18 +1630,18 @@ static int ext4_da_reserve_space(struct inode *inode)
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
if (ret)
return ret;
spin_lock(&ei->i_block_reservation_lock);
- if (ext4_claim_free_clusters(sbi, 1, 0)) {
+ if (ext4_claim_free_clusters(sbi, nr_resv, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
return -ENOSPC;
}
- ei->i_reserved_data_blocks++;
- trace_ext4_da_reserve_space(inode);
+ ei->i_reserved_data_blocks += nr_resv;
+ trace_ext4_da_reserve_space(inode, nr_resv);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1520,14 +1687,18 @@ void ext4_da_release_space(struct inode *inode, int to_free)
*/
struct mpage_da_data {
+ /* These are input fields for ext4_do_writepages() */
struct inode *inode;
struct writeback_control *wbc;
+ unsigned int can_map:1; /* Can writepages call map blocks? */
+
+ /* These are internal state of ext4_do_writepages() */
+ loff_t start_pos; /* The start pos to write */
+ loff_t next_pos; /* Current pos to examine */
+ loff_t end_pos; /* Last pos to examine */
- pgoff_t first_page; /* The first page to write */
- pgoff_t next_page; /* Current page to examine */
- pgoff_t last_page; /* Last page to examine */
/*
- * Extent to map - this can be after first_page because that can be
+ * Extent to map - this can be after start_pos because that can be
* fully mapped. We somewhat abuse m_flags to store whether the extent
* is delalloc or unwritten.
*/
@@ -1535,50 +1706,63 @@ struct mpage_da_data {
struct ext4_io_submit io_submit; /* IO submission data */
unsigned int do_map:1;
unsigned int scanned_until_end:1;
+ unsigned int journalled_more_data:1;
};
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
bool invalidate)
{
- int nr_pages, i;
+ unsigned nr, i;
pgoff_t index, end;
- struct pagevec pvec;
+ struct folio_batch fbatch;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- /* This is necessary when next_page == 0. */
- if (mpd->first_page >= mpd->next_page)
+ /* This is necessary when next_pos == 0. */
+ if (mpd->start_pos >= mpd->next_pos)
return;
mpd->scanned_until_end = 0;
- index = mpd->first_page;
- end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
- start = index << (PAGE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_SHIFT - inode->i_blkbits);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
+ last = mpd->next_pos >> inode->i_blkbits;
+
+ /*
+ * avoid racing with extent status tree scans made by
+ * ext4_insert_delayed_block()
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_es_remove_extent(inode, start, last - start);
+ up_write(&EXT4_I(inode)->i_data_sem);
}
- pagevec_init(&pvec);
- while (index <= end) {
- nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
- if (nr_pages == 0)
+ folio_batch_init(&fbatch);
+ index = mpd->start_pos >> PAGE_SHIFT;
+ end = mpd->next_pos >> PAGE_SHIFT;
+ while (index < end) {
+ nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
+ if (folio_pos(folio) < mpd->start_pos)
+ continue;
+ if (folio_next_index(folio) > end)
+ continue;
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
if (invalidate) {
- if (page_mapped(page))
- clear_page_dirty_for_io(page);
- block_invalidatepage(page, 0, PAGE_SIZE);
- ClearPageUptodate(page);
+ if (folio_mapped(folio))
+ folio_clear_dirty_for_io(folio);
+ block_invalidate_folio(folio, 0,
+ folio_size(folio));
+ folio_clear_uptodate(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
}
@@ -1604,30 +1788,59 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+/*
+ * Check whether the cluster containing lblk has been allocated or has
+ * delalloc reservation.
+ *
+ * Returns 0 if the cluster doesn't have either, 1 if it has delalloc
+ * reservation, 2 if it's already been allocated, negative error code on
+ * failure.
+ */
+static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
{
- return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ /* Has delalloc reservation? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
+ return 1;
+
+ /* Already been allocated? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+ return 2;
+ ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 2;
+
+ return 0;
}
/*
- * ext4_insert_delayed_block - adds a delayed block to the extents status
- * tree, incrementing the reserved cluster/block
- * count or making a pending reservation
- * where needed
+ * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents
+ * status tree, incrementing the reserved
+ * cluster/block count or making pending
+ * reservations where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
*
* Returns 0 on success, negative error code on failure.
*/
-static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
- bool allocated = false;
+ bool lclu_allocated = false;
+ bool end_allocated = false;
+ ext4_lblk_t resv_clu;
+ ext4_lblk_t end = lblk + len - 1;
/*
- * If the cluster containing lblk is shared with a delayed,
+ * If the cluster containing lblk or end is shared with a delayed,
* written, or unwritten extent in a bigalloc file system, it's
* already been accounted for and does not need to be reserved.
* A pending reservation must be made for the cluster if it's
@@ -1638,86 +1851,86 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* extents status tree doesn't get a match.
*/
if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode);
+ ret = ext4_da_reserve_space(inode, len);
if (ret != 0) /* ENOSPC */
- goto errout;
+ return ret;
} else { /* bigalloc */
- if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
- if (!ext4_es_scan_clu(inode,
- &ext4_es_is_mapped, lblk)) {
- ret = ext4_clu_mapped(inode,
- EXT4_B2C(sbi, lblk));
- if (ret < 0)
- goto errout;
- if (ret == 0) {
- ret = ext4_da_reserve_space(inode);
- if (ret != 0) /* ENOSPC */
- goto errout;
- } else {
- allocated = true;
- }
- } else {
- allocated = true;
+ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1;
+
+ ret = ext4_clu_alloc_state(inode, lblk);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ lclu_allocated = (ret == 2);
+ }
+
+ if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
+ ret = ext4_clu_alloc_state(inode, end);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ end_allocated = (ret == 2);
}
}
- }
- ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+ if (resv_clu) {
+ ret = ext4_da_reserve_space(inode, resv_clu);
+ if (ret != 0) /* ENOSPC */
+ return ret;
+ }
+ }
-errout:
- return ret;
+ ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
+ end_allocated);
+ return 0;
}
/*
- * This function is grabs code from the very beginning of
- * ext4_map_blocks, but assumes that the caller is from delayed write
- * time. This function looks up the requested blocks and sets the
- * buffer delay bit under the protection of i_data_sem.
+ * Looks up the requested blocks and sets the delalloc extent map.
+ * First try to look up for the extent entry that contains the requested
+ * blocks in the extent status tree without i_data_sem, then try to look
+ * up for the ondisk extent mapping with i_data_sem in read mode,
+ * finally hold i_data_sem in write mode, looks up again and add a
+ * delalloc extent entry if it still couldn't find any extent. Pass out
+ * the mapped extent through @map and return 0 on success.
*/
-static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
- struct ext4_map_blocks *map,
- struct buffer_head *bh)
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
{
struct extent_status es;
int retval;
- sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
memcpy(&orig_map, map, sizeof(*map));
#endif
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
-
map->m_flags = 0;
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
- if (ext4_es_is_hole(&es)) {
- retval = 0;
- down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
+ if (ext4_es_is_hole(&es))
goto add_delayed;
- }
+found:
/*
* Delayed extent could be allocated by fallocate.
* So we need to check it.
*/
- if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
+ if (ext4_es_is_delayed(&es)) {
+ map->m_flags |= EXT4_MAP_DELAYED;
return 0;
}
- map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
- retval = es.es_len - (iblock - es.es_lblk);
- if (retval > map->m_len)
- retval = map->m_len;
- map->m_len = retval;
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
if (ext4_es_is_written(&es))
map->m_flags |= EXT4_MAP_MAPPED;
else if (ext4_es_is_unwritten(&es))
@@ -1728,7 +1941,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
- return retval;
+ return 0;
}
/*
@@ -1738,51 +1951,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_has_inline_data(inode))
retval = 0;
- else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
- retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ if (retval)
+ return retval < 0 ? retval : 0;
add_delayed:
- if (retval == 0) {
- int ret;
-
- /*
- * XXX: __block_prepare_write() unmaps passed block,
- * is it OK?
- */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ /*
+ * Page fault path (ext4_page_mkwrite does not take i_rwsem)
+ * and fallocate path (no folio lock) can race. Make sure we
+ * lookup the extent status tree here again while i_data_sem
+ * is held in write mode, before inserting a new da entry in
+ * the extent status tree.
+ */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
- ret = ext4_insert_delayed_block(inode, map->m_lblk);
- if (ret != 0) {
- retval = ret;
- goto out_unlock;
+ if (!ext4_es_is_hole(&es)) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto found;
}
-
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
- } else if (retval > 0) {
- int ret;
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
+ } else if (!ext4_has_inline_data(inode)) {
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
+ if (retval) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return retval < 0 ? retval : 0;
}
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- if (ret != 0)
- retval = ret;
}
-out_unlock:
- up_read((&EXT4_I(inode)->i_data_sem));
+ map->m_flags |= EXT4_MAP_DELAYED;
+ retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
+ if (!retval)
+ map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ up_write(&EXT4_I(inode)->i_data_sem);
return retval;
}
@@ -1803,11 +2007,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
+ sector_t invalid_block = ~((sector_t) 0xffff);
int ret = 0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
map.m_lblk = iblock;
map.m_len = 1;
@@ -1816,10 +2024,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_da_map_blocks(inode, iblock, &map, bh);
- if (ret <= 0)
+ ret = ext4_da_map_blocks(inode, &map);
+ if (ret < 0)
return ret;
+ if (map.m_flags & EXT4_MAP_DELAYED) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
map_bh(bh, inode->i_sb, map.m_pblk);
ext4_update_bh_state(bh, map.m_flags);
@@ -1836,253 +2051,40 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-static int bget_one(handle_t *handle, struct buffer_head *bh)
+static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
- get_bh(bh);
- return 0;
+ mpd->start_pos += folio_size(folio);
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
-static int bput_one(handle_t *handle, struct buffer_head *bh)
+static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
{
- put_bh(bh);
- return 0;
-}
-
-static int __ext4_journalled_writepage(struct page *page,
- unsigned int len)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct buffer_head *page_bufs = NULL;
- handle_t *handle = NULL;
- int ret = 0, err = 0;
- int inline_data = ext4_has_inline_data(inode);
- struct buffer_head *inode_bh = NULL;
-
- ClearPageChecked(page);
-
- if (inline_data) {
- BUG_ON(page->index != 0);
- BUG_ON(len > ext4_get_max_inline_size(inode));
- inode_bh = ext4_journalled_write_inline_data(inode, len, page);
- if (inode_bh == NULL)
- goto out;
- } else {
- page_bufs = page_buffers(page);
- if (!page_bufs) {
- BUG();
- goto out;
- }
- ext4_walk_page_buffers(handle, page_bufs, 0, len,
- NULL, bget_one);
- }
- /*
- * We need to release the page lock before we start the
- * journal, so grab a reference so the page won't disappear
- * out from under us.
- */
- get_page(page);
- unlock_page(page);
-
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- put_page(page);
- goto out_no_pagelock;
- }
- BUG_ON(!ext4_handle_valid(handle));
-
- lock_page(page);
- put_page(page);
- if (page->mapping != mapping) {
- /* The page got truncated from under us */
- ext4_journal_stop(handle);
- ret = 0;
- goto out;
- }
-
- if (inline_data) {
- ret = ext4_mark_inode_dirty(handle, inode);
- } else {
- ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
-
- err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
- }
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-
- if (!ext4_has_inline_data(inode))
- ext4_walk_page_buffers(NULL, page_bufs, 0, len,
- NULL, bput_one);
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
-out:
- unlock_page(page);
-out_no_pagelock:
- brelse(inode_bh);
- return ret;
-}
-
-/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), no one guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * This function can get called via...
- * - ext4_writepages after taking page lock (have journal handle)
- * - journal_submit_inode_data_buffers (no journal handle)
- * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
- * - grab_page_cache when doing write_begin (have journal handle)
- *
- * We don't do any block allocation in this function. If we have page with
- * multiple blocks we need to write those buffer_heads that are mapped. This
- * is important for mmaped based write. So if we do with blocksize 1K
- * truncate(f, 1024);
- * a = mmap(f, 0, 4096);
- * a[0] = 'a';
- * truncate(f, 4096);
- * we have in the page first buffer_head mapped via page_mkwrite call back
- * but other buffer_heads would be unmapped but dirty (dirty done via the
- * do_wp_page). So writepage should write the first block. If we modify
- * the mmap area beyond 1024 we will again get a page_fault and the
- * page_mkwrite callback will do the block allocation and mark the
- * buffer_heads mapped.
- *
- * We redirty the page if we have any buffer_heads that is either delay or
- * unwritten in the page.
- *
- * We can get recursively called as show below.
- *
- * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext4_writepage()
- *
- * But since we don't do any block allocation we should not deadlock.
- * Page also have the dirty flag cleared so we don't get recurive page_lock.
- */
-static int ext4_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- int ret = 0;
- loff_t size;
- unsigned int len;
- struct buffer_head *page_bufs = NULL;
- struct inode *inode = page->mapping->host;
- struct ext4_io_submit io_submit;
- bool keep_towrite = false;
-
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
- inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
- return -EIO;
- }
-
- trace_ext4_writepage(page);
- size = i_size_read(inode);
- if (page->index == size >> PAGE_SHIFT &&
- !ext4_verity_in_progress(inode))
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
-
- page_bufs = page_buffers(page);
- /*
- * We cannot do block allocation or other extent handling in this
- * function. If there are buffers needing that, we have to redirty
- * the page. But we may reach here when we do a journal commit via
- * journal_submit_inode_data_buffers() and in that case we must write
- * allocated buffers to achieve data=ordered mode guarantees.
- *
- * Also, if there is only one buffer per page (the fs block
- * size == the page size), if one buffer needs block
- * allocation or needs to modify the extent tree to clear the
- * unwritten flag, we know that the page can't be written at
- * all, so we might as well refuse the write immediately.
- * Unfortunately if the block size != page size, we can't as
- * easily detect this case using ext4_walk_page_buffers(), but
- * for the extremely common case, this is an optimization that
- * skips a useless round trip through ext4_bio_write_page().
- */
- if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- redirty_page_for_writepage(wbc, page);
- if ((current->flags & PF_MEMALLOC) ||
- (inode->i_sb->s_blocksize == PAGE_SIZE)) {
- /*
- * For memory cleaning there's no point in writing only
- * some buffers. So just bail out. Warn if we came here
- * from direct reclaim.
- */
- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
- == PF_MEMALLOC);
- unlock_page(page);
- return 0;
- }
- keep_towrite = true;
- }
-
- if (PageChecked(page) && ext4_should_journal_data(inode))
- /*
- * It's mmapped pagecache. Add buffers and journal it. There
- * doesn't seem much point in redirtying the page here.
- */
- return __ext4_journalled_writepage(page, len);
-
- ext4_io_submit_init(&io_submit, wbc);
- io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_submit.io_end) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return -ENOMEM;
- }
- ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
- ext4_io_submit(&io_submit);
- /* Drop io_end reference we got from init */
- ext4_put_io_end_defer(io_submit.io_end);
- return ret;
-}
-
-static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
-{
- int len;
+ size_t len;
loff_t size;
int err;
- BUG_ON(page->index != mpd->first_page);
- clear_page_dirty_for_io(page);
+ WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
+ folio_clear_dirty_for_io(folio);
/*
* We have to be very careful here! Nothing protects writeback path
* against i_size changes and the page can be writeably mapped into
* page tables. So an application can be growing i_size and writing
- * data through mmap while writeback runs. clear_page_dirty_for_io()
+ * data through mmap while writeback runs. folio_clear_dirty_for_io()
* write-protects our page in page tables and the page cannot get
- * written to again until we release page lock. So only after
- * clear_page_dirty_for_io() we are safe to sample i_size for
- * ext4_bio_write_page() to zero-out tail of the written page. We rely
- * on the barrier provided by TestClearPageDirty in
- * clear_page_dirty_for_io() to make sure i_size is really sampled only
+ * written to again until we release folio lock. So only after
+ * folio_clear_dirty_for_io() we are safe to sample i_size for
+ * ext4_bio_write_folio() to zero-out tail of the written page. We rely
+ * on the barrier provided by folio_test_clear_dirty() in
+ * folio_clear_dirty_for_io() to make sure i_size is really sampled only
* after page tables are updated.
*/
size = i_size_read(mpd->inode);
- if (page->index == size >> PAGE_SHIFT &&
+ len = folio_size(folio);
+ if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(mpd->inode))
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
- err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
- if (!err)
- mpd->wbc->nr_to_write--;
- mpd->first_page++;
+ len = size & (len - 1);
+ err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
return err;
}
@@ -2193,9 +2195,10 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
} while (lblk++, (bh = bh->b_this_page) != head);
/* So far everything mapped? Submit the page for IO. */
if (mpd->map.m_len == 0) {
- err = mpage_submit_page(mpd, head->b_page);
+ err = mpage_submit_folio(mpd, head->b_folio);
if (err < 0)
return err;
+ mpage_folio_done(mpd, head->b_folio);
}
if (lblk >= blocks) {
mpd->scanned_until_end = 1;
@@ -2205,21 +2208,22 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
}
/*
- * mpage_process_page - update page buffers corresponding to changed extent and
- * may submit fully mapped page for IO
- *
- * @mpd - description of extent to map, on return next extent to map
- * @m_lblk - logical block mapping.
- * @m_pblk - corresponding physical mapping.
- * @map_bh - determines on return whether this page requires any further
+ * mpage_process_folio - update folio buffers corresponding to changed extent
+ * and may submit fully mapped page for IO
+ * @mpd: description of extent to map, on return next extent to map
+ * @folio: Contains these buffers.
+ * @m_lblk: logical block mapping.
+ * @m_pblk: corresponding physical mapping.
+ * @map_bh: determines on return whether this page requires any further
* mapping or not.
- * Scan given page buffers corresponding to changed extent and update buffer
+ *
+ * Scan given folio buffers corresponding to changed extent and update buffer
* state according to new extent state.
* We map delalloc buffers to their physical location, clear unwritten bits.
- * If the given page is not fully mapped, we update @map to the next extent in
- * the given page that needs mapping & return @map_bh as true.
+ * If the given folio is not fully mapped, we update @mpd to the next extent in
+ * the given folio that needs mapping & return @map_bh as true.
*/
-static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
bool *map_bh)
{
@@ -2228,23 +2232,21 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
ext4_lblk_t lblk = *m_lblk;
ext4_fsblk_t pblock = *m_pblk;
int err = 0;
- int blkbits = mpd->inode->i_blkbits;
ssize_t io_end_size = 0;
struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
- bh = head = page_buffers(page);
+ bh = head = folio_buffers(folio);
do {
if (lblk < mpd->map.m_lblk)
continue;
if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
/*
* Buffer after end of mapped extent.
- * Find next buffer in the page to map.
+ * Find next buffer in the folio to map.
*/
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
io_end_vec->size += io_end_size;
- io_end_size = 0;
err = mpage_process_page_bufs(mpd, head, bh, lblk);
if (err > 0)
@@ -2255,7 +2257,8 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
err = PTR_ERR(io_end_vec);
goto out;
}
- io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode,
+ mpd->map.m_lblk);
}
*map_bh = true;
goto out;
@@ -2265,11 +2268,10 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
bh->b_blocknr = pblock++;
}
clear_buffer_unwritten(bh);
- io_end_size += (1 << blkbits);
+ io_end_size += i_blocksize(mpd->inode);
} while (lblk++, (bh = bh->b_this_page) != head);
io_end_vec->size += io_end_size;
- io_end_size = 0;
*map_bh = false;
out:
*m_lblk = lblk;
@@ -2293,31 +2295,29 @@ out:
*/
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
- struct pagevec pvec;
- int nr_pages, i;
+ struct folio_batch fbatch;
+ unsigned nr, i;
struct inode *inode = mpd->inode;
- int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
ext4_fsblk_t pblock;
int err;
bool map_bh = false;
- start = mpd->map.m_lblk >> bpp_bits;
- end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
- lblk = start << bpp_bits;
+ start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk);
+ end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1);
pblock = mpd->map.m_pblk;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
while (start <= end) {
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
- &start, end);
- if (nr_pages == 0)
+ nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
- err = mpage_process_page(mpd, page, &lblk, &pblock,
+ lblk = EXT4_PG_TO_LBLK(inode, folio->index);
+ err = mpage_process_folio(mpd, folio, &lblk, &pblock,
&map_bh);
/*
* If map_bh is true, means page may require further bh
@@ -2327,18 +2327,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
if (err < 0 || map_bh)
goto out;
/* Page fully mapped - let IO run! */
- err = mpage_submit_page(mpd, page);
+ err = mpage_submit_folio(mpd, folio);
if (err < 0)
goto out;
+ mpage_folio_done(mpd, folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
/* Extent fully mapped and matches with page boundary. We are done. */
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
out:
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
return err;
}
@@ -2349,6 +2350,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
int get_blocks_flags;
int err, dioread_nolock;
+ /* Make sure transaction has enough credits for this extent */
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (err < 0)
+ return err;
+
trace_ext4_da_write_pages_extent(inode, map);
/*
* Call ext4_map_blocks() to allocate any delayed allocation blocks, or
@@ -2358,21 +2364,18 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* previously reserved. However we must not fail because we're in
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
- * possible.
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
- * the blocks in question are delalloc blocks. This indicates
- * that the blocks and quotas has already been checked when
- * the data was copied into the page cache.
+ * possible. In addition, do not cache any unrelated extents, as it
+ * only holds the folio lock but does not hold the i_rwsem or
+ * invalidate_lock, which could corrupt the extent status tree.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
- EXT4_GET_BLOCKS_IO_SUBMIT;
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE;
+
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (map->m_flags & BIT(BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
@@ -2383,7 +2386,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(mpd->io_submit.io_end);
}
BUG_ON(map->m_len == 0);
@@ -2391,6 +2394,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
}
/*
+ * This is used to submit mapped buffers in a single folio that is not fully
+ * mapped for various reasons, such as insufficient space or journal credits.
+ */
+static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct folio *folio;
+ loff_t pos;
+ int ret;
+
+ folio = filemap_get_folio(inode->i_mapping,
+ mpd->start_pos >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ /*
+ * The mapped position should be within the current processing folio
+ * but must not be the folio start position.
+ */
+ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits;
+ if (WARN_ON_ONCE((folio_pos(folio) == pos) ||
+ !folio_contains(folio, pos >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = mpage_submit_folio(mpd, folio);
+ if (ret)
+ goto out;
+ /*
+ * Update start_pos to prevent this folio from being released in
+ * mpage_release_unused_pages(), it will be reset to the aligned folio
+ * pos when this folio is written again in the next round. Additionally,
+ * do not update wbc->nr_to_write here, as it will be updated once the
+ * entire folio has finished processing.
+ */
+ mpd->start_pos = pos;
+out:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
+/*
* mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
* mpd->len and submit pages underlying it for IO
*
@@ -2425,24 +2469,31 @@ static int mpage_map_and_submit_extent(handle_t *handle,
io_end_vec = ext4_alloc_io_end_vec(io_end);
if (IS_ERR(io_end_vec))
return PTR_ERR(io_end_vec);
- io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(EXT4_SB(sb)) ||
- EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_emergency_state(sb))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
* In the case of ENOSPC, if ext4_count_free_blocks()
* is non-zero, a commit should free up blocks.
*/
- if ((err == -ENOMEM) ||
+ if ((err == -ENOMEM) || (err == -EAGAIN) ||
(err == -ENOSPC && ext4_count_free_clusters(sb))) {
- if (progress)
+ /*
+ * We may have already allocated extents for
+ * some bhs inside the folio, issue the
+ * corresponding data to prevent stale data.
+ */
+ if (progress) {
+ if (mpage_submit_partial_folio(mpd))
+ goto invalidate_dirty_pages;
goto update_disksize;
+ }
return err;
}
ext4_msg(sb, KERN_CRIT,
@@ -2476,7 +2527,7 @@ update_disksize:
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
*/
- disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
+ disksize = mpd->start_pos;
if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2500,33 +2551,59 @@ update_disksize:
return err;
}
-/*
- * Calculate the total number of credits to reserve for one writepages
- * iteration. This is called from ext4_writepages(). We map an extent of
- * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
- * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
- * bpp - 1 blocks in bpp different extents.
- */
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
+ size_t len)
+{
+ struct buffer_head *page_bufs = folio_buffers(folio);
+ struct inode *inode = folio->mapping->host;
+ int ret, err;
+
+ ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+ NULL, do_journal_get_write_access);
+ err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+ NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
+ err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
+ if (ret == 0)
+ ret = err;
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+
+ return ret;
+}
+
+static int mpage_journal_page_buffers(handle_t *handle,
+ struct mpage_da_data *mpd,
+ struct folio *folio)
{
- int bpp = ext4_journal_blocks_per_page(inode);
+ struct inode *inode = mpd->inode;
+ loff_t size = i_size_read(inode);
+ size_t len = folio_size(folio);
+
+ folio_clear_checked(folio);
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
- return ext4_meta_trans_blocks(inode,
- MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+ if (folio_pos(folio) + len > size &&
+ !ext4_verity_in_progress(inode))
+ len = size & (len - 1);
+
+ return ext4_journal_folio_buffers(handle, folio, len);
}
/*
* mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
- * and underlying extent to map
+ * needing mapping, submit mapped pages
*
* @mpd - where to look for pages
*
* Walk dirty pages in the mapping. If they are fully mapped, submit them for
- * IO immediately. When we find a page which isn't mapped we start accumulating
- * extent of buffers underlying these pages that needs mapping (formed by
- * either delayed or unwritten buffers). We also lock the pages containing
- * these buffers. The extent found is returned in @mpd structure (starting at
- * mpd->lblk with length mpd->len blocks).
+ * IO immediately. If we cannot map blocks, we submit just already mapped
+ * buffers in the page for IO and keep page dirty. When we can map blocks and
+ * we find a page which isn't mapped we start accumulating extent of buffers
+ * underlying these pages that needs mapping (formed by either delayed or
+ * unwritten buffers). We also lock the pages containing these buffers. The
+ * extent found is returned in @mpd structure (starting at mpd->lblk with
+ * length mpd->len blocks).
*
* Note that this function can attach bios to one io_end structure which are
* neither logically nor physically contiguous. Although it may seem as an
@@ -2536,33 +2613,36 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
struct address_space *mapping = mpd->inode->i_mapping;
- struct pagevec pvec;
- unsigned int nr_pages;
- long left = mpd->wbc->nr_to_write;
- pgoff_t index = mpd->first_page;
- pgoff_t end = mpd->last_page;
+ struct folio_batch fbatch;
+ unsigned int nr_folios;
+ pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
+ pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
xa_mark_t tag;
int i, err = 0;
- int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
struct buffer_head *head;
+ handle_t *handle = NULL;
+ int bpp = ext4_journal_blocks_per_folio(mpd->inode);
- if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(mpd->wbc);
- pagevec_init(&pvec);
mpd->map.m_len = 0;
- mpd->next_page = index;
+ mpd->next_pos = mpd->start_pos;
+ if (ext4_should_journal_data(mpd->inode)) {
+ handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
+ bpp);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+ folio_batch_init(&fbatch);
while (index <= end) {
- nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
- tag);
- if (nr_pages == 0)
+ nr_folios = filemap_get_folios_tag(mapping, &index, end,
+ tag, &fbatch);
+ if (nr_folios == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
/*
* Accumulated enough dirty pages? This doesn't apply
@@ -2572,14 +2652,24 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
* newly appeared dirty pages, but have not synced all
* of the old dirty pages.
*/
- if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+ mpd->wbc->nr_to_write <=
+ EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len))
goto out;
/* If we can't merge this page, we are done. */
- if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ if (mpd->map.m_len > 0 &&
+ mpd->next_pos != folio_pos(folio))
goto out;
- lock_page(page);
+ if (handle) {
+ err = ext4_journal_ensure_credits(handle, bpp,
+ 0);
+ if (err < 0)
+ goto out;
+ }
+
+ folio_lock(folio);
/*
* If the page is no longer dirty, or its mapping no
* longer corresponds to inode we are writing (which
@@ -2587,59 +2677,99 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
* page is already under writeback and we are not doing
* a data integrity writeback, skip the page
*/
- if (!PageDirty(page) ||
- (PageWriteback(page) &&
+ if (!folio_test_dirty(folio) ||
+ (folio_test_writeback(folio) &&
(mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
- unlikely(page->mapping != mapping)) {
- unlock_page(page);
+ unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
continue;
}
- wait_on_page_writeback(page);
- BUG_ON(PageWriteback(page));
+ folio_wait_writeback(folio);
+ BUG_ON(folio_test_writeback(folio));
+
+ /*
+ * Should never happen but for buggy code in
+ * other subsystems that call
+ * set_page_dirty() without properly warning
+ * the file system first. See [1] for more
+ * information.
+ *
+ * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+ */
+ if (!folio_buffers(folio)) {
+ ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
+ continue;
+ }
if (mpd->map.m_len == 0)
- mpd->first_page = page->index;
- mpd->next_page = page->index + 1;
- /* Add all dirty buffers to mpd */
- lblk = ((ext4_lblk_t)page->index) <<
- (PAGE_SHIFT - blkbits);
- head = page_buffers(page);
- err = mpage_process_page_bufs(mpd, head, head, lblk);
- if (err <= 0)
- goto out;
- err = 0;
- left--;
+ mpd->start_pos = folio_pos(folio);
+ mpd->next_pos = folio_next_pos(folio);
+ /*
+ * Writeout when we cannot modify metadata is simple.
+ * Just submit the page. For data=journal mode we
+ * first handle writeout of the page for checkpoint and
+ * only after that handle delayed page dirtying. This
+ * makes sure current data is checkpointed to the final
+ * location before possibly journalling it again which
+ * is desirable when the page is frequently dirtied
+ * through a pin.
+ */
+ if (!mpd->can_map) {
+ err = mpage_submit_folio(mpd, folio);
+ if (err < 0)
+ goto out;
+ /* Pending dirtying of journalled data? */
+ if (folio_test_checked(folio)) {
+ err = mpage_journal_page_buffers(handle,
+ mpd, folio);
+ if (err < 0)
+ goto out;
+ mpd->journalled_more_data = 1;
+ }
+ mpage_folio_done(mpd, folio);
+ } else {
+ /* Add all dirty buffers to mpd */
+ lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index);
+ head = folio_buffers(folio);
+ err = mpage_process_page_bufs(mpd, head, head,
+ lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ }
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
mpd->scanned_until_end = 1;
+ if (handle)
+ ext4_journal_stop(handle);
return 0;
out:
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
+ if (handle)
+ ext4_journal_stop(handle);
return err;
}
-static int ext4_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_do_writepages(struct mpage_da_data *mpd)
{
+ struct writeback_control *wbc = mpd->wbc;
pgoff_t writeback_index = 0;
long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
int cycled = 1;
handle_t *handle = NULL;
- struct mpage_da_data mpd;
- struct inode *inode = mapping->host;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
struct blk_plug plug;
bool give_up_on_write = false;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
-
- percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
/*
@@ -2650,26 +2780,19 @@ static int ext4_writepages(struct address_space *mapping,
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
goto out_writepages;
- if (ext4_should_journal_data(inode)) {
- ret = generic_writepages(mapping, wbc);
- goto out_writepages;
- }
-
/*
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
- * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
+ * fs shutdown state instead of sb->s_flag's SB_RDONLY because
* the latter could be true if the filesystem is mounted
* read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
- sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
- ret = -EROFS;
+ ret = ext4_emergency_state(mapping->host->i_sb);
+ if (unlikely(ret))
goto out_writepages;
- }
/*
* If we have inline data and arrive here, it means that
@@ -2689,13 +2812,33 @@ static int ext4_writepages(struct address_space *mapping,
ext4_journal_stop(handle);
}
+ /*
+ * data=journal mode does not do delalloc so we just need to writeout /
+ * journal already mapped buffers. On the other hand we need to commit
+ * transaction to make data stable. We expect all the data to be
+ * already in the journal (the only exception are DMA pinned pages
+ * dirtied behind our back) so we commit transaction here and run the
+ * writeback loop to checkpoint them. The checkpointing is not actually
+ * necessary to make data persistent *but* quite a few places (extent
+ * shifting operations, fsverity, ...) depend on being able to drop
+ * pagecache pages after calling filemap_write_and_wait() and for that
+ * checkpointing needs to happen.
+ */
+ if (ext4_should_journal_data(inode)) {
+ mpd->can_map = 0;
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ext4_fc_commit(sbi->s_journal,
+ EXT4_I(inode)->i_datasync_tid);
+ }
+ mpd->journalled_more_data = 0;
+
if (ext4_should_dioread_nolock(inode)) {
+ int bpf = ext4_journal_blocks_per_folio(inode);
/*
* We may need to convert up to one extent per block in
- * the page and we may dirty the inode.
+ * the folio and we may dirty the inode.
*/
- rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
- PAGE_SIZE >> inode->i_blkbits);
+ rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf);
}
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2705,19 +2848,18 @@ static int ext4_writepages(struct address_space *mapping,
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
- mpd.first_page = writeback_index;
- mpd.last_page = -1;
+ mpd->start_pos = writeback_index << PAGE_SHIFT;
+ mpd->end_pos = LLONG_MAX;
} else {
- mpd.first_page = wbc->range_start >> PAGE_SHIFT;
- mpd.last_page = wbc->range_end >> PAGE_SHIFT;
+ mpd->start_pos = wbc->range_start;
+ mpd->end_pos = wbc->range_end;
}
- mpd.inode = inode;
- mpd.wbc = wbc;
- ext4_io_submit_init(&mpd.io_submit, wbc);
+ ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
+ mpd->end_pos >> PAGE_SHIFT);
blk_start_plug(&plug);
/*
@@ -2726,31 +2868,32 @@ retry:
* in the block layer on device congestion while having transaction
* started.
*/
- mpd.do_map = 0;
- mpd.scanned_until_end = 0;
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->do_map = 0;
+ mpd->scanned_until_end = 0;
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
goto unplug;
}
- ret = mpage_prepare_extent_to_map(&mpd);
+ ret = mpage_prepare_extent_to_map(mpd);
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
+ mpage_release_unused_pages(mpd, false);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
- ext4_put_io_end_defer(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_io_submit(&mpd->io_submit);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
if (ret < 0)
goto unplug;
- while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
+ while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
/* For each extent of pages we use new io_end */
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
break;
}
+ WARN_ON_ONCE(!mpd->can_map);
/*
* We have two constraints: We find one extent to map and we
* must always write out whole page (makes a difference when
@@ -2759,8 +2902,14 @@ retry:
* not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
- needed_blocks = ext4_da_writepages_trans_blocks(inode);
-
+ /*
+ * Calculate the number of credits needed to reserve for one
+ * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will
+ * attempt to extend the transaction or start a new iteration
+ * if the reserved credits are insufficient.
+ */
+ needed_blocks = ext4_chunk_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN);
/* start a new transaction */
handle = ext4_journal_start_with_reserve(inode,
EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
@@ -2770,23 +2919,24 @@ retry:
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
break;
}
- mpd.do_map = 1;
+ mpd->do_map = 1;
- trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
- ret = mpage_prepare_extent_to_map(&mpd);
- if (!ret && mpd.map.m_len)
- ret = mpage_map_and_submit_extent(handle, &mpd,
+ trace_ext4_da_write_folios_start(inode, mpd->start_pos,
+ mpd->next_pos, wbc);
+ ret = mpage_prepare_extent_to_map(mpd);
+ if (!ret && mpd->map.m_len)
+ ret = mpage_map_and_submit_extent(handle, mpd,
&give_up_on_write);
/*
* Caution: If the handle is synchronous,
* ext4_journal_stop() can wait for transaction commit
* to finish which may depend on writeback of pages to
* complete or on page lock to be released. In that
- * case, we have to wait until after after we have
+ * case, we have to wait until after we have
* submitted all the IO, released page locks we hold,
* and dropped io_end reference (for extent conversion
* to be able to complete) before stopping the handle.
@@ -2794,12 +2944,12 @@ retry:
if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle);
handle = NULL;
- mpd.do_map = 0;
+ mpd->do_map = 0;
}
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, give_up_on_write);
+ mpage_release_unused_pages(mpd, give_up_on_write);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
+ ext4_io_submit(&mpd->io_submit);
/*
* Drop our io_end reference we got from init. We have
@@ -2809,11 +2959,13 @@ retry:
* up doing unwritten extent conversion.
*/
if (handle) {
- ext4_put_io_end_defer(mpd.io_submit.io_end);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
ext4_journal_stop(handle);
} else
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
+ trace_ext4_da_write_folios_end(inode, mpd->start_pos,
+ mpd->next_pos, wbc, ret);
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2825,6 +2977,8 @@ retry:
ret = 0;
continue;
}
+ if (ret == -EAGAIN)
+ ret = 0;
/* Fatal error - ENOMEM, EIO... */
if (ret)
break;
@@ -2833,8 +2987,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- mpd.last_page = writeback_index - 1;
- mpd.first_page = 0;
+ mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
+ mpd->start_pos = 0;
goto retry;
}
@@ -2844,33 +2998,80 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = mpd.first_page;
+ mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct super_block *sb = mapping->host->i_sb;
+ struct mpage_da_data mpd = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .can_map = 1,
+ };
+ int ret;
+ int alloc_ctx;
+
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
+
+ alloc_ctx = ext4_writepages_down_read(sb);
+ ret = ext4_do_writepages(&mpd);
+ /*
+ * For data=journal writeback we could have come across pages marked
+ * for delayed dirtying (PageChecked) which were just added to the
+ * running transaction. Try once more to get them to stable storage.
+ */
+ if (!ret && mpd.journalled_more_data)
+ ret = ext4_do_writepages(&mpd);
+ ext4_writepages_up_read(sb, alloc_ctx);
+
+ return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+ struct mpage_da_data mpd = {
+ .inode = jinode->i_vfs_inode,
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+ return ext4_do_writepages(&mpd);
+}
+
static int ext4_dax_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
int ret;
long nr_to_write = wbc->nr_to_write;
struct inode *inode = mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+ int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
- percpu_down_read(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
- ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
+ ret = dax_writeback_mapping_range(mapping,
+ EXT4_SB(inode->i_sb)->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_read(inode->i_sb, alloc_ctx);
return ret;
}
@@ -2908,117 +3109,67 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
-/* We always reserve for an inode update; the superblock could be there too */
-static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
-{
- if (likely(ext4_has_feature_large_file(inode->i_sb)))
- return 1;
-
- if (pos + len <= 0x7fffffffULL)
- return 1;
-
- /* We might need to update the superblock to set LARGE_FILE */
- return 2;
-}
-
-static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+static int ext4_da_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret, retries = 0;
- struct page *page;
+ struct folio *folio;
pgoff_t index;
struct inode *inode = mapping->host;
- handle_t *handle;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
- ext4_verity_in_progress(inode)) {
+ if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
- return ext4_write_begin(file, mapping, pos,
- len, flags, pagep, fsdata);
+ return ext4_write_begin(iocb, mapping, pos,
+ len, foliop, fsdata);
}
*fsdata = (void *)0;
- trace_ext4_da_write_begin(inode, pos, len, flags);
+ trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode,
- pos, len, flags,
- pagep, fsdata);
+ ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, fsdata, true);
if (ret < 0)
return ret;
if (ret == 1)
return 0;
}
- /*
- * grab_cache_page_write_begin() can take a long time if the
- * system is thrashing due to memory pressure, or if the page
- * is being written back. So grab it first before we start
- * the transaction handle. This also allows us to allocate
- * the page (if needed) without using GFP_NOFS.
- */
-retry_grab:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
- unlock_page(page);
-
- /*
- * With delayed allocation, we don't log the i_disksize update
- * if there is delayed block allocation. But we still need
- * to journalling the i_disksize update if writes to the end
- * of file which has an already mapped buffer.
- */
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_da_write_credits(inode, pos, len));
- if (IS_ERR(handle)) {
- put_page(page);
- return PTR_ERR(handle);
- }
+retry:
+ folio = write_begin_get_folio(iocb, mapping, index, len);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- lock_page(page);
- if (page->mapping != mapping) {
- /* The page got truncated from under us */
- unlock_page(page);
- put_page(page);
- ext4_journal_stop(handle);
- goto retry_grab;
- }
- /* In case writeback began while the page was unlocked */
- wait_for_stable_page(page);
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
-#ifdef CONFIG_FS_ENCRYPTION
- ret = ext4_block_write_begin(page, pos, len,
+ ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
-#else
- ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
-#endif
if (ret < 0) {
- unlock_page(page);
- ext4_journal_stop(handle);
+ folio_unlock(folio);
+ folio_put(folio);
/*
- * block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold inode lock.
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
-
- put_page(page);
+ goto retry;
return ret;
}
- *pagep = page;
+ *foliop = folio;
return ret;
}
@@ -3026,15 +3177,15 @@ retry_journal:
* Check if we should update i_disksize
* when write to the end of file but not require block allocation
*/
-static int ext4_da_should_update_i_disksize(struct page *page,
+static int ext4_da_should_update_i_disksize(struct folio *folio,
unsigned long offset)
{
struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned int idx;
int i;
- bh = page_buffers(page);
+ bh = folio_buffers(folio);
idx = offset >> inode->i_blkbits;
for (i = 0; i < idx; i++)
@@ -3045,61 +3196,100 @@ static int ext4_da_should_update_i_disksize(struct page *page,
return 1;
}
-static int ext4_da_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static int ext4_da_do_write_end(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio)
{
struct inode *inode = mapping->host;
- int ret = 0, ret2;
- handle_t *handle = ext4_journal_current_handle();
- loff_t new_i_size;
- unsigned long start, end;
- int write_mode = (int)(unsigned long)fsdata;
-
- if (write_mode == FALL_BACK_TO_NONDELALLOC)
- return ext4_write_end(file, mapping, pos,
- len, copied, page, fsdata);
-
- trace_ext4_da_write_end(inode, pos, len, copied);
- start = pos & (PAGE_SIZE - 1);
- end = start + copied - 1;
+ loff_t old_size = inode->i_size;
+ bool disksize_changed = false;
+ loff_t new_i_size, zero_len = 0;
+ handle_t *handle;
+ if (unlikely(!folio_buffers(folio))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
+ }
/*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
+ * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
+ * flag, which all that's needed to trigger page writeback.
*/
+ copied = block_write_end(pos, len, copied, folio);
new_i_size = pos + copied;
- if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_has_inline_data(inode) ||
- ext4_da_should_update_i_disksize(page, end)) {
+
+ /*
+ * It's important to update i_size while still holding folio lock,
+ * because folio writeout could otherwise come in and zero beyond
+ * i_size.
+ *
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range up to i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize up to i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block which ext4_da_should_update_i_disksize()
+ * checked, we need to update i_disksize here as certain
+ * ext4_writepages() paths not allocating blocks and update i_disksize.
+ */
+ if (new_i_size > inode->i_size) {
+ unsigned long end;
+
+ i_size_write(inode, new_i_size);
+ end = offset_in_folio(folio, new_i_size - 1);
+ if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ret = ext4_mark_inode_dirty(handle, inode);
+ disksize_changed = true;
}
}
+ folio_unlock(folio);
+ folio_put(folio);
+
+ if (pos > old_size) {
+ pagecache_isize_extended(inode, old_size, pos);
+ zero_len = pos - old_size;
+ }
+
+ if (!disksize_changed && !zero_len)
+ return copied;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ if (zero_len)
+ ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return copied;
+}
+
+static int ext4_da_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int write_mode = (int)(unsigned long)fsdata;
+
+ if (write_mode == FALL_BACK_TO_NONDELALLOC)
+ return ext4_write_end(iocb, mapping, pos,
+ len, copied, folio, fsdata);
+
+ trace_ext4_da_write_end(inode, pos, len, copied);
+
if (write_mode != CONVERT_INLINE_DATA &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
ext4_has_inline_data(inode))
- ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
- page);
- else
- ret2 = generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
- ret2 = ext4_journal_stop(handle);
- if (unlikely(ret2 && !ret))
- ret = ret2;
+ if (unlikely(copied < len) && !folio_test_uptodate(folio))
+ copied = 0;
- return ret ? ret : copied;
+ return ext4_da_do_write_end(mapping, pos, len, copied, folio);
}
/*
@@ -3163,70 +3353,45 @@ int ext4_alloc_da_blocks(struct inode *inode)
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- journal_t *journal;
- int err;
+ sector_t ret = 0;
+ inode_lock_shared(inode);
/*
* We can get here for an inline file via the FIBMAP ioctl
*/
if (ext4_has_inline_data(inode))
- return 0;
+ goto out;
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
- test_opt(inode->i_sb, DELALLOC)) {
+ (test_opt(inode->i_sb, DELALLOC) ||
+ ext4_should_journal_data(inode))) {
/*
- * With delalloc we want to sync the file
- * so that we can make sure we allocate
- * blocks for file
+ * With delalloc or journalled data we want to sync the file so
+ * that we can make sure we allocate blocks for file and data
+ * is in place for the user to see it
*/
filemap_write_and_wait(mapping);
}
- if (EXT4_JOURNAL(inode) &&
- ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
- /*
- * This is a REALLY heavyweight approach, but the use of
- * bmap on dirty files is expected to be extremely rare:
- * only if we run lilo or swapon on a freshly made file
- * do we expect this to happen.
- *
- * (bmap requires CAP_SYS_RAWIO so this does not
- * represent an unprivileged user DOS attack --- we'd be
- * in trouble if mortal users could trigger this path at
- * will.)
- *
- * NB. EXT4_STATE_JDATA is not set on files other than
- * regular files. If somebody wants to bmap a directory
- * or symlink and gets confused because the buffer
- * hasn't yet been flushed to disk, they deserve
- * everything they get.
- */
-
- ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
- journal = EXT4_JOURNAL(inode);
- jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
- jbd2_journal_unlock_updates(journal);
-
- if (err)
- return 0;
- }
+ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
- return iomap_bmap(mapping, block, &ext4_iomap_ops);
+out:
+ inode_unlock_shared(inode);
+ return ret;
}
-static int ext4_readpage(struct file *file, struct page *page)
+static int ext4_read_folio(struct file *file, struct folio *folio)
{
int ret = -EAGAIN;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
- trace_ext4_readpage(page);
+ trace_ext4_read_folio(inode, folio);
if (ext4_has_inline_data(inode))
- ret = ext4_readpage_inline(inode, page);
+ ret = ext4_readpage_inline(inode, folio);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(inode, NULL, page);
+ return ext4_mpage_readpages(inode, NULL, folio);
return ret;
}
@@ -3242,73 +3407,79 @@ static void ext4_readahead(struct readahead_control *rac)
ext4_mpage_readpages(inode, rac, NULL);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ext4_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- trace_ext4_invalidatepage(page, offset, length);
+ trace_ext4_invalidate_folio(folio, offset, length);
/* No journalling happens on data buffers when this function is used */
- WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
- block_invalidatepage(page, offset, length);
+ block_invalidate_folio(folio, offset, length);
}
-static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static int __ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset, length);
+ trace_ext4_journalled_invalidate_folio(folio, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0 && length == PAGE_SIZE)
- ClearPageChecked(page);
+ if (offset == 0 && length == folio_size(folio))
+ folio_clear_checked(folio);
- return jbd2_journal_invalidatepage(journal, page, offset, length);
+ return jbd2_journal_invalidate_folio(journal, folio, offset, length);
}
/* Wrapper for aops... */
-static void ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset,
+ size_t length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
+ WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
}
-static int ext4_releasepage(struct page *page, gfp_t wait)
+static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ struct inode *inode = folio->mapping->host;
+ journal_t *journal = EXT4_JOURNAL(inode);
- trace_ext4_releasepage(page);
+ trace_ext4_release_folio(inode, folio);
/* Page has dirty journalled data -> cannot release */
- if (PageChecked(page))
- return 0;
+ if (folio_test_checked(folio))
+ return false;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return jbd2_journal_try_to_free_buffers(journal, folio);
else
- return try_to_free_buffers(page);
+ return try_to_free_buffers(folio);
}
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (journal)
- return !jbd2_transaction_committed(journal,
- EXT4_I(inode)->i_datasync_tid);
+ if (journal) {
+ if (jbd2_transaction_committed(journal,
+ EXT4_I(inode)->i_datasync_tid))
+ return false;
+ if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+ return !list_empty(&EXT4_I(inode)->i_fc_list);
+ return true;
+ }
+
/* Any metadata buffers to write? */
- if (!list_empty(&inode->i_mapping->private_list))
+ if (!list_empty(&inode->i_mapping->i_private_list))
return true;
- return inode->i_state & I_DIRTY_DATASYNC;
+ return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
}
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
- loff_t length)
+ loff_t length, unsigned int flags)
{
u8 blkbits = inode->i_blkbits;
@@ -3325,10 +3496,16 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- iomap->offset = (u64) map->m_lblk << blkbits;
- iomap->length = (u64) map->m_len << blkbits;
+ /* HW-offload atomics are always used */
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ else
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
+ iomap->length = EXT4_LBLK_TO_B(inode, map->m_len);
if ((map->m_flags & EXT4_MAP_MAPPED) &&
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3346,21 +3523,164 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_UNWRITTEN) {
iomap->type = IOMAP_UNWRITTEN;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
} else if (map->m_flags & EXT4_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+ } else if (map->m_flags & EXT4_MAP_DELAYED) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
}
}
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ unsigned int mapped_len = 0, m_flags = 0;
+ ext4_fsblk_t next_pblk = 0;
+ bool check_next_pblk = false;
+ int ret = 0;
+
+ WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+ /*
+ * This is a slow path in case of mixed mapping. We use
+ * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single
+ * contiguous mapped mapping. This will ensure any unwritten or hole
+ * regions within the requested range is zeroed out and we return
+ * a single contiguous mapped extent.
+ */
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+ do {
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out_err;
+ /*
+ * This should never happen, but let's return an error code to
+ * avoid an infinite loop in here.
+ */
+ if (ret == 0) {
+ ret = -EFSCORRUPTED;
+ ext4_warning_inode(inode,
+ "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+ m_flags, ret);
+ goto out_err;
+ }
+ /*
+ * With bigalloc we should never get ENOSPC nor discontiguous
+ * physical extents.
+ */
+ if ((check_next_pblk && next_pblk != map->m_pblk) ||
+ ret == -ENOSPC) {
+ ext4_warning_inode(inode,
+ "Non-contiguous allocation detected: expected %llu, got %llu, "
+ "or ext4_map_blocks() returned out of space ret: %d",
+ next_pblk, map->m_pblk, ret);
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+ next_pblk = map->m_pblk + map->m_len;
+ check_next_pblk = true;
+
+ mapped_len += map->m_len;
+ map->m_lblk += map->m_len;
+ map->m_len = m_len - mapped_len;
+ } while (mapped_len < m_len);
+
+ /*
+ * We might have done some work in above loop, so we need to query the
+ * start of the physical extent, based on the origin m_lblk and m_len.
+ * Let's also ensure we were able to allocate the required range for
+ * mixed mapping case.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+ if (ret != m_len) {
+ ext4_warning_inode(inode,
+ "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+ m_lblk, m_len, ret);
+ ret = -EINVAL;
+ }
+ return ret;
+
+out_err:
+ /* reset map before returning an error */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+ return ret;
+}
+
+/*
+ * ext4_map_blocks_atomic: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying
+ * physical extent for the requested range does not have a single contiguous
+ * mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk + m_len). Note that this is only possible because we know
+ * this can be called only with bigalloc enabled filesystem where the underlying
+ * cluster is already allocated. This avoids allocating discontiguous extents
+ * in the slow path due to multiple calls to ext4_map_blocks().
+ * The slow path is mostly non-performance critical path, so it should be ok to
+ * loop using ext4_map_blocks() with appropriate flags to allocate & zero the
+ * underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int m_flags,
+ bool *force_commit)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ int ret = 0;
+
+ WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 || ret == m_len)
+ goto out;
+ /*
+ * This is a mixed mapping case where we were not able to allocate
+ * a single contiguous extent. In that case let's reset requested
+ * mapping and call the slow path.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ /*
+ * slow path means we have mixed mapping, that means we will need
+ * to force txn commit.
+ */
+ *force_commit = true;
+ return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+ return ret;
+}
+
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
unsigned int flags)
{
handle_t *handle;
- u8 blkbits = inode->i_blkbits;
int ret, dio_credits, m_flags = 0, retries = 0;
+ bool force_commit = false;
/*
* Trim the mapping request to the maximum value that we can map at
@@ -3368,7 +3688,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
*/
if (map->m_len > DIO_MAX_BLOCKS)
map->m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+ /*
+ * journal credits estimation for atomic writes. We call
+ * ext4_map_blocks(), to find if there could be a mixed mapping. If yes,
+ * then let's assume the no. of pextents required can be m_len i.e.
+ * every alternate block can be unwritten and hole.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ unsigned int orig_mlen = map->m_len;
+
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_len < orig_mlen) {
+ map->m_len = orig_mlen;
+ dio_credits = ext4_meta_trans_blocks(inode, orig_mlen,
+ map->m_len);
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ map->m_len);
+ }
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+ }
retry:
/*
@@ -3385,8 +3728,8 @@ retry:
* DAX and direct I/O are the only two operations that are currently
* supported with IOMAP_WRITE.
*/
- WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
- if (IS_DAX(inode))
+ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));
+ if (flags & IOMAP_DAX)
m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
/*
* We use i_size instead of i_disksize here because delalloc writeback
@@ -3394,12 +3737,16 @@ retry:
* i_disksize out to i_size. This could be beyond where direct I/O is
* happening and thus expose allocated blocks to direct I/O reads.
*/
- else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+ else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode))
m_flags = EXT4_GET_BLOCKS_CREATE;
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (flags & IOMAP_ATOMIC)
+ ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
+ &force_commit);
+ else
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
* We cannot fill holes in indirect tree based inodes as that could
@@ -3413,6 +3760,22 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+ /*
+ * Force commit the current transaction if the allocation spans a mixed
+ * mapping range. This ensures any pending metadata updates (like
+ * unwritten to written extents conversion) in this range are in
+ * consistent state with the file data blocks, before performing the
+ * actual write I/O. If the commit fails, the whole I/O must be aborted
+ * to prevent any possible torn writes.
+ */
+ if (ret > 0 && force_commit) {
+ int ret2;
+
+ ret2 = ext4_force_commit(inode->i_sb);
+ if (ret2)
+ return ret2;
+ }
+
return ret;
}
@@ -3423,6 +3786,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
int ret;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
+ unsigned int orig_mlen;
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
return -EINVAL;
@@ -3436,16 +3800,58 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ orig_mlen = map.m_len;
- if (flags & IOMAP_WRITE)
+ if (flags & IOMAP_WRITE) {
+ /*
+ * We check here if the blocks are already allocated, then we
+ * don't need to start a journal txn and we can directly return
+ * the mapping information. This could boost performance
+ * especially in multi-threaded overwrite requests.
+ */
+ if (offset + length <= i_size_read(inode)) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * For atomic writes the entire requested length should
+ * be mapped.
+ */
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
+ (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+ goto out;
+ }
+ map.m_len = orig_mlen;
+ }
ret = ext4_iomap_alloc(inode, &map, flags);
- else
+ } else {
+ /*
+ * This can be called for overwrites path from
+ * ext4_iomap_overwrite_begin().
+ */
ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;
+out:
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
- ext4_set_iomap(inode, iomap, &map, offset, length);
+ /*
+ * Before returning to iomap, let's ensure the allocated mapping
+ * covers the entire requested length for atomic writes.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ if (map.m_len < (length >> blkbits)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
}
@@ -3462,65 +3868,23 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
*/
flags &= ~IOMAP_WRITE;
ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
- WARN_ON_ONCE(iomap->type != IOMAP_MAPPED);
+ WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
return ret;
}
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
- ssize_t written, unsigned flags, struct iomap *iomap)
-{
- /*
- * Check to see whether an error occurred while writing out the data to
- * the allocated blocks. If so, return the magic error code so that we
- * fallback to buffered I/O and attempt to complete the remainder of
- * the I/O. Any blocks that may have been allocated in preparation for
- * the direct I/O will be reused during buffered I/O.
- */
- if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
- return -ENOTBLK;
-
- return 0;
-}
-
const struct iomap_ops ext4_iomap_ops = {
.iomap_begin = ext4_iomap_begin,
- .iomap_end = ext4_iomap_end,
};
const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_begin = ext4_iomap_overwrite_begin,
- .iomap_end = ext4_iomap_end,
};
-static bool ext4_iomap_is_delalloc(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct extent_status es;
- ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map->m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end)
- return false;
-
- if (es.es_lblk > map->m_lblk) {
- map->m_len = es.es_lblk - map->m_lblk;
- return false;
- }
-
- offset = map->m_lblk - es.es_lblk;
- map->m_len = es.es_len - offset;
-
- return true;
-}
-
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
{
int ret;
- bool delalloc = false;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3561,13 +3925,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
- if (ret == 0)
- delalloc = ext4_iomap_is_delalloc(inode, &map);
-
set_iomap:
- ext4_set_iomap(inode, iomap, &map, offset, length);
- if (delalloc && iomap->type == IOMAP_HOLE)
- iomap->type = IOMAP_DELALLOC;
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
}
@@ -3577,87 +3936,96 @@ const struct iomap_ops ext4_iomap_report_ops = {
};
/*
- * Pages can be marked dirty completely asynchronously from ext4's journalling
- * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
- * much here because ->set_page_dirty is called under VFS locks. The page is
- * not necessarily locked.
- *
- * We cannot just dirty the page and leave attached buffers clean, because the
- * buffers' dirty state is "definitive". We cannot just set the buffers dirty
- * or jbddirty because all the journalling code will explode.
- *
- * So what we do is to mark the page "pending dirty" and next time writepage
- * is called, propagate that into the buffers appropriately.
+ * For data=journal mode, folio should be marked dirty only when it was
+ * writeably mapped. When that happens, it was already attached to the
+ * transaction and marked as jbddirty (we take care of this in
+ * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings
+ * so we should have nothing to do here, except for the case when someone
+ * had the page pinned and dirtied the page through this pin (e.g. by doing
+ * direct IO to it). In that case we'd need to attach buffers here to the
+ * transaction but we cannot due to lock ordering. We cannot just dirty the
+ * folio and leave attached buffers clean, because the buffers' dirty state is
+ * "definitive". We cannot just set the buffers dirty or jbddirty because all
+ * the journalling code will explode. So what we do is to mark the folio
+ * "pending dirty" and next time ext4_writepages() is called, attach buffers
+ * to the transaction appropriately.
*/
-static int ext4_journalled_set_page_dirty(struct page *page)
+static bool ext4_journalled_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ WARN_ON_ONCE(!folio_buffers(folio));
+ if (folio_maybe_dma_pinned(folio))
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
-static int ext4_set_page_dirty(struct page *page)
+static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
- WARN_ON_ONCE(!page_has_buffers(page));
- return __set_page_dirty_buffers(page);
+ WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
+ WARN_ON_ONCE(!folio_buffers(folio));
+ return block_dirty_folio(mapping, folio);
+}
+
+static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
+ struct file *file, sector_t *span)
+{
+ return iomap_swapfile_activate(sis, file, span,
+ &ext4_iomap_report_ops);
}
static const struct address_space_operations ext4_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .invalidate_folio = ext4_invalidate_folio,
+ .release_folio = ext4_release_folio,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_journalled_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
+ .dirty_folio = ext4_journalled_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_journalled_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = noop_direct_IO,
+ .invalidate_folio = ext4_journalled_invalidate_folio,
+ .release_folio = ext4_release_folio,
+ .migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_da_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .invalidate_folio = ext4_invalidate_folio,
+ .release_folio = ext4_release_folio,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
- .direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .dirty_folio = noop_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = noop_invalidatepage,
+ .swap_activate = ext4_iomap_swap_activate,
};
void ext4_set_aops(struct inode *inode)
@@ -3680,32 +4048,38 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
+/*
+ * Here we can't skip an unwritten buffer even though it usually reads zero
+ * because it might have data in pagecache (eg, if called from ext4_zero_range,
+ * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
+ * racing writeback can come later and flush the stale pagecache to disk.
+ */
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize, pos;
+ unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
- struct page *page;
+ struct folio *folio;
int err = 0;
- page = find_or_create_page(mapping, from >> PAGE_SHIFT,
- mapping_gfp_constraint(mapping, ~__GFP_FS));
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = EXT4_PG_TO_LBLK(inode, folio->index);
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
- bh = page_buffers(page);
+ offset = offset_in_folio(folio, from);
pos = blocksize;
while (offset >= pos) {
bh = bh->b_this_page;
@@ -3727,20 +4101,18 @@ static int __ext4_block_zero_page_range(handle_t *handle,
}
/* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ err = ext4_read_bh_lock(bh, 0, true);
+ if (err)
goto unlock;
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
- err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+ err = fscrypt_decrypt_pagecache_blocks(folio,
+ blocksize,
bh_offset(bh));
if (err) {
clear_buffer_uptodate(bh);
@@ -3750,15 +4122,16 @@ static int __ext4_block_zero_page_range(handle_t *handle,
}
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto unlock;
}
- zero_user(page, offset, length);
+ folio_zero_range(folio, offset, length);
BUFFER_TRACE(bh, "zeroed end of block");
if (ext4_should_journal_data(inode)) {
- err = ext4_handle_dirty_metadata(handle, inode, bh);
+ err = ext4_dirty_journalled_data(handle, bh);
} else {
err = 0;
mark_buffer_dirty(bh);
@@ -3768,8 +4141,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
}
unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return err;
}
@@ -3778,15 +4151,14 @@ unlock:
* starting from file offset 'from'. The range to be zero'd must
* be contained with in one block. If the specified range exceeds
* the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
+ * that corresponds to 'from'
*/
static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
struct inode *inode = mapping->host;
- unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize = inode->i_sb->s_blocksize;
- unsigned max = blocksize - (offset & (blocksize - 1));
+ unsigned int max = blocksize - (from & (blocksize - 1));
/*
* correct length if it does not fall between
@@ -3796,8 +4168,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
length = max;
if (IS_DAX(inode)) {
- return iomap_zero_range(inode, from, length, NULL,
- &ext4_iomap_ops);
+ return dax_zero_range(inode, from, length, NULL,
+ &ext4_iomap_ops);
}
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
@@ -3811,7 +4183,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
static int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
- unsigned offset = from & (PAGE_SIZE-1);
unsigned length;
unsigned blocksize;
struct inode *inode = mapping->host;
@@ -3820,8 +4191,8 @@ static int ext4_block_truncate_page(handle_t *handle,
if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
- blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
+ blocksize = i_blocksize(inode);
+ length = blocksize - (from & (blocksize - 1));
return ext4_block_zero_page_range(handle, mapping, from, length);
}
@@ -3879,7 +4250,11 @@ int ext4_can_truncate(struct inode *inode)
* We have to make sure i_disksize gets properly updated before we truncate
* page cache due to hole punching or zero range. Otherwise i_disksize update
* can get lost as it may have been postponed to submission of writeback but
- * that will never happen after we truncate page cache.
+ * that will never happen if we remove the folio containing i_size from the
+ * page cache. Also if we punch hole within i_size but above i_disksize,
+ * following ext4_page_mkwrite() may mistakenly allocate written blocks over
+ * the hole and thus introduce allocated blocks beyond i_disksize which is
+ * not allowed (e2fsck would complain in case of crash).
*/
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t len)
@@ -3890,9 +4265,11 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t size = i_size_read(inode);
WARN_ON(!inode_is_locked(inode));
- if (offset > size || offset + len < size)
+ if (offset > size)
return 0;
+ if (offset + len < size)
+ size = offset + len;
if (EXT4_I(inode)->i_disksize >= size)
return 0;
@@ -3906,34 +4283,81 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static inline void ext4_truncate_folio(struct inode *inode,
+ loff_t start, loff_t end)
{
- up_write(&ei->i_mmap_sem);
- schedule();
- down_write(&ei->i_mmap_sem);
+ unsigned long blocksize = i_blocksize(inode);
+ struct folio *folio;
+
+ /* Nothing to be done if no complete block needs to be truncated. */
+ if (round_up(start, blocksize) >= round_down(end, blocksize))
+ return;
+
+ folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
-int ext4_break_layouts(struct inode *inode)
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct page *page;
- int error;
+ unsigned long blocksize = i_blocksize(inode);
+ int ret;
- if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
- return -EINVAL;
+ /*
+ * For journalled data we need to write (and checkpoint) pages
+ * before discarding page cache to avoid inconsitent data on disk
+ * in case of crash before freeing or unwritten converting trans
+ * is committed.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start,
+ end - 1);
+ if (ret)
+ return ret;
+ goto truncate_pagecache;
+ }
- do {
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
- return 0;
+ /*
+ * If the block size is less than the page size, the file's mapped
+ * blocks within one page could be freed or converted to unwritten.
+ * So it's necessary to remove writable userspace mappings, and then
+ * ext4_page_mkwrite() can be called during subsequent write access
+ * to these partial folios.
+ */
+ if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
+ blocksize < PAGE_SIZE && start < inode->i_size) {
+ loff_t page_boundary = round_up(start, PAGE_SIZE);
- error = ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1,
- TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(ei));
- } while (error == 0);
+ ext4_truncate_folio(inode, start, min(page_boundary, end));
+ if (end > page_boundary)
+ ext4_truncate_folio(inode,
+ round_down(end, PAGE_SIZE), end);
+ }
- return error;
+truncate_pagecache:
+ truncate_pagecache_range(inode, start, end - 1);
+ return 0;
+}
+
+static void ext4_wait_dax_page(struct inode *inode)
+{
+ filemap_invalidate_unlock(inode->i_mapping);
+ schedule();
+ filemap_invalidate_lock(inode->i_mapping);
+}
+
+int ext4_break_layouts(struct inode *inode)
+{
+ if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
+ return -EINVAL;
+
+ return dax_break_layout_inode(inode, ext4_wait_dax_page);
}
/*
@@ -3947,148 +4371,116 @@ int ext4_break_layouts(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ ext4_lblk_t start_lblk, end_lblk;
+ loff_t max_end = sb->s_maxbytes;
+ loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0, ret2 = 0;
+ int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
-
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
- if (ext4_has_inline_data(inode)) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
- ret = ext4_convert_inline_data(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
- if (ret)
- return ret;
- }
+ WARN_ON_ONCE(!inode_is_locked(inode));
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
+ * For indirect-block based inodes, make sure that the hole within
+ * one block before last range.
*/
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + length - 1);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
/* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
+ if (offset >= inode->i_size || offset >= max_end)
+ return 0;
/*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
+ * If the hole extends beyond i_size, set the hole to end after
+ * the block that contains i_size to save pointless tail block zeroing.
*/
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
- offset;
- }
+ if (end >= inode->i_size)
+ end = round_up(inode->i_size, sb->s_blocksize);
+ if (end > max_end)
+ end = max_end;
+ length = end - offset;
- if (offset & (sb->s_blocksize - 1) ||
- (offset + length) & (sb->s_blocksize - 1)) {
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of
- * partial block
- */
+ /*
+ * Attach jinode to inode for jbd2 if we do any zeroing of partial
+ * block.
+ */
+ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out_mutex;
-
+ return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- inode_dio_wait(inode);
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- down_write(&EXT4_I(inode)->i_mmap_sem);
-
- ret = ext4_break_layouts(inode);
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
- goto out_dio;
-
- first_block_offset = round_up(offset, sb->s_blocksize);
- last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ return ret;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset) {
- ret = ext4_update_disksize_before_punch(inode, offset, length);
- if (ret)
- goto out_dio;
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
- }
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 2);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_dio;
+ return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset,
- length);
+ ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
- goto out_stop;
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+ goto out_handle;
/* If there are blocks to remove, do it */
- if (stop_block > first_block) {
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> inode->i_blkbits;
+
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t hole_len = end_lblk - start_lblk;
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ret = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk,
+ end_lblk - 1);
else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ ret = ext4_ind_remove_space(handle, inode, start_lblk,
+ end_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_handle;
+ }
+ ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
+ EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
+ ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-
- inode->i_mtime = inode->i_ctime = current_time(inode);
- ret2 = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret2))
- ret = ret2;
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_dio:
- up_write(&EXT4_I(inode)->i_mmap_sem);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -4156,14 +4548,14 @@ int ext4_truncate(struct inode *inode)
/*
* There is a possibility that we're either freeing the inode
* or it's a completely new inode. In those cases we might not
- * have i_mutex locked because it's not necessary.
+ * have i_rwsem locked because it's not necessary.
*/
- if (!(inode->i_state & (I_NEW|I_FREEING)))
+ if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
WARN_ON(!inode_is_locked(inode));
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return 0;
+ goto out_trace;
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4172,26 +4564,27 @@ int ext4_truncate(struct inode *inode)
int has_inline = 1;
err = ext4_inline_data_truncate(inode, &has_inline);
- if (err)
- return err;
- if (has_inline)
- return 0;
+ if (err || has_inline)
+ goto out_trace;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
- if (ext4_inode_attach_jinode(inode) < 0)
- return 0;
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
+ goto out_trace;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 1);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_trace;
+ }
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4209,8 +4602,10 @@ int ext4_truncate(struct inode *inode)
if (err)
goto out_stop;
- down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4236,38 +4631,186 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
ext4_journal_stop(handle);
+out_trace:
trace_ext4_truncate_exit(inode);
return err;
}
+static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return inode_peek_iversion_raw(inode);
+ else
+ return inode_peek_iversion(inode);
+}
+
+static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ struct inode *inode = &(ei->vfs_inode);
+ u64 i_blocks = READ_ONCE(inode->i_blocks);
+ struct super_block *sb = inode->i_sb;
+
+ if (i_blocks <= ~0U) {
+ /*
+ * i_blocks can be represented in a 32 bit variable
+ * as multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = 0;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ return 0;
+ }
+
+ /*
+ * This should never happen since sb->s_maxbytes should not have
+ * allowed this, sb->s_maxbytes was set according to the huge_file
+ * feature in ext4_fill_super().
+ */
+ if (!ext4_has_feature_huge_file(sb))
+ return -EFSCORRUPTED;
+
+ if (i_blocks <= 0xffffffffffffULL) {
+ /*
+ * i_blocks can be represented in a 48 bit variable
+ * as multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ } else {
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ /* i_block is stored in file system block size */
+ i_blocks = i_blocks >> (inode->i_blkbits - 9);
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ }
+ return 0;
+}
+
+static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ uid_t i_uid;
+ gid_t i_gid;
+ projid_t i_projid;
+ int block;
+ int err;
+
+ err = ext4_inode_blocks_set(raw_inode, ei);
+
+ raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
+ i_projid = from_kprojid(&init_user_ns, ei->i_projid);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
+ /*
+ * Fix up interoperability with old kernels. Otherwise,
+ * old inodes get re-used with the upper 16 bits of the
+ * uid/gid intact.
+ */
+ if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) {
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ } else {
+ raw_inode->i_uid_high =
+ cpu_to_le16(high_16_bits(i_uid));
+ raw_inode->i_gid_high =
+ cpu_to_le16(high_16_bits(i_gid));
+ }
+ } else {
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ }
+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
+ EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
+ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+ ext4_isize_set(raw_inode, ei->i_disksize);
+
+ raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ raw_inode->i_block[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ raw_inode->i_block[1] = 0;
+ } else {
+ raw_inode->i_block[0] = 0;
+ raw_inode->i_block[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ raw_inode->i_block[2] = 0;
+ }
+ } else if (!ext4_has_inline_data(inode)) {
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+ }
+
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ u64 ivers = ext4_inode_peek_iversion(inode);
+
+ raw_inode->i_disk_version = cpu_to_le32(ivers);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(ivers >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
+ }
+
+ if (i_projid != EXT4_DEF_PROJID &&
+ !ext4_has_feature_project(inode->i_sb))
+ err = err ?: -EFSCORRUPTED;
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ raw_inode->i_projid = cpu_to_le32(i_projid);
+
+ ext4_inode_csum_set(inode, raw_inode, ei);
+ return err;
+}
+
/*
* ext4_get_inode_loc returns with an extra refcount against the inode's
- * underlying buffer_head on success. If 'in_mem' is true, we have all
- * data in memory that is needed to recreate the on-disk version of this
- * inode.
+ * underlying buffer_head on success. If we pass 'inode' and it does not
+ * have in-inode xattr, we have all inode data in memory that is needed
+ * to recreate the on-disk version of this inode.
*/
-static int __ext4_get_inode_loc(struct inode *inode,
- struct ext4_iloc *iloc, int in_mem)
+static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
+ struct inode *inode, struct ext4_iloc *iloc,
+ ext4_fsblk_t *ret_block)
{
struct ext4_group_desc *gdp;
struct buffer_head *bh;
- struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
- if (inode->i_ino < EXT4_ROOT_INO ||
- inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+ if (ino < EXT4_ROOT_INO ||
+ ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
return -EFSCORRUPTED;
- iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
if (!gdp)
return -EIO;
@@ -4276,136 +4819,169 @@ static int __ext4_get_inode_loc(struct inode *inode,
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
- inode_offset = ((inode->i_ino - 1) %
+ inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
- block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ block = ext4_inode_table(sb, gdp);
+ if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
+ (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
+ ext4_error(sb, "Invalid inode table block %llu in "
+ "block_group %u", block, iloc->block_group);
+ return -EFSCORRUPTED;
+ }
+ block += (inode_offset / inodes_per_block);
+
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
- if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
- goto simulate_eio;
- if (!buffer_uptodate(bh)) {
- lock_buffer(bh);
+ if (ext4_buffer_uptodate(bh))
+ goto has_buffer;
- /*
- * If the buffer has the write error flag, we have failed
- * to write out another inode in the same block. In this
- * case, we don't have to read the block because we may
- * read the old inode data successfully.
- */
- if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
- set_buffer_uptodate(bh);
-
- if (buffer_uptodate(bh)) {
- /* someone brought it uptodate while we waited */
- unlock_buffer(bh);
- goto has_buffer;
- }
+ lock_buffer(bh);
+ if (ext4_buffer_uptodate(bh)) {
+ /* Someone brought it uptodate while we waited */
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
- /*
- * If we have all information of the inode in memory and this
- * is the only valid inode in the block, we need not read the
- * block.
- */
- if (in_mem) {
- struct buffer_head *bitmap_bh;
- int i, start;
+ /*
+ * If we have all information of the inode in memory and this
+ * is the only valid inode in the block, we need not read the
+ * block.
+ */
+ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+ struct buffer_head *bitmap_bh;
+ int i, start;
- start = inode_offset & ~(inodes_per_block - 1);
+ start = inode_offset & ~(inodes_per_block - 1);
- /* Is the inode bitmap in cache? */
- bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (unlikely(!bitmap_bh))
- goto make_io;
+ /* Is the inode bitmap in cache? */
+ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+ if (unlikely(!bitmap_bh))
+ goto make_io;
- /*
- * If the inode bitmap isn't in cache then the
- * optimisation may end up performing two reads instead
- * of one, so skip it.
- */
- if (!buffer_uptodate(bitmap_bh)) {
- brelse(bitmap_bh);
- goto make_io;
- }
- for (i = start; i < start + inodes_per_block; i++) {
- if (i == inode_offset)
- continue;
- if (ext4_test_bit(i, bitmap_bh->b_data))
- break;
- }
+ /*
+ * If the inode bitmap isn't in cache then the
+ * optimisation may end up performing two reads instead
+ * of one, so skip it.
+ */
+ if (!buffer_uptodate(bitmap_bh)) {
brelse(bitmap_bh);
- if (i == start + inodes_per_block) {
- /* all other inodes are free, so skip I/O */
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- goto has_buffer;
- }
+ goto make_io;
}
+ for (i = start; i < start + inodes_per_block; i++) {
+ if (i == inode_offset)
+ continue;
+ if (ext4_test_bit(i, bitmap_bh->b_data))
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i == start + inodes_per_block) {
+ struct ext4_inode *raw_inode =
+ (struct ext4_inode *) (bh->b_data + iloc->offset);
+
+ /* all other inodes are free, so skip I/O */
+ memset(bh->b_data, 0, bh->b_size);
+ if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ ext4_fill_raw_inode(inode, raw_inode);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+ }
make_io:
- /*
- * If we need to do any I/O, try to pre-readahead extra
- * blocks from the inode table.
- */
- blk_start_plug(&plug);
- if (EXT4_SB(sb)->s_inode_readahead_blks) {
- ext4_fsblk_t b, end, table;
- unsigned num;
- __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
-
- table = ext4_inode_table(sb, gdp);
- /* s_inode_readahead_blks is always a power of 2 */
- b = block & ~((ext4_fsblk_t) ra_blks - 1);
- if (table > b)
- b = table;
- end = b + ra_blks;
- num = EXT4_INODES_PER_GROUP(sb);
- if (ext4_has_group_desc_csum(sb))
- num -= ext4_itable_unused_count(sb, gdp);
- table += num / inodes_per_block;
- if (end > table)
- end = table;
- while (b <= end)
- sb_breadahead_unmovable(sb, b++);
- }
+ /*
+ * If we need to do any I/O, try to pre-readahead extra
+ * blocks from the inode table.
+ */
+ blk_start_plug(&plug);
+ if (EXT4_SB(sb)->s_inode_readahead_blks) {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
+
+ table = ext4_inode_table(sb, gdp);
+ /* s_inode_readahead_blks is always a power of 2 */
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
+ if (table > b)
+ b = table;
+ end = b + ra_blks;
+ num = EXT4_INODES_PER_GROUP(sb);
+ if (ext4_has_group_desc_csum(sb))
+ num -= ext4_itable_unused_count(sb, gdp);
+ table += num / inodes_per_block;
+ if (end > table)
+ end = table;
+ while (b <= end)
+ ext4_sb_breadahead_unmovable(sb, b++);
+ }
- /*
- * There are other valid inodes in the buffer, this inode
- * has in-inode xattrs, or we don't have this inode in memory.
- * Read the block from disk.
- */
- trace_ext4_load_inode(inode);
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
- blk_finish_plug(&plug);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- simulate_eio:
- ext4_error_inode_block(inode, block, EIO,
- "unable to read itable block");
- brelse(bh);
- return -EIO;
- }
+ /*
+ * There are other valid inodes in the buffer, this inode
+ * has in-inode xattrs, or we don't have this inode in memory.
+ * Read the block from disk.
+ */
+ trace_ext4_load_inode(sb, ino);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL,
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO));
+ blk_finish_plug(&plug);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ if (ret_block)
+ *ret_block = block;
+ brelse(bh);
+ return -EIO;
}
has_buffer:
iloc->bh = bh;
return 0;
}
+static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
+ &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
+}
+
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
- /* We have all inode data except xattrs in memory here. */
- return __ext4_get_inode_loc(inode, iloc,
- !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
+ &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
+}
+
+
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc)
+{
+ return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
}
-static bool ext4_should_use_dax(struct inode *inode)
+static bool ext4_should_enable_dax(struct inode *inode)
{
- if (!test_opt(inode->i_sb, DAX))
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (test_opt2(inode->i_sb, DAX_NEVER))
return false;
if (!S_ISREG(inode->i_mode))
return false;
@@ -4417,14 +4993,21 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
return false;
- return true;
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+ return false;
+ if (test_opt(inode->i_sb, DAX_ALWAYS))
+ return true;
+
+ return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
{
unsigned int flags = EXT4_I(inode)->i_flags;
unsigned int new_fl = 0;
+ WARN_ON_ONCE(IS_DAX(inode) && init);
+
if (flags & EXT4_SYNC_FL)
new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
@@ -4435,8 +5018,13 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (ext4_should_use_dax(inode))
+
+ /* Because of the way inode_set_flags() works we must preserve S_DAX
+ * here if already set. */
+ new_fl |= (inode->i_flags & S_DAX);
+ if (init && ext4_should_enable_dax(inode))
new_fl |= S_DAX;
+
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
@@ -4477,11 +5065,20 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
- EXT4_INODE_SIZE(inode->i_sb) &&
+ if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ int err;
+
+ err = xattr_check_inode(inode, IHDR(inode, raw_inode),
+ ITAIL(inode, raw_inode));
+ if (err)
+ return err;
+
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- return ext4_find_inline_data_nolock(inode);
+ err = ext4_find_inline_data_nolock(inode);
+ if (!err && ext4_has_inline_data(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return err;
} else
EXT4_I(inode)->i_inline_off = 0;
return 0;
@@ -4507,12 +5104,63 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
else
inode_set_iversion_queried(inode, val);
}
-static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
+
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return inode_peek_iversion_raw(inode);
- else
- return inode_peek_iversion(inode);
+ const char *err_str;
+
+ if (flags & EXT4_IGET_EA_INODE) {
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
+ } else {
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
+ }
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, "%s", err_str);
+ return -EFSCORRUPTED;
+}
+
+void ext4_set_inode_mapping_order(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ u16 min_order, max_order;
+
+ max_order = EXT4_SB(sb)->s_max_folio_order;
+ if (!max_order)
+ return;
+
+ min_order = EXT4_SB(sb)->s_min_folio_order;
+ if (!min_order && !S_ISREG(inode->i_mode))
+ return;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ max_order = min_order;
+
+ mapping_set_folio_order_range(inode->i_mapping, min_order, max_order);
}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4522,6 +5170,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
@@ -4531,13 +5180,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
gid_t i_gid;
projid_t i_projid;
- if ((!(flags & EXT4_IGET_SPECIAL) &&
- (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
+ if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) ||
(ino < EXT4_ROOT_INO) ||
- (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+ (ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
return ERR_PTR(-ESTALE);
- __ext4_error(sb, function, line, EFSCORRUPTED, 0,
+ __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
"inode #%lu: comm %s: iget: illegal inode #",
ino, current->comm);
return ERR_PTR(-EFSCORRUPTED);
@@ -4546,24 +5194,23 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW)) {
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
+ iput(inode);
+ return ERR_PTR(ret);
+ }
return inode;
+ }
ei = EXT4_I(inode);
iloc.bh = NULL;
- ret = __ext4_get_inode_loc(inode, &iloc, 0);
+ ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
- ext4_error_inode(inode, function, line, 0,
- "iget: root inode unallocated");
- ret = -EFSCORRUPTED;
- goto bad_inode;
- }
-
if ((flags & EXT4_IGET_HANDLE) &&
(raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
ret = -ESTALE;
@@ -4587,21 +5234,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = raw_inode->i_generation;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
- if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
- ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
- ext4_error_inode_err(inode, function, line, 0, EFSBADCRC,
- "iget: checksum invalid");
+ if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
+ (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
+ ext4_error_inode_err(inode, function, line, 0,
+ EFSBADCRC, "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4625,7 +5272,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_projid = make_kprojid(&init_user_ns, i_projid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
- ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
@@ -4635,11 +5281,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if ((inode->i_mode == 0 ||
+ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
ino != EXT4_BOOT_LOADER_INO) {
- /* this inode is deleted */
- ret = -ESTALE;
+ /* this inode is deleted or unallocated */
+ if (flags & EXT4_IGET_SPECIAL) {
+ ext4_error_inode(inode, function, line, 0,
+ "iget: special inode unallocated");
+ ret = -EFSCORRUPTED;
+ } else
+ ret = -ESTALE;
goto bad_inode;
}
/* The only unlinked inodes we let through here have
@@ -4650,14 +5301,23 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
+ /* Detect invalid flag combination - can't have both inline data and extents */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ext4_error_inode(inode, function, line, 0,
+ "inode has both inline data and extents flags");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -4668,7 +5328,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* we'd normally treat htree data as empty space. But with metadata
* checksumming that corrupts checksums so forbid that.
*/
- if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ if (!ext4_has_feature_dir_index(sb) &&
+ ext4_has_feature_metadata_csum(sb) &&
ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
ext4_error_inode(inode, function, line, 0,
"iget: Dir with htree data on filesystem without dir_index feature.");
@@ -4689,6 +5350,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
for (block = 0; block < EXT4_N_BLOCKS; block++)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ ext4_fc_init_inode(&ei->vfs_inode);
/*
* Set transaction id's of transactions that have to be committed
@@ -4728,9 +5390,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
}
- EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_GET_CTIME(inode, raw_inode);
+ EXT4_INODE_GET_ATIME(inode, raw_inode);
+ EXT4_INODE_GET_MTIME(inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
@@ -4746,7 +5408,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = 0;
if (ei->i_file_acl &&
- !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad extended attribute block %llu",
ei->i_file_acl);
@@ -4754,9 +5416,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
/* validate the block references in the inode */
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
+ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode)))) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_check_inode(inode);
else
@@ -4784,17 +5447,23 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
- ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
- inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations;
- nd_terminate_link(ei->i_data, inode->i_size,
- sizeof(ei->i_data) - 1);
+ if (inode->i_size == 0 ||
+ inode->i_size >= sizeof(ei->i_data) ||
+ strnlen((char *)ei->i_data, inode->i_size + 1) !=
+ inode->i_size) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid fast symlink length %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+ inode_set_cached_link(inode, (char *)ei->i_data,
+ inode->i_size);
} else {
inode->i_op = &ext4_symlink_inode_operations;
- ext4_set_aops(inode);
}
- inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
@@ -4812,11 +5481,32 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
"iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
- if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+ if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) {
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
- brelse(iloc.bh);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+
+ ext4_set_inode_mapping_order(inode);
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ if (ret)
+ goto bad_inode;
+ brelse(iloc.bh);
+ /* Initialize the "no ACL's" state for the simple cases */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl)
+ cache_no_acl(inode);
unlock_new_inode(inode);
return inode;
@@ -4826,45 +5516,6 @@ bad_inode:
return ERR_PTR(ret);
}
-static int ext4_inode_blocks_set(handle_t *handle,
- struct ext4_inode *raw_inode,
- struct ext4_inode_info *ei)
-{
- struct inode *inode = &(ei->vfs_inode);
- u64 i_blocks = READ_ONCE(inode->i_blocks);
- struct super_block *sb = inode->i_sb;
-
- if (i_blocks <= ~0U) {
- /*
- * i_blocks can be represented in a 32 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = 0;
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- return 0;
- }
- if (!ext4_has_feature_huge_file(sb))
- return -EFBIG;
-
- if (i_blocks <= 0xffffffffffffULL) {
- /*
- * i_blocks can be represented in a 48 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- } else {
- ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- /* i_block is stored in file system block size */
- i_blocks = i_blocks >> (inode->i_blkbits - 9);
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- }
- return 0;
-}
-
static void __ext4_update_other_inode_time(struct super_block *sb,
unsigned long orig_ino,
unsigned long ino,
@@ -4876,24 +5527,20 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
if (!inode)
return;
- if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) ||
- ((inode->i_state & I_DIRTY_TIME) == 0))
+ if (!inode_is_dirtytime_only(inode))
return;
spin_lock(&inode->i_lock);
- if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) == 0) &&
- (inode->i_state & I_DIRTY_TIME)) {
+ if (inode_is_dirtytime_only(inode)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
trace_ext4_other_inode_update_time(inode, orig_ino);
@@ -4944,134 +5591,62 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
struct super_block *sb = inode->i_sb;
- int err = 0, rc, block;
+ int err;
int need_datasync = 0, set_large_file = 0;
- uid_t i_uid;
- gid_t i_gid;
- projid_t i_projid;
spin_lock(&ei->i_raw_lock);
- /* For fields not tracked in the in-memory inode,
- * initialise them to zero for new inodes. */
+ /*
+ * For fields not tracked in the in-memory inode, initialise them
+ * to zero for new inodes.
+ */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- i_uid = i_uid_read(inode);
- i_gid = i_gid_read(inode);
- i_projid = from_kprojid(&init_user_ns, ei->i_projid);
- if (!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
-/*
- * Fix up interoperability with old kernels. Otherwise, old inodes get
- * re-used with the upper 16 bits of the uid/gid intact
- */
- if (ei->i_dtime && list_empty(&ei->i_orphan)) {
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- } else {
- raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(i_uid));
- raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(i_gid));
- }
- } else {
- raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- }
- raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
- EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
-
- err = ext4_inode_blocks_set(handle, raw_inode, ei);
- if (err) {
- spin_unlock(&ei->i_raw_lock);
- goto out_brelse;
- }
- raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
- raw_inode->i_file_acl_high =
- cpu_to_le16(ei->i_file_acl >> 32);
- raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) {
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
need_datasync = 1;
- }
if (ei->i_disksize > 0x7fffffffULL) {
if (!ext4_has_feature_large_file(sb) ||
- EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV))
+ EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
}
- raw_inode->i_generation = cpu_to_le32(inode->i_generation);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- raw_inode->i_block[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- raw_inode->i_block[1] = 0;
- } else {
- raw_inode->i_block[0] = 0;
- raw_inode->i_block[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- raw_inode->i_block[2] = 0;
- }
- } else if (!ext4_has_inline_data(inode)) {
- for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
- }
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- u64 ivers = ext4_inode_peek_iversion(inode);
-
- raw_inode->i_disk_version = cpu_to_le32(ivers);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(ivers >> 32);
- raw_inode->i_extra_isize =
- cpu_to_le16(ei->i_extra_isize);
- }
+ err = ext4_fill_raw_inode(inode, raw_inode);
+ spin_unlock(&ei->i_raw_lock);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "corrupted inode contents");
+ goto out_brelse;
}
- BUG_ON(!ext4_has_feature_project(inode->i_sb) &&
- i_projid != EXT4_DEF_PROJID);
-
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
- raw_inode->i_projid = cpu_to_le32(i_projid);
-
- ext4_inode_csum_set(inode, raw_inode, ei);
- spin_unlock(&ei->i_raw_lock);
if (inode->i_sb->s_flags & SB_LAZYTIME)
ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (!err)
- err = rc;
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_error;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
if (set_large_file) {
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb,
+ EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (err)
- goto out_brelse;
+ goto out_error;
+ lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_set_feature_large_file(sb);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
ext4_handle_sync(handle);
- err = ext4_handle_dirty_super(handle, sb);
+ err = ext4_handle_dirty_metadata(handle, NULL,
+ EXT4_SB(sb)->s_sbh);
}
ext4_update_inode_fsync_trans(handle, inode, need_datasync);
+out_error:
+ ext4_std_error(inode->i_sb, err);
out_brelse:
brelse(bh);
- ext4_std_error(inode->i_sb, err);
return err;
}
@@ -5113,16 +5688,16 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
- sb_rdonly(inode->i_sb))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ ext4_debug("called recursively, non-PF_MEMALLOC!\n");
dump_stack();
return -EIO;
}
@@ -5135,12 +5710,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
- err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
} else {
struct ext4_iloc iloc;
- err = __ext4_get_inode_loc(inode, &iloc, 0);
+ err = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (err)
return err;
/*
@@ -5160,47 +5735,49 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
- * buffers that are attached to a page stradding i_size and are undergoing
+ * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
+ * buffers that are attached to a folio straddling i_size and are undergoing
* commit. In that case we have to wait for commit to finish and try again.
*/
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
- struct page *page;
unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- tid_t commit_tid = 0;
+ tid_t commit_tid;
int ret;
+ bool has_transaction;
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * If the page is fully truncated, we don't need to wait for any commit
- * (and we even should not as __ext4_journalled_invalidatepage() may
- * strip all buffers from the page but keep the page dirty which can then
- * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+ * If the folio is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidate_folio() may
+ * strip all buffers from the folio but keep the folio dirty which can then
+ * confuse e.g. concurrent ext4_writepages() seeing dirty folio without
* buffers). Also we don't need to wait for any commit if all buffers in
- * the page remain valid. This is most beneficial for the common case of
+ * the folio remain valid. This is most beneficial for the common case of
* blocksize == PAGESIZE.
*/
if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
- page = find_lock_page(inode->i_mapping,
+ struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!page)
+ if (IS_ERR(folio))
return;
- ret = __ext4_journalled_invalidatepage(page, offset,
- PAGE_SIZE - offset);
- unlock_page(page);
- put_page(page);
+ ret = __ext4_journalled_invalidate_folio(folio, offset,
+ folio_size(folio) - offset);
+ folio_unlock(folio);
+ folio_put(folio);
if (ret != -EBUSY)
return;
- commit_tid = 0;
+ has_transaction = false;
read_lock(&journal->j_state_lock);
- if (journal->j_committing_transaction)
+ if (journal->j_committing_transaction) {
commit_tid = journal->j_committing_transaction->t_tid;
+ has_transaction = true;
+ }
read_unlock(&journal->j_state_lock);
- if (commit_tid)
+ if (has_transaction)
jbd2_log_wait_commit(journal, commit_tid);
}
}
@@ -5227,17 +5804,20 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
* transaction are already on disk (truncate waits for pages under
* writeback).
*
- * Called with inode->i_mutex down.
+ * Called with inode->i_rwsem down.
*/
-int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int error, rc = 0;
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
+ bool inc_ivers = true;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ error = ext4_emergency_state(inode->i_sb);
+ if (unlikely(error))
+ return error;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -5247,7 +5827,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ATTR_GID | ATTR_TIMES_SET))))
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
if (error)
return error;
@@ -5259,13 +5839,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(idmap, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+
+ if (i_uid_needs_update(idmap, attr, inode) ||
+ i_gid_needs_update(idmap, attr, inode)) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
@@ -5282,7 +5863,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* counts xattr inode references.
*/
down_read(&EXT4_I(inode)->xattr_sem);
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(idmap, inode, attr);
up_read(&EXT4_I(inode)->xattr_sem);
if (error) {
@@ -5291,32 +5872,34 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
/* Update corresponding info in inode so that everything is in
* one transaction */
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(idmap, attr, inode);
+ i_gid_update(idmap, attr, inode);
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
- if (unlikely(error))
+ if (unlikely(error)) {
return error;
+ }
}
if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
loff_t oldsize = inode->i_size;
+ loff_t old_disksize;
int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (attr->ia_size > sbi->s_bitmap_maxbytes)
+ if (attr->ia_size > sbi->s_bitmap_maxbytes) {
return -EFBIG;
+ }
}
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode)) {
return -EINVAL;
+ }
- if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
- inode_inc_iversion(inode);
+ if (attr->ia_size == inode->i_size)
+ inc_ivers = false;
if (shrink) {
if (ext4_should_order_data(inode)) {
@@ -5332,15 +5915,23 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
inode_dio_wait(inode);
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
rc = ext4_break_layouts(inode);
if (rc) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
- return rc;
+ filemap_invalidate_unlock(inode->i_mapping);
+ goto err_out;
}
if (attr->ia_size != inode->i_size) {
+ /* attach jbd2 jinode for EOF folio tail zeroing */
+ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
+ oldsize & (inode->i_sb->s_blocksize - 1)) {
+ error = ext4_inode_attach_jinode(inode);
+ if (error)
+ goto out_mmap_sem;
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
@@ -5351,18 +5942,35 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
orphan = 1;
}
/*
- * Update c/mtime on truncate up, ext4_truncate() will
- * update c/mtime in shrink case below
+ * Update c/mtime and tail zero the EOF folio on
+ * truncate up. ext4_truncate() handles the shrink case
+ * below.
*/
if (!shrink) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
+ if (oldsize & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle,
+ inode->i_mapping, oldsize);
}
+
+ if (shrink)
+ ext4_fc_track_range(handle, inode,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits,
+ EXT_MAX_BLOCKS - 1);
+ else
+ ext4_fc_track_range(
+ handle, inode,
+ (oldsize > 0 ? oldsize - 1 : oldsize) >>
+ inode->i_sb->s_blocksize_bits,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits);
+
down_write(&EXT4_I(inode)->i_data_sem);
+ old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
@@ -5370,7 +5978,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
*/
if (!error)
i_size_write(inode, attr->ia_size);
+ else
+ EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
ext4_journal_stop(handle);
if (error)
goto out_mmap_sem;
@@ -5397,11 +6010,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = rc;
}
out_mmap_sem:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
}
if (!error) {
- setattr_copy(inode, attr);
+ if (inc_ivers)
+ inode_inc_iversion(inode);
+ setattr_copy(idmap, inode, attr);
mark_inode_dirty(inode);
}
@@ -5413,17 +6028,34 @@ out_mmap_sem:
ext4_orphan_del(NULL, inode);
if (!error && (ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(inode, inode->i_mode);
+ rc = posix_acl_chmod(idmap, dentry, inode->i_mode);
err_out:
- ext4_std_error(inode->i_sb, error);
+ if (error)
+ ext4_std_error(inode->i_sb, error);
if (!error)
error = rc;
return error;
}
-int ext4_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+u32 ext4_dio_alignment(struct inode *inode)
+{
+ if (fsverity_active(inode))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ if (ext4_has_inline_data(inode))
+ return 0;
+ if (IS_ENCRYPTED(inode)) {
+ if (!fscrypt_dio_supported(inode))
+ return 0;
+ return i_blocksize(inode);
+ }
+ return 1; /* use the iomap defaults */
+}
+
+int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct ext4_inode *raw_inode;
@@ -5437,6 +6069,39 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (dio_align == 1) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ /* iomap defaults */
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ } else {
+ stat->dio_mem_align = dio_align;
+ stat->dio_offset_align = dio_align;
+ }
+ }
+
+ if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int awu_min = 0, awu_max = 0;
+
+ if (ext4_inode_can_atomic_write(inode)) {
+ awu_min = sbi->s_awu_min;
+ awu_max = sbi->s_awu_max;
+ }
+
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
@@ -5458,17 +6123,18 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
-int ext4_file_getattr(const struct path *path, struct kstat *stat,
+int ext4_file_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
u64 delalloc_blocks;
- ext4_getattr(path, stat, request_mask, query_flags);
+ ext4_getattr(idmap, path, stat, request_mask, query_flags);
/*
* If there is inline data in the inode, the inode will normally not
@@ -5514,22 +6180,19 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents)
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
int idxblocks;
- int ret = 0;
+ int ret;
/*
- * How many index blocks need to touch to map @lblocks logical blocks
- * to @pextents physical extents?
+ * How many index and leaf blocks need to touch to map @lblocks
+ * logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
- ret = idxblocks;
-
/*
* Now let's see how many group bitmaps and group descriptors need
* to account
@@ -5542,7 +6205,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
/* bitmaps and block group descriptor blocks */
- ret += groups + gdpblocks;
+ ret = idxblocks + groups + gdpblocks;
/* Blocks for super block, inode, quota and xattr blocks */
ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
@@ -5551,25 +6214,19 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
}
/*
- * Calculate the total number of credits to reserve to fit
- * the modification of a single pages into a single transaction,
- * which may include multiple chunks of block allocations.
- *
- * This could be called via ext4_write_begin()
- *
- * We need to consider the worse case, when
- * one new block per extent.
+ * Calculate the journal credits for modifying the number of blocks
+ * in a single extent within one transaction. 'nrblocks' is used only
+ * for non-extent inodes. For extent type inodes, 'nrblocks' can be
+ * zero if the exact number of blocks is unknown.
*/
-int ext4_writepage_trans_blocks(struct inode *inode)
+int ext4_chunk_trans_extent(struct inode *inode, int nrblocks)
{
- int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, bpp);
-
+ ret = ext4_meta_trans_blocks(inode, nrblocks, 1);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
- ret += bpp;
+ ret += nrblocks;
return ret;
}
@@ -5596,12 +6253,12 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err)) {
put_bh(iloc->bh);
- return -EIO;
+ return err;
}
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
+ ext4_fc_track_inode(handle, inode);
/* the do_update_inode consumes one bh->b_count */
get_bh(iloc->bh);
@@ -5623,17 +6280,20 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
{
int err;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
err = ext4_get_inode_loc(inode, iloc);
if (!err) {
BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ iloc->bh, EXT4_JTR_NONE);
if (err) {
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;
@@ -5677,6 +6337,14 @@ static int __ext4_expand_extra_isize(struct inode *inode,
return 0;
}
+ /*
+ * We may need to allocate external xattr block so we need quotas
+ * initialized. Here we can be called with various locks held so we
+ * cannot affort to initialize quotas ourselves. So just bail.
+ */
+ if (dquot_initialize_needed(inode))
+ return -EAGAIN;
+
/* try to expand with EAs present */
error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
raw_inode, handle);
@@ -5752,7 +6420,8 @@ int ext4_expand_extra_isize(struct inode *inode,
ext4_write_lock_xattr(inode, &no_expand);
BUFFER_TRACE(iloc->bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, iloc->bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
+ EXT4_JTR_NONE);
if (error) {
brelse(iloc->bh);
goto out_unlock;
@@ -5822,26 +6491,16 @@ out:
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
- *
- * If only the I_DIRTY_TIME flag is set, we can skip everything. If
- * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
- * to copy into the on-disk inode structure are the timestamp files.
*/
void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- if (flags == I_DIRTY_TIME)
- return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
- goto out;
-
+ return;
ext4_mark_inode_dirty(handle, inode);
-
ext4_journal_stop(handle);
-out:
- return;
}
int ext4_change_inode_journal_flag(struct inode *inode, int val)
@@ -5849,7 +6508,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
journal_t *journal;
handle_t *handle;
int err;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int alloc_ctx;
/*
* We have to be very careful here: changing a data block's
@@ -5878,16 +6537,16 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
* dirty data which can be converted only after flushing the dirty
* data (and journalled aops don't know how to handle these cases).
*/
- if (val) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
- err = filemap_write_and_wait(inode->i_mapping);
- if (err < 0) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
- return err;
- }
+ filemap_invalidate_lock(inode->i_mapping);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err < 0) {
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
}
+ /* Before switch the inode journalling mode evict all the page cache. */
+ truncate_pagecache(inode, 0);
- percpu_down_write(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
jbd2_journal_lock_updates(journal);
/*
@@ -5901,21 +6560,21 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else {
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
+ filemap_invalidate_unlock(inode->i_mapping);
return err;
}
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
+ ext4_set_inode_mapping_order(inode);
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_writepages_rwsem);
-
- if (val)
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
+ filemap_invalidate_unlock(inode->i_mapping);
/* Finally we can mark the inode as dirty. */
@@ -5923,6 +6582,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (IS_ERR(handle))
return PTR_ERR(handle);
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
err = ext4_mark_inode_dirty(handle, inode);
ext4_handle_sync(handle);
ext4_journal_stop(handle);
@@ -5931,15 +6592,65 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
-static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
return !buffer_mapped(bh);
}
+static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio,
+ get_block_t get_block)
+{
+ handle_t *handle;
+ loff_t size;
+ unsigned long len;
+ int credits;
+ int ret;
+
+ credits = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ folio_lock(folio);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) {
+ ret = -EFAULT;
+ goto out_error;
+ }
+
+ len = folio_size(folio);
+ if (folio_pos(folio) + len > size)
+ len = size - folio_pos(folio);
+
+ ret = ext4_block_write_begin(handle, folio, 0, len, get_block);
+ if (ret)
+ goto out_error;
+
+ if (!ext4_should_journal_data(inode)) {
+ block_commit_write(folio, 0, len);
+ folio_mark_dirty(folio);
+ } else {
+ ret = ext4_journal_folio_buffers(handle, folio, len);
+ if (ret)
+ goto out_error;
+ }
+ ext4_journal_stop(handle);
+ folio_wait_stable(folio);
+ return ret;
+
+out_error:
+ folio_unlock(folio);
+ ext4_journal_stop(handle);
+ return ret;
+}
+
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
loff_t size;
unsigned long len;
int err;
@@ -5947,8 +6658,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
- handle_t *handle;
- get_block_t *get_block;
+ get_block_t *get_block = ext4_get_block;
int retries = 0;
if (unlikely(IS_IMMUTABLE(inode)))
@@ -5957,15 +6667,23 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
err = ext4_convert_inline_data(inode);
if (err)
goto out_ret;
+ /*
+ * On data journalling we skip straight to the transaction handle:
+ * there's no delalloc; page truncated will be checked later; the
+ * early return w/ all buffers mapped (calculates size/len) can't
+ * be used; and there's no dioread_nolock, so only ext4_get_block.
+ */
+ if (ext4_should_journal_data(inode))
+ goto retry_alloc;
+
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
- !ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
err = block_page_mkwrite(vma, vmf,
@@ -5975,76 +6693,49 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
goto out_ret;
}
- lock_page(page);
+ folio_lock(folio);
size = i_size_read(inode);
/* Page got truncated from under us? */
- if (page->mapping != mapping || page_offset(page) > size) {
- unlock_page(page);
+ if (folio->mapping != mapping || folio_pos(folio) > size) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
goto out;
}
- if (page->index == size >> PAGE_SHIFT)
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
+ len = folio_size(folio);
+ if (folio_pos(folio) + len > size)
+ len = size - folio_pos(folio);
/*
* Return if we have all the buffers mapped. This avoids the need to do
* journal_start/journal_stop which can block and take a long time
+ *
+ * This cannot be done for data journalling, as we have to add the
+ * inode to the transaction's list to writeprotect pages on commit.
*/
- if (page_has_buffers(page)) {
- if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ if (folio_buffers(folio)) {
+ if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),
0, len, NULL,
ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
ret = VM_FAULT_LOCKED;
goto out;
}
}
- unlock_page(page);
+ folio_unlock(folio);
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
get_block = ext4_get_block_unwritten;
- else
- get_block = ext4_get_block;
retry_alloc:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
- err = block_page_mkwrite(vma, vmf, get_block);
- if (!err && ext4_should_journal_data(inode)) {
- if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_SIZE, NULL, do_journal_get_write_access)) {
- unlock_page(page);
- ret = VM_FAULT_SIGBUS;
- ext4_journal_stop(handle);
- goto out;
- }
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
- }
- ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ /* Start journal and allocate blocks */
+ err = ext4_block_page_mkwrite(inode, folio, get_block);
+ if (err == -EAGAIN ||
+ (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_alloc;
out_ret:
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
out:
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
}
-
-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret;
-
- down_read(&EXT4_I(inode)->i_mmap_sem);
- ret = filemap_fault(vmf);
- up_read(&EXT4_I(inode)->i_mmap_sem);
-
- return ret;
-}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2162db0c747d..7ce0fc40aec2 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -16,17 +16,268 @@
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/random.h>
-#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
#include <linux/iversion.h>
+#include <linux/fileattr.h>
+#include <linux/uuid.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include <linux/fsmap.h>
#include "fsmap.h"
#include <trace/events/ext4.h>
-/**
+typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es,
+ const void *arg);
+
+/*
+ * Superblock modification callback function for changing file system
+ * label
+ */
+static void ext4_sb_setlabel(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ /* Sanity check, this should never happen */
+ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX);
+
+ memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX);
+}
+
+/*
+ * Superblock modification callback function for changing file system
+ * UUID.
+ */
+static void ext4_sb_setuuid(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
+}
+
+static
+int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
+ ext4_update_sb_callback func,
+ const void *arg)
+{
+ int err = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh = sbi->s_sbh;
+ struct ext4_super_block *es = sbi->s_es;
+
+ trace_ext4_update_sb(sb, bh->b_blocknr, 1);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb,
+ bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+
+ lock_buffer(bh);
+ func(sbi, es, arg);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(bh);
+
+ if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) {
+ ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_err;
+ err = sync_dirty_buffer(bh);
+out_err:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/*
+ * Update one backup superblock in the group 'grp' using the callback
+ * function 'func' and argument 'arg'. If the handle is NULL the
+ * modification is not journalled.
+ *
+ * Returns: 0 when no modification was done (no superblock in the group)
+ * 1 when the modification was successful
+ * <0 on error
+ */
+static int ext4_update_backup_sb(struct super_block *sb,
+ handle_t *handle, ext4_group_t grp,
+ ext4_update_sb_callback func, const void *arg)
+{
+ int err = 0;
+ ext4_fsblk_t sb_block;
+ struct buffer_head *bh;
+ unsigned long offset = 0;
+ struct ext4_super_block *es;
+
+ if (!ext4_bg_has_super(sb, grp))
+ return 0;
+
+ /*
+ * For the group 0 there is always 1k padding, so we have
+ * either adjust offset, or sb_block depending on blocksize
+ */
+ if (grp == 0) {
+ sb_block = 1 * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(sb_block, sb->s_blocksize);
+ } else {
+ sb_block = ext4_group_first_block_no(sb, grp);
+ offset = 0;
+ }
+
+ trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0);
+
+ bh = ext4_sb_bread(sb, sb_block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ if (handle) {
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb,
+ bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_bh;
+ }
+
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ lock_buffer(bh);
+ if (ext4_has_feature_metadata_csum(sb) &&
+ es->s_checksum != ext4_superblock_csum(es)) {
+ ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
+ "superblock %llu", sb_block);
+ unlock_buffer(bh);
+ goto out_bh;
+ }
+ func(EXT4_SB(sb), es, arg);
+ if (ext4_has_feature_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(es);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ if (handle) {
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_bh;
+ } else {
+ BUFFER_TRACE(bh, "marking dirty");
+ mark_buffer_dirty(bh);
+ }
+ err = sync_dirty_buffer(bh);
+
+out_bh:
+ brelse(bh);
+ ext4_std_error(sb, err);
+ return (err) ? err : 1;
+}
+
+/*
+ * Update primary and backup superblocks using the provided function
+ * func and argument arg.
+ *
+ * Only the primary superblock and at most two backup superblock
+ * modifications are journalled; the rest is modified without journal.
+ * This is safe because e2fsck will re-write them if there is a problem,
+ * and we're very unlikely to ever need more than two backups.
+ */
+static
+int ext4_update_superblocks_fn(struct super_block *sb,
+ ext4_update_sb_callback func,
+ const void *arg)
+{
+ handle_t *handle;
+ ext4_group_t ngroups;
+ unsigned int three = 1;
+ unsigned int five = 5;
+ unsigned int seven = 7;
+ int err = 0, ret, i;
+ ext4_group_t grp, primary_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /*
+ * We can't update superblocks while the online resize is running
+ */
+ if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
+ &sbi->s_ext4_flags)) {
+ ext4_msg(sb, KERN_ERR, "Can't modify superblock while"
+ "performing online resize");
+ return -EBUSY;
+ }
+
+ /*
+ * We're only going to update primary superblock and two
+ * backup superblocks in this transaction.
+ */
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out;
+ }
+
+ /* Update primary superblock */
+ err = ext4_update_primary_sb(sb, handle, func, arg);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "Failed to update primary "
+ "superblock");
+ goto out_journal;
+ }
+
+ primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr);
+ ngroups = ext4_get_groups_count(sb);
+
+ /*
+ * Update backup superblocks. We have to start from group 0
+ * because it might not be where the primary superblock is
+ * if the fs is mounted with -o sb=<backup_sb_block>
+ */
+ i = 0;
+ grp = 0;
+ while (grp < ngroups) {
+ /* Skip primary superblock */
+ if (grp == primary_grp)
+ goto next_grp;
+
+ ret = ext4_update_backup_sb(sb, handle, grp, func, arg);
+ if (ret < 0) {
+ /* Ignore bad checksum; try to update next sb */
+ if (ret == -EFSBADCRC)
+ goto next_grp;
+ err = ret;
+ goto out_journal;
+ }
+
+ i += ret;
+ if (handle && i > 1) {
+ /*
+ * We're only journalling primary superblock and
+ * two backup superblocks; the rest is not
+ * journalled.
+ */
+ err = ext4_journal_stop(handle);
+ if (err)
+ goto out;
+ handle = NULL;
+ }
+next_grp:
+ grp = ext4_list_backups(sb, &three, &five, &seven);
+ }
+
+out_journal:
+ if (handle) {
+ ret = ext4_journal_stop(handle);
+ if (ret && !err)
+ err = ret;
+ }
+out:
+ clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags);
+ smp_mb__after_atomic();
+ return err ? err : 0;
+}
+
+/*
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
@@ -47,7 +298,7 @@ static void memswap(void *a, void *b, size_t len)
}
}
-/**
+/*
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
@@ -64,13 +315,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
unsigned long tmp;
+ struct timespec64 ts1, ts2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_atime, inode2->i_atime);
- swap(inode1->i_mtime, inode2->i_mtime);
+
+ ts1 = inode_get_atime(inode1);
+ ts2 = inode_get_atime(inode2);
+ inode_set_atime_to_ts(inode1, ts2);
+ inode_set_atime_to_ts(inode2, ts1);
+
+ ts1 = inode_get_mtime(inode1);
+ ts2 = inode_get_mtime(inode2);
+ inode_set_mtime_to_ts(inode1, ts2);
+ inode_set_mtime_to_ts(inode2, ts1);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
@@ -86,7 +346,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
i_size_write(inode2, isize);
}
-static void reset_inode_seed(struct inode *inode)
+void ext4_reset_inode_seed(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -94,23 +354,25 @@ static void reset_inode_seed(struct inode *inode)
__le32 gen = cpu_to_le32(inode->i_generation);
__u32 csum;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
-/**
+/*
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
*
* @sb: the super block of the filesystem
+ * @idmap: idmap of the mount the inode was found from
* @inode: the inode to swap with EXT4_BOOT_LOADER_INO
*
*/
static long swap_inode_boot_loader(struct super_block *sb,
+ struct mnt_idmap *idmap,
struct inode *inode)
{
handle_t *handle;
@@ -121,7 +383,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
blkcnt_t blocks;
unsigned short bytes;
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO,
+ EXT4_IGET_SPECIAL | EXT4_IGET_BAD);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
@@ -139,12 +402,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
}
if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
- !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ !inode_owner_or_capable(idmap, inode) ||
+ !capable(CAP_SYS_ADMIN)) {
err = -EPERM;
goto journal_err_out;
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto err_out;
@@ -165,11 +429,12 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = -EINVAL;
goto err_out;
}
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle);
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
- if (inode_bl->i_nlink == 0) {
+ if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
@@ -178,6 +443,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
ei_bl->i_flags = 0;
inode_set_iversion(inode_bl, 1);
i_size_write(inode_bl, 0);
+ EXT4_I(inode_bl)->i_disksize = inode_bl->i_size;
inode_bl->i_mode = S_IFREG;
if (ext4_has_feature_extents(sb)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
@@ -195,12 +461,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
diff = size - size_bl;
swap_inode_data(inode, inode_bl);
- inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
+ inode_set_ctime_current(inode_bl);
+ inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
- inode_bl->i_generation = prandom_u32();
- reset_inode_seed(inode);
- reset_inode_seed(inode_bl);
+ inode->i_generation = get_random_u32();
+ inode_bl->i_generation = get_random_u32();
+ ext4_reset_inode_seed(inode);
+ ext4_reset_inode_seed(inode_bl);
ext4_discard_preallocations(inode);
@@ -250,25 +518,13 @@ err_out1:
ext4_double_up_write_data_sem(inode, inode_bl);
err_out:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
return err;
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int uuid_is_zero(__u8 u[16])
-{
- int i;
-
- for (i = 0; i < 16; i++)
- if (u[i])
- return 0;
- return 1;
-}
-#endif
-
/*
* If immutable is set and we are not clearing it, we're not allowed to change
* anything else in the inode. Don't error out if we're only trying to set
@@ -292,6 +548,44 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
return 0;
}
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (test_opt2(inode->i_sb, DAX_NEVER) ||
+ test_opt(inode->i_sb, DAX_ALWAYS))
+ return;
+
+ if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+ d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+ unsigned int flags)
+{
+ /* Allow the DAX flag to be changed on inline directories */
+ if (S_ISDIR(inode->i_mode)) {
+ flags &= ~EXT4_INLINE_DATA_FL;
+ oldflags &= ~EXT4_INLINE_DATA_FL;
+ }
+
+ if (flags & EXT4_DAX_FL) {
+ if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+ ext4_test_inode_state(inode,
+ EXT4_STATE_VERITY_IN_PROGRESS)) {
+ return false;
+ }
+ }
+
+ if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+ return false;
+
+ return true;
+}
+
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -300,7 +594,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
- unsigned int jflag;
struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
@@ -308,22 +601,20 @@ static int ext4_ioctl_setflags(struct inode *inode,
goto flags_out;
oldflags = ei->i_flags;
-
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
- err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (err)
- goto flags_out;
-
/*
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
*/
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
+
+ if (!dax_compatible(inode, oldflags, flags)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
@@ -369,6 +660,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
if (err)
goto flags_err;
+ ext4_dax_dontcache(inode, flags);
+
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
@@ -381,8 +674,10 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_clear_inode_flag(inode, i);
}
- ext4_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ ext4_set_inode_flags(inode, false);
+
+ inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -390,17 +685,18 @@ flags_err:
if (err)
goto flags_out;
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
/*
* Changes to the journaling mode can cause unsafe changes to
- * S_DAX if we are using the DAX mount option.
+ * S_DAX if the inode is DAX
*/
- if (test_opt(inode->i_sb, DAX)) {
+ if (IS_DAX(inode)) {
err = -EBUSY;
goto flags_out;
}
- err = ext4_change_inode_journal_flag(inode, jflag);
+ err = ext4_change_inode_journal_flag(inode,
+ flags & EXT4_JOURNAL_DATA_FL);
if (err)
goto flags_out;
}
@@ -416,9 +712,8 @@ flags_out:
}
#ifdef CONFIG_QUOTA
-static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
{
- struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, rc;
@@ -448,6 +743,10 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
if (ext4_is_quota_file(inode))
return err;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
@@ -463,10 +762,6 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
brelse(iloc.bh);
}
- err = dquot_initialize(inode);
- if (err)
- return err;
-
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(sb) +
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
@@ -492,7 +787,8 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
}
EXT4_I(inode)->i_projid = kprojid;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -502,7 +798,7 @@ out_stop:
return err;
}
#else
-static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
{
if (projid != EXT4_DEF_PROJID)
return -EOPNOTSUPP;
@@ -510,66 +806,15 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
}
#endif
-/* Transfer internal flags to xflags */
-static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
-{
- __u32 xflags = 0;
-
- if (iflags & EXT4_SYNC_FL)
- xflags |= FS_XFLAG_SYNC;
- if (iflags & EXT4_IMMUTABLE_FL)
- xflags |= FS_XFLAG_IMMUTABLE;
- if (iflags & EXT4_APPEND_FL)
- xflags |= FS_XFLAG_APPEND;
- if (iflags & EXT4_NODUMP_FL)
- xflags |= FS_XFLAG_NODUMP;
- if (iflags & EXT4_NOATIME_FL)
- xflags |= FS_XFLAG_NOATIME;
- if (iflags & EXT4_PROJINHERIT_FL)
- xflags |= FS_XFLAG_PROJINHERIT;
- return xflags;
-}
-
-#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
- FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
-
-/* Transfer xflags flags to internal */
-static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
-{
- unsigned long iflags = 0;
-
- if (xflags & FS_XFLAG_SYNC)
- iflags |= EXT4_SYNC_FL;
- if (xflags & FS_XFLAG_IMMUTABLE)
- iflags |= EXT4_IMMUTABLE_FL;
- if (xflags & FS_XFLAG_APPEND)
- iflags |= EXT4_APPEND_FL;
- if (xflags & FS_XFLAG_NODUMP)
- iflags |= EXT4_NODUMP_FL;
- if (xflags & FS_XFLAG_NOATIME)
- iflags |= EXT4_NOATIME_FL;
- if (xflags & FS_XFLAG_PROJINHERIT)
- iflags |= EXT4_PROJINHERIT_FL;
-
- return iflags;
-}
-
-static int ext4_shutdown(struct super_block *sb, unsigned long arg)
+int ext4_force_shutdown(struct super_block *sb, u32 flags)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- __u32 flags;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(flags, (__u32 __user *)arg))
- return -EFAULT;
+ int ret;
if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
return -EINVAL;
- if (ext4_forced_shutdown(sbi))
+ if (ext4_forced_shutdown(sb))
return 0;
ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags);
@@ -577,9 +822,11 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
- freeze_bdev(sb->s_bdev);
+ ret = bdev_freeze(sb->s_bdev);
+ if (ret)
+ return ret;
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- thaw_bdev(sb->s_bdev, sb);
+ bdev_thaw(sb->s_bdev);
break;
case EXT4_GOING_FLAGS_LOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
@@ -600,6 +847,19 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
return 0;
}
+static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg)
+{
+ u32 flags;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, (__u32 __user *)arg))
+ return -EFAULT;
+
+ return ext4_force_shutdown(sb, flags);
+}
+
struct getfsmap_info {
struct super_block *gi_sb;
struct fsmap_head __user *gi_data;
@@ -660,10 +920,9 @@ static int ext4_ioc_getfsmap(struct super_block *sb,
info.gi_sb = sb;
info.gi_data = arg;
error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info);
- if (error == EXT4_QUERY_RANGE_ABORT) {
- error = 0;
+ if (error == EXT4_QUERY_RANGE_ABORT)
aborted = true;
- } else if (error)
+ else if (error)
return error;
/* If we didn't abort, set the "last" flag in the last fmx */
@@ -708,7 +967,7 @@ static long ext4_ioctl_group_add(struct file *file,
err = ext4_group_add(sb, input);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -718,19 +977,56 @@ static long ext4_ioctl_group_add(struct file *file,
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, input->group);
group_add_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
-static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
+ struct inode *inode = d_inode(dentry);
struct ext4_inode_info *ei = EXT4_I(inode);
+ u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
- simple_fill_fsxattr(fa, ext4_iflags_to_xflags(ei->i_flags &
- EXT4_FL_USER_VISIBLE));
+ if (S_ISREG(inode->i_mode))
+ flags &= ~FS_PROJINHERIT_FL;
+ fileattr_fill_flags(fa, flags);
if (ext4_has_feature_project(inode->i_sb))
fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
+
+ return 0;
+}
+
+int ext4_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct file_kattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ u32 flags = fa->flags;
+ int err = -EOPNOTSUPP;
+
+ if (flags & ~EXT4_FL_USER_VISIBLE)
+ goto out;
+
+ /*
+ * chattr(1) grabs flags via GETFLAGS, modifies the result and
+ * passes that to SETFLAGS. So we cannot easily make SETFLAGS
+ * more restrictive than just silently masking off visible but
+ * not settable flags as we always did.
+ */
+ flags &= EXT4_FL_USER_MODIFIABLE;
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ goto out;
+ err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setflags(inode, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setproject(inode, fa->fsx_projid);
+out:
+ return err;
}
/* So that the fiemap access checks can't overflow on 32 bit machines. */
@@ -764,58 +1060,483 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
return error;
}
-long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
{
- struct inode *inode = file_inode(filp);
- struct super_block *sb = inode->i_sb;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned int flags;
+ int err = 0;
+ __u32 flags = 0;
+ unsigned int flush_flags = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
- ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
+ if (copy_from_user(&flags, (__u32 __user *)arg,
+ sizeof(__u32)))
+ return -EFAULT;
- switch (cmd) {
- case FS_IOC_GETFSMAP:
- return ext4_ioc_getfsmap(sb, (void __user *)arg);
- case EXT4_IOC_GETFLAGS:
- flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
- if (S_ISREG(inode->i_mode))
- flags &= ~EXT4_PROJINHERIT_FL;
- return put_user(flags, (int __user *) arg);
- case EXT4_IOC_SETFLAGS: {
- int err;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ /* check for invalid bits set */
+ if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) ||
+ ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+ return -EINVAL;
+
+ if (!EXT4_SB(sb)->s_journal)
+ return -ENODEV;
+
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
+ return -EOPNOTSUPP;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+ return 0;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD)
+ flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) {
+ flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT;
+ pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow");
+ }
+
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+ return err;
+}
+
+static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label)
+{
+ size_t len;
+ int ret = 0;
+ char new_label[EXT4_LABEL_MAX + 1];
+ struct super_block *sb = file_inode(filp)->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Copy the maximum length allowed for ext4 label with one more to
+ * find the required terminating null byte in order to test the
+ * label length. The on disk label doesn't need to be null terminated.
+ */
+ if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1))
+ return -EFAULT;
+
+ len = strnlen(new_label, EXT4_LABEL_MAX + 1);
+ if (len > EXT4_LABEL_MAX)
+ return -EINVAL;
+
+ /*
+ * Clear the buffer after the new label
+ */
+ memset(new_label + len, 0, EXT4_LABEL_MAX - len);
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label);
+
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label)
+{
+ char label[EXT4_LABEL_MAX + 1];
+
+ /*
+ * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because
+ * FSLABEL_MAX must include terminating null byte, while s_volume_name
+ * does not have to.
+ */
+ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
+
+ lock_buffer(sbi->s_sbh);
+ memtostr_pad(label, sbi->s_es->s_volume_name);
+ unlock_buffer(sbi->s_sbh);
+
+ if (copy_to_user(user_label, label, sizeof(label)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
+ struct fsuuid __user *ufsuuid)
+{
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
- if (get_user(flags, (int __user *) arg))
+ if (fsuuid.fsu_len == 0) {
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len,
+ sizeof(fsuuid.fsu_len)))
return -EFAULT;
+ return 0;
+ }
+
+ if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ lock_buffer(sbi->s_sbh);
+ memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
+ unlock_buffer(sbi->s_sbh);
+
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) ||
+ copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_setuuid(struct file *filp,
+ const struct fsuuid __user *ufsuuid)
+{
+ int ret = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * If any checksums (group descriptors or metadata) are being used
+ * then the checksum seed feature is required to change the UUID.
+ */
+ if (((ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb))
+ && !ext4_has_feature_csum_seed(sb))
+ || ext4_has_feature_stable_inodes(sb))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
- if (flags & ~EXT4_FL_USER_VISIBLE)
+ if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE))
+ return -EFAULT;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid);
+ mnt_drop_write_file(filp);
+
+ return ret;
+}
+
+
+#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \
+ EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \
+ EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \
+ EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \
+ EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \
+ EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \
+ EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \
+ EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \
+ EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \
+ EXT4_TUNE_FL_ENCODING_FLAGS)
+
+#define EXT4_TUNE_SET_COMPAT_SUPP \
+ (EXT4_FEATURE_COMPAT_DIR_INDEX | \
+ EXT4_FEATURE_COMPAT_STABLE_INODES)
+#define EXT4_TUNE_SET_INCOMPAT_SUPP \
+ (EXT4_FEATURE_INCOMPAT_EXTENTS | \
+ EXT4_FEATURE_INCOMPAT_EA_INODE | \
+ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR | \
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)
+#define EXT4_TUNE_SET_RO_COMPAT_SUPP \
+ (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \
+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+ EXT4_FEATURE_RO_COMPAT_PROJECT | \
+ EXT4_FEATURE_RO_COMPAT_VERITY)
+
+#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0)
+
+#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \
+ SB_ENC_NO_COMPAT_FALLBACK_FL)
+
+static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi,
+ struct ext4_tune_sb_params __user *params)
+{
+ struct ext4_tune_sb_params ret;
+ struct ext4_super_block *es = sbi->s_es;
+
+ memset(&ret, 0, sizeof(ret));
+ ret.set_flags = TUNE_OPS_SUPPORTED;
+ ret.errors_behavior = le16_to_cpu(es->s_errors);
+ ret.mnt_count = le16_to_cpu(es->s_mnt_count);
+ ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count);
+ ret.checkinterval = le32_to_cpu(es->s_checkinterval);
+ ret.last_check_time = le32_to_cpu(es->s_lastcheck);
+ ret.reserved_blocks = ext4_r_blocks_count(es);
+ ret.blocks_count = ext4_blocks_count(es);
+ ret.reserved_uid = ext4_get_resuid(es);
+ ret.reserved_gid = ext4_get_resgid(es);
+ ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts);
+ ret.def_hash_alg = es->s_def_hash_version;
+ ret.raid_stride = le16_to_cpu(es->s_raid_stride);
+ ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width);
+ ret.encoding = le16_to_cpu(es->s_encoding);
+ ret.encoding_flags = le16_to_cpu(es->s_encoding_flags);
+ strscpy_pad(ret.mount_opts, es->s_mount_opts);
+ ret.feature_compat = le32_to_cpu(es->s_feature_compat);
+ ret.feature_incompat = le32_to_cpu(es->s_feature_incompat);
+ ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat);
+ ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP;
+ ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP;
+ ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP;
+ ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP;
+ ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP;
+ ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP;
+ if (copy_to_user(params, &ret, sizeof(ret)))
+ return -EFAULT;
+ return 0;
+}
+
+static void ext4_sb_setparams(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ const struct ext4_tune_sb_params *params = arg;
+
+ if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR)
+ es->s_errors = cpu_to_le16(params->errors_behavior);
+ if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT)
+ es->s_mnt_count = cpu_to_le16(params->mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT)
+ es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL)
+ es->s_checkinterval = cpu_to_le32(params->checkinterval);
+ if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME)
+ es->s_lastcheck = cpu_to_le32(params->last_check_time);
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) {
+ ext4_fsblk_t blk = params->reserved_blocks;
+
+ es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) {
+ int uid = params->reserved_uid;
+
+ es->s_def_resuid = cpu_to_le16(uid & 0xFFFF);
+ es->s_def_resuid_hi = cpu_to_le16(uid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) {
+ int gid = params->reserved_gid;
+
+ es->s_def_resgid = cpu_to_le16(gid & 0xFFFF);
+ es->s_def_resgid_hi = cpu_to_le16(gid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS)
+ es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts);
+ if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ es->s_def_hash_version = params->def_hash_alg;
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE)
+ es->s_raid_stride = cpu_to_le16(params->raid_stride);
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH)
+ es->s_raid_stripe_width =
+ cpu_to_le32(params->raid_stripe_width);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING)
+ es->s_encoding = cpu_to_le16(params->encoding);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)
+ es->s_encoding_flags = cpu_to_le16(params->encoding_flags);
+ strscpy_pad(es->s_mount_opts, params->mount_opts);
+ if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ es->s_feature_compat |=
+ cpu_to_le32(params->set_feature_compat_mask);
+ es->s_feature_incompat |=
+ cpu_to_le32(params->set_feature_incompat_mask);
+ es->s_feature_ro_compat |=
+ cpu_to_le32(params->set_feature_ro_compat_mask);
+ es->s_feature_compat &=
+ ~cpu_to_le32(params->clear_feature_compat_mask);
+ es->s_feature_incompat &=
+ ~cpu_to_le32(params->clear_feature_incompat_mask);
+ es->s_feature_ro_compat &=
+ ~cpu_to_le32(params->clear_feature_ro_compat_mask);
+ if (params->set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)
+ es->s_def_hash_version = sbi->s_def_hash_version;
+ if (params->set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK)
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+}
+
+static int ext4_ioctl_set_tune_sb(struct file *filp,
+ struct ext4_tune_sb_params __user *in)
+{
+ struct ext4_tune_sb_params params;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int enabling_casefold = 0;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&params, in, sizeof(params)))
+ return -EFAULT;
+
+ if (strnlen(params.mount_opts, sizeof(params.mount_opts)) ==
+ sizeof(params.mount_opts))
+ return -E2BIG;
+
+ if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0)
+ return -EOPNOTSUPP;
+
+ if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) &&
+ (params.errors_behavior > EXT4_ERRORS_PANIC))
+ return -EINVAL;
+
+ if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) &&
+ (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) &&
+ ((params.def_hash_alg > DX_HASH_LAST) ||
+ (params.def_hash_alg == DX_HASH_SIPHASH)))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_FEATURES) &&
+ (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES))
+ return -EINVAL;
+
+ if (params.set_flags & EXT4_TUNE_FL_FEATURES) {
+ params.set_feature_compat_mask =
+ params.feature_compat &
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask =
+ params.feature_incompat &
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask =
+ params.feature_ro_compat &
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ params.clear_feature_compat_mask =
+ ~params.feature_compat &
+ le32_to_cpu(es->s_feature_compat);
+ params.clear_feature_incompat_mask =
+ ~params.feature_incompat &
+ le32_to_cpu(es->s_feature_incompat);
+ params.clear_feature_ro_compat_mask =
+ ~params.feature_ro_compat &
+ le32_to_cpu(es->s_feature_ro_compat);
+ params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ if ((params.set_feature_compat_mask &
+ ~EXT4_TUNE_SET_COMPAT_SUPP) ||
+ (params.set_feature_incompat_mask &
+ ~EXT4_TUNE_SET_INCOMPAT_SUPP) ||
+ (params.set_feature_ro_compat_mask &
+ ~EXT4_TUNE_SET_RO_COMPAT_SUPP) ||
+ (params.clear_feature_compat_mask &
+ ~EXT4_TUNE_CLEAR_COMPAT_SUPP) ||
+ (params.clear_feature_incompat_mask &
+ ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) ||
+ (params.clear_feature_ro_compat_mask &
+ ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP))
return -EOPNOTSUPP;
+
/*
- * chattr(1) grabs flags via GETFLAGS, modifies the result and
- * passes that to SETFLAGS. So we cannot easily make SETFLAGS
- * more restrictive than just silently masking off visible but
- * not settable flags as we always did.
+ * Filter out the features that are already set from
+ * the set_mask.
*/
- flags &= EXT4_FL_USER_MODIFIABLE;
- if (ext4_mask_flags(inode->i_mode, flags) != flags)
- return -EOPNOTSUPP;
+ params.set_feature_compat_mask &=
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask &=
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask &=
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ if ((params.set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)) {
+ enabling_casefold = 1;
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) {
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING;
+ }
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) {
+ params.encoding_flags = 0;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS;
+ }
+ }
+ if ((params.set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ uuid_t uu;
+
+ memcpy(&uu, sbi->s_hash_seed, UUID_SIZE);
+ if (uuid_is_null(&uu))
+ generate_random_uuid((char *)
+ &sbi->s_hash_seed);
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+ else if (sbi->s_def_hash_version == 0)
+ sbi->s_def_hash_version = DX_HASH_HALF_MD4;
+ if (!(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) &&
+ !(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) {
+#ifdef __CHAR_UNSIGNED__
+ sbi->s_hash_unsigned = 3;
+#else
+ sbi->s_hash_unsigned = 0;
+#endif
+ }
+ }
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding == 0)
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ else if (params.encoding != EXT4_ENC_UTF8_12_1)
+ return -EINVAL;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding_flags & ~SB_ENC_SUPP_MASK)
+ return -EINVAL;
+ }
- err = mnt_want_write_file(filp);
- if (err)
- return err;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
- inode_lock(inode);
- err = ext4_ioctl_check_immutable(inode,
- from_kprojid(&init_user_ns, ei->i_projid),
- flags);
- if (!err)
- err = ext4_ioctl_setflags(inode, flags);
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return err;
- }
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params);
+ mnt_drop_write_file(filp);
+
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+
+ return ret;
+}
+
+static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct mnt_idmap *idmap = file_mnt_idmap(filp);
+
+ ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
+ case FS_IOC_GETFSMAP:
+ return ext4_ioc_getfsmap(sb, (void __user *)arg);
case EXT4_IOC_GETVERSION:
case EXT4_IOC_GETVERSION_OLD:
return put_user(inode->i_generation, (int __user *) arg);
@@ -826,10 +1547,10 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
__u32 generation;
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(idmap, inode))
return -EPERM;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
@@ -851,7 +1572,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
@@ -890,20 +1612,21 @@ setversion_out:
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
group_extend_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
- struct fd donor;
int err;
if (!(filp->f_mode & FMODE_READ) ||
@@ -915,40 +1638,24 @@ group_extend_out:
return -EFAULT;
me.moved_len = 0;
- donor = fdget(me.donor_fd);
- if (!donor.file)
+ CLASS(fd, donor)(me.donor_fd);
+ if (fd_empty(donor))
return -EBADF;
- if (!(donor.file->f_mode & FMODE_WRITE)) {
- err = -EBADF;
- goto mext_out;
- }
-
- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Online defrag not supported with bigalloc");
- err = -EOPNOTSUPP;
- goto mext_out;
- } else if (IS_DAX(inode)) {
- ext4_msg(sb, KERN_ERR,
- "Online defrag not supported with DAX");
- err = -EOPNOTSUPP;
- goto mext_out;
- }
+ if (!(fd_file(donor)->f_mode & FMODE_WRITE))
+ return -EBADF;
err = mnt_want_write_file(filp);
if (err)
- goto mext_out;
+ return err;
- err = ext4_move_extents(filp, donor.file, me.orig_start,
+ err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
me.donor_start, me.len, &me.moved_len);
mnt_drop_write_file(filp);
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
-mext_out:
- fdput(donor);
return err;
}
@@ -965,7 +1672,7 @@ mext_out:
case EXT4_IOC_MIGRATE:
{
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(idmap, inode))
return -EACCES;
err = mnt_want_write_file(filp);
@@ -987,7 +1694,7 @@ mext_out:
case EXT4_IOC_ALLOC_DA_BLKS:
{
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(idmap, inode))
return -EACCES;
err = mnt_want_write_file(filp);
@@ -1006,7 +1713,7 @@ mext_out:
err = mnt_want_write_file(filp);
if (err)
return err;
- err = swap_inode_boot_loader(sb, inode);
+ err = swap_inode_boot_loader(sb, idmap, inode);
mnt_drop_write_file(filp);
return err;
}
@@ -1031,8 +1738,9 @@ mext_out:
err = ext4_resize_fs(sb, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -1044,20 +1752,21 @@ mext_out:
err = ext4_register_li_request(sb, o_group);
resizefs_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, true);
+ if (err == 0)
+ err = err2;
return err;
}
case FITRIM:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
/*
@@ -1071,8 +1780,6 @@ resizefs_out:
sizeof(range)))
return -EFAULT;
- range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
ret = ext4_trim_fs(sb, &range);
if (ret < 0)
return ret;
@@ -1084,54 +1791,23 @@ resizefs_out:
return 0;
}
case EXT4_IOC_PRECACHE_EXTENTS:
- return ext4_ext_precache(inode);
+ {
+ int ret;
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
+ inode_lock_shared(inode);
+ ret = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
+ return ret;
+ }
+ case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_FS_ENCRYPTION
- int err, err2;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- handle_t *handle;
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg);
- if (!ext4_has_feature_encrypt(sb))
- return -EOPNOTSUPP;
- if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
- err = mnt_want_write_file(filp);
- if (err)
- return err;
- handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto pwsalt_err_exit;
- }
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
- goto pwsalt_err_journal;
- generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
- err = ext4_handle_dirty_metadata(handle, NULL,
- sbi->s_sbh);
- pwsalt_err_journal:
- err2 = ext4_journal_stop(handle);
- if (err2 && !err)
- err = err2;
- pwsalt_err_exit:
- mnt_drop_write_file(filp);
- if (err)
- return err;
- }
- if (copy_to_user((void __user *) arg,
- sbi->s_es->s_encrypt_pw_salt, 16))
- return -EFAULT;
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
- }
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
@@ -1168,7 +1844,7 @@ resizefs_out:
case EXT4_IOC_CLEAR_ES_CACHE:
{
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(idmap, inode))
return -EACCES;
ext4_clear_inode_es(inode);
return 0;
@@ -1193,62 +1869,8 @@ resizefs_out:
case EXT4_IOC_GET_ES_CACHE:
return ext4_ioctl_get_es_cache(filp, arg);
- case EXT4_IOC_FSGETXATTR:
- {
- struct fsxattr fa;
-
- ext4_fill_fsxattr(inode, &fa);
-
- if (copy_to_user((struct fsxattr __user *)arg,
- &fa, sizeof(fa)))
- return -EFAULT;
- return 0;
- }
- case EXT4_IOC_FSSETXATTR:
- {
- struct fsxattr fa, old_fa;
- int err;
-
- if (copy_from_user(&fa, (struct fsxattr __user *)arg,
- sizeof(fa)))
- return -EFAULT;
-
- /* Make sure caller has proper permission */
- if (!inode_owner_or_capable(inode))
- return -EACCES;
-
- if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
- return -EOPNOTSUPP;
-
- flags = ext4_xflags_to_iflags(fa.fsx_xflags);
- if (ext4_mask_flags(inode->i_mode, flags) != flags)
- return -EOPNOTSUPP;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- inode_lock(inode);
- ext4_fill_fsxattr(inode, &old_fa);
- err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
- if (err)
- goto out;
- flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
- (flags & EXT4_FL_XFLAG_VISIBLE);
- err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags);
- if (err)
- goto out;
- err = ext4_ioctl_setflags(inode, flags);
- if (err)
- goto out;
- err = ext4_ioctl_setproject(filp, fa.fsx_projid);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return err;
- }
case EXT4_IOC_SHUTDOWN:
- return ext4_shutdown(sb, arg);
+ return ext4_ioctl_shutdown(sb, arg);
case FS_IOC_ENABLE_VERITY:
if (!ext4_has_feature_verity(sb))
@@ -1260,22 +1882,46 @@ out:
return -EOPNOTSUPP;
return fsverity_ioctl_measure(filp, (void __user *)arg);
+ case FS_IOC_READ_VERITY_METADATA:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_read_metadata(filp,
+ (const void __user *)arg);
+
+ case EXT4_IOC_CHECKPOINT:
+ return ext4_ioctl_checkpoint(filp, arg);
+
+ case FS_IOC_GETFSLABEL:
+ return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg);
+
+ case FS_IOC_SETFSLABEL:
+ return ext4_ioctl_setlabel(filp,
+ (const void __user *)arg);
+
+ case EXT4_IOC_GETFSUUID:
+ return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
+ case EXT4_IOC_SETFSUUID:
+ return ext4_ioctl_setuuid(filp, (const void __user *)arg);
+ case EXT4_IOC_GET_TUNE_SB_PARAM:
+ return ext4_ioctl_get_tune_sb(EXT4_SB(sb),
+ (void __user *)arg);
+ case EXT4_IOC_SET_TUNE_SB_PARAM:
+ return ext4_ioctl_set_tune_sb(filp, (void __user *)arg);
default:
return -ENOTTY;
}
}
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ return __ext4_ioctl(filp, cmd, arg);
+}
+
#ifdef CONFIG_COMPAT
long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case EXT4_IOC32_GETFLAGS:
- cmd = EXT4_IOC_GETFLAGS;
- break;
- case EXT4_IOC32_SETFLAGS:
- cmd = EXT4_IOC_SETFLAGS;
- break;
case EXT4_IOC32_GETVERSION:
cmd = EXT4_IOC_GETVERSION;
break;
@@ -1318,9 +1964,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_RESIZE_FS:
case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
- case EXT4_IOC_GET_ENCRYPTION_PWSALT:
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
case FS_IOC_GET_ENCRYPTION_POLICY_EX:
case FS_IOC_ADD_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY:
@@ -1331,11 +1977,15 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_GETFSMAP:
case FS_IOC_ENABLE_VERITY:
case FS_IOC_MEASURE_VERITY:
+ case FS_IOC_READ_VERITY_METADATA:
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
- case EXT4_IOC_FSGETXATTR:
- case EXT4_IOC_FSSETXATTR:
+ case EXT4_IOC_CHECKPOINT:
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
+ case EXT4_IOC_GETFSUUID:
+ case EXT4_IOC_SETFSUUID:
break;
default:
return -ENOIOCTLCMD;
@@ -1343,3 +1993,22 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
+
+static void set_overhead(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
+}
+
+int ext4_update_overhead(struct super_block *sb, bool force)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (ext4_emergency_state(sb) || sb_rdonly(sb))
+ return 0;
+ if (!force &&
+ (sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)))
+ return 0;
+ return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
+}
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
new file mode 100644
index 000000000000..a9416b20ff64
--- /dev/null
+++ b/fs/ext4/mballoc-test.c
@@ -0,0 +1,999 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 multiblocks allocation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/static_stub.h>
+#include <linux/random.h>
+
+#include "ext4.h"
+
+struct mbt_grp_ctx {
+ struct buffer_head bitmap_bh;
+ /* desc and gd_bh are just the place holders for now */
+ struct ext4_group_desc desc;
+ struct buffer_head gd_bh;
+};
+
+struct mbt_ctx {
+ struct mbt_grp_ctx *grp_ctx;
+};
+
+struct mbt_ext4_super_block {
+ struct ext4_super_block es;
+ struct ext4_sb_info sbi;
+ struct mbt_ctx mbt_ctx;
+};
+
+#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi))
+#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx)
+#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+
+static struct inode *mbt_alloc_inode(struct super_block *sb)
+{
+ struct ext4_inode_info *ei;
+
+ ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL);
+ if (!ei)
+ return NULL;
+
+ INIT_LIST_HEAD(&ei->i_orphan);
+ init_rwsem(&ei->xattr_sem);
+ init_rwsem(&ei->i_data_sem);
+ inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
+
+ return &ei->vfs_inode;
+}
+
+static void mbt_free_inode(struct inode *inode)
+{
+ kfree(EXT4_I(inode));
+}
+
+static const struct super_operations mbt_sops = {
+ .alloc_inode = mbt_alloc_inode,
+ .free_inode = mbt_free_inode,
+};
+
+static void mbt_kill_sb(struct super_block *sb)
+{
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type mbt_fs_type = {
+ .name = "mballoc test",
+ .kill_sb = mbt_kill_sb,
+};
+
+static int mbt_mb_init(struct super_block *sb)
+{
+ ext4_fsblk_t block;
+ int ret;
+
+ /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
+ sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL);
+ if (sb->s_bdev == NULL)
+ return -ENOMEM;
+
+ sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL);
+ if (sb->s_bdev->bd_queue == NULL) {
+ kfree(sb->s_bdev);
+ return -ENOMEM;
+ }
+
+ /*
+ * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache =
+ * new_inode(sb);
+ */
+ INIT_LIST_HEAD(&sb->s_inodes);
+ sb->s_op = &mbt_sops;
+
+ ret = ext4_mb_init(sb);
+ if (ret != 0)
+ goto err_out;
+
+ block = ext4_count_free_clusters(sb);
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_mb_release;
+
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_freeclusters;
+
+ return 0;
+
+err_freeclusters:
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+err_mb_release:
+ ext4_mb_release(sb);
+err_out:
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+ return ret;
+}
+
+static void mbt_mb_release(struct super_block *sb)
+{
+ percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter);
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+ ext4_mb_release(sb);
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+}
+
+static int mbt_set(struct super_block *sb, void *data)
+{
+ return 0;
+}
+
+static struct super_block *mbt_ext4_alloc_super_block(void)
+{
+ struct mbt_ext4_super_block *fsb;
+ struct super_block *sb;
+ struct ext4_sb_info *sbi;
+
+ fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+ if (fsb == NULL)
+ return NULL;
+
+ sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+ if (IS_ERR(sb))
+ goto out;
+
+ sbi = &fsb->sbi;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock)
+ goto out_deactivate;
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ sbi->s_es = &fsb->es;
+ sbi->s_sb = sb;
+ sb->s_fs_info = sbi;
+
+ up_write(&sb->s_umount);
+ return sb;
+
+out_deactivate:
+ deactivate_locked_super(sb);
+out:
+ kfree(fsb);
+ return NULL;
+}
+
+static void mbt_ext4_free_super_block(struct super_block *sb)
+{
+ struct mbt_ext4_super_block *fsb = MBT_SB(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ kfree(sbi->s_blockgroup_lock);
+ deactivate_super(sb);
+ kfree(fsb);
+}
+
+struct mbt_ext4_block_layout {
+ unsigned char blocksize_bits;
+ unsigned int cluster_bits;
+ uint32_t blocks_per_group;
+ ext4_group_t group_count;
+ uint16_t desc_size;
+};
+
+static void mbt_init_sb_layout(struct super_block *sb,
+ struct mbt_ext4_block_layout *layout)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sb->s_blocksize = 1UL << layout->blocksize_bits;
+ sb->s_blocksize_bits = layout->blocksize_bits;
+
+ sbi->s_groups_count = layout->group_count;
+ sbi->s_blocks_per_group = layout->blocks_per_group;
+ sbi->s_cluster_bits = layout->cluster_bits;
+ sbi->s_cluster_ratio = 1U << layout->cluster_bits;
+ sbi->s_clusters_per_group = layout->blocks_per_group >>
+ layout->cluster_bits;
+ sbi->s_desc_size = layout->desc_size;
+ sbi->s_desc_per_block_bits =
+ sb->s_blocksize_bits - (fls(layout->desc_size) - 1);
+ sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits;
+
+ es->s_first_data_block = cpu_to_le32(0);
+ es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
+ layout->group_count);
+}
+
+static int mbt_grp_ctx_init(struct super_block *sb,
+ struct mbt_grp_ctx *grp_ctx)
+{
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+
+ grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
+ if (grp_ctx->bitmap_bh.b_data == NULL)
+ return -ENOMEM;
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max);
+ ext4_free_group_clusters_set(sb, &grp_ctx->desc, max);
+
+ return 0;
+}
+
+static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx)
+{
+ kfree(grp_ctx->bitmap_bh.b_data);
+ grp_ctx->bitmap_bh.b_data = NULL;
+}
+
+static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
+ unsigned int start, unsigned int len)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
+}
+
+static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ return grp_ctx->bitmap_bh.b_data;
+}
+
+/* called after mbt_init_sb_layout */
+static int mbt_ctx_init(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx),
+ GFP_KERNEL);
+ if (ctx->grp_ctx == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ngroups; i++)
+ if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i]))
+ goto out;
+
+ /*
+ * first data block(first cluster in first group) is used by
+ * metadata, mark it used to avoid to alloc data block at first
+ * block which will fail ext4_sb_block_valid check.
+ */
+ mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+ ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc,
+ EXT4_CLUSTERS_PER_GROUP(sb) - 1);
+
+ return 0;
+out:
+ while (i-- > 0)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+ return -ENOMEM;
+}
+
+static void mbt_ctx_release(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ for (i = 0; i < ngroups; i++)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+}
+
+static struct buffer_head *
+ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ /* paired with brelse from caller of ext4_read_block_bitmap_nowait */
+ get_bh(&grp_ctx->bitmap_bh);
+ return &grp_ctx->bitmap_bh;
+}
+
+static int ext4_wait_block_bitmap_stub(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ /*
+ * real ext4_wait_block_bitmap will set these flags and
+ * functions like ext4_mb_init_cache will verify the flags.
+ */
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ set_buffer_verified(bh);
+ return 0;
+}
+
+static struct ext4_group_desc *
+ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head **bh)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ if (bh != NULL)
+ *bh = &grp_ctx->gd_bh;
+
+ return &grp_ctx->desc;
+}
+
+static int
+ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags,
+ ext4_grpblk_t *ret_changed)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+ struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh;
+
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+
+ return 0;
+}
+
+#define TEST_GOAL_GROUP 1
+static int mbt_kunit_init(struct kunit *test)
+{
+ struct mbt_ext4_block_layout *layout =
+ (struct mbt_ext4_block_layout *)(test->param_value);
+ struct super_block *sb;
+ int ret;
+
+ sb = mbt_ext4_alloc_super_block();
+ if (sb == NULL)
+ return -ENOMEM;
+
+ mbt_init_sb_layout(sb, layout);
+
+ ret = mbt_ctx_init(sb);
+ if (ret != 0) {
+ mbt_ext4_free_super_block(sb);
+ return ret;
+ }
+
+ test->priv = sb;
+ kunit_activate_static_stub(test,
+ ext4_read_block_bitmap_nowait,
+ ext4_read_block_bitmap_nowait_stub);
+ kunit_activate_static_stub(test,
+ ext4_wait_block_bitmap,
+ ext4_wait_block_bitmap_stub);
+ kunit_activate_static_stub(test,
+ ext4_get_group_desc,
+ ext4_get_group_desc_stub);
+ kunit_activate_static_stub(test,
+ ext4_mb_mark_context,
+ ext4_mb_mark_context_stub);
+
+ /* stub function will be called in mbt_mb_init->ext4_mb_init */
+ if (mbt_mb_init(sb) != 0) {
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mbt_kunit_exit(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+
+ mbt_mb_release(sb);
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+}
+
+static void test_new_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode *inode;
+ struct ext4_allocation_request ar;
+ ext4_group_t i, goal_group = TEST_GOAL_GROUP;
+ int err = 0;
+ ext4_fsblk_t found;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+
+ inode->i_sb = sb;
+ ar.inode = inode;
+
+ /* get block at goal */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal, found,
+ "failed to alloc block at goal, expected %llu found %llu",
+ ar.goal, found);
+
+ /* get block after goal in goal group */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block after goal in goal group, expected %llu found %llu",
+ ar.goal + 1, found);
+
+ /* get block after goal group */
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, goal_group + 1), found,
+ "failed to alloc block after goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, goal_group + 1), found);
+
+ /* get block before goal group */
+ for (i = goal_group; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block before goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found);
+
+ /* no block available, fail to allocate block */
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_NE_MSG(test, err, 0,
+ "unexpectedly get block when no block is available");
+}
+
+#define TEST_RANGE_COUNT 8
+
+struct test_range {
+ ext4_grpblk_t start;
+ ext4_grpblk_t len;
+};
+
+static void
+mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges,
+ int count)
+{
+ ext4_grpblk_t start, len, max;
+ int i;
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb) / count;
+ for (i = 0; i < count; i++) {
+ start = get_random_u32() % max;
+ len = get_random_u32() % max;
+ len = min(len, max - start);
+
+ ranges[i].start = start + i * max;
+ ranges[i].len = len;
+ }
+}
+
+static void
+validate_free_blocks_simple(struct kunit *test, struct super_block *sb,
+ ext4_group_t goal_group, ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ void *bitmap;
+ ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++) {
+ if (i == goal_group)
+ continue;
+
+ bitmap = mbt_ctx_bitmap(sb, i);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ_MSG(test, bit, max,
+ "free block on unexpected group %d", i);
+ }
+
+ bitmap = mbt_ctx_bitmap(sb, goal_group);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, bit, start);
+
+ bit = mb_find_next_bit(bitmap, max, bit + 1);
+ KUNIT_ASSERT_EQ(test, bit, start + len);
+}
+
+static void
+test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group,
+ ext4_grpblk_t start, ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode;
+ ext4_fsblk_t block;
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ if (len == 0)
+ return;
+
+ block = ext4_group_first_block_no(sb, goal_group) +
+ EXT4_C2B(sbi, start);
+ ext4_free_blocks_simple(inode, block, len);
+ validate_free_blocks_simple(test, sb, goal_group, start, len);
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+}
+
+static void test_free_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, max);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_free_blocks_simple_range(test, TEST_GOAL_GROUP,
+ ranges[i].start, ranges[i].len);
+}
+
+static void
+test_mark_diskspace_used_range(struct kunit *test,
+ struct ext4_allocation_context *ac,
+ ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int ret;
+ void *bitmap;
+ ext4_grpblk_t i, max;
+
+ /* ext4_mb_mark_diskspace_used will BUG if len is 0 */
+ if (len == 0)
+ return;
+
+ ac->ac_b_ex.fe_group = TEST_GOAL_GROUP;
+ ac->ac_b_ex.fe_start = start;
+ ac->ac_b_ex.fe_len = len;
+
+ bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
+ memset(bitmap, 0, sb->s_blocksize);
+ ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ i = mb_find_next_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, i, start);
+ i = mb_find_next_zero_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, i, start + len);
+ i = mb_find_next_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, max, i);
+}
+
+static void test_mark_diskspace_used(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode *inode;
+ struct ext4_allocation_context ac;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ ac.ac_status = AC_STATUS_FOUND;
+ ac.ac_sb = sb;
+ ac.ac_inode = inode;
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mark_diskspace_used_range(test, &ac, ranges[i].start,
+ ranges[i].len);
+}
+
+static void mbt_generate_buddy(struct super_block *sb, void *buddy,
+ void *bitmap, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ uint32_t order, off;
+ void *bb, *bb_h;
+ int max;
+
+ memset(buddy, 0xff, sb->s_blocksize);
+ memset(grp, 0, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]));
+
+ bb = bitmap;
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ bb_h = buddy + sbi->s_mb_offsets[1];
+
+ off = mb_find_next_zero_bit(bb, max, 0);
+ grp->bb_first_free = off;
+ while (off < max) {
+ grp->bb_counters[0]++;
+ grp->bb_free++;
+
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ grp->bb_free++;
+ grp->bb_counters[0]--;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[1]++;
+ grp->bb_largest_free_order = 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+
+ for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) {
+ bb = buddy + sbi->s_mb_offsets[order];
+ bb_h = buddy + sbi->s_mb_offsets[order + 1];
+ max = max >> 1;
+ off = mb_find_next_zero_bit(bb, max, 0);
+
+ while (off < max) {
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ mb_set_bits(bb, off, 2);
+ grp->bb_counters[order] -= 2;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[order + 1]++;
+ grp->bb_largest_free_order = order + 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+ }
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ off = mb_find_next_zero_bit(bitmap, max, 0);
+ while (off < max) {
+ grp->bb_fragments++;
+
+ off = mb_find_next_bit(bitmap, max, off + 1);
+ if (off + 1 >= max)
+ break;
+
+ off = mb_find_next_zero_bit(bitmap, max, off + 1);
+ }
+}
+
+static void
+mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1,
+ struct ext4_group_info *grp2)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ KUNIT_ASSERT_EQ(test, grp1->bb_first_free,
+ grp2->bb_first_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_fragments,
+ grp2->bb_fragments);
+ KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order,
+ grp2->bb_largest_free_order);
+
+ for (i = 1; i < MB_NUM_ORDERS(sb); i++) {
+ KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i],
+ grp2->bb_counters[i],
+ "bb_counters[%d] diffs, expected %d, generated %d",
+ i, grp1->bb_counters[i],
+ grp2->bb_counters[i]);
+ }
+}
+
+static void
+do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
+ void *mbt_buddy, struct ext4_group_info *mbt_grp,
+ void *ext4_buddy, struct ext4_group_info *ext4_grp)
+{
+ int i;
+
+ mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp);
+
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ ext4_grp->bb_counters[i] = 0;
+ /* needed by validation in ext4_mb_generate_buddy */
+ ext4_grp->bb_free = mbt_grp->bb_free;
+ memset(ext4_buddy, 0xff, sb->s_blocksize);
+ ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
+ ext4_grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, mbt_grp, ext4_grp);
+}
+
+static void test_mb_generate_buddy(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *expected_bb, *generate_bb;
+ struct ext4_group_info *expected_grp, *generate_grp;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb);
+ generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb);
+ expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp);
+ generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP);
+ KUNIT_ASSERT_NOT_NULL(test, generate_grp);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ mb_set_bits(bitmap, ranges[i].start, ranges[i].len);
+ do_test_generate_buddy(test, sb, bitmap, expected_bb,
+ expected_grp, generate_bb, generate_grp);
+ }
+}
+
+static void
+test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int i;
+
+ /* mb_mark_used only accepts non-zero len */
+ if (len == 0)
+ return;
+
+ ex.fe_start = start;
+ ex.fe_len = len;
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ mb_set_bits(bitmap, start, len);
+ /* bypass bb_free validatoin in ext4_mb_generate_buddy */
+ grp->bb_free -= len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+}
+
+static void test_mb_mark_used(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_mark_used_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static void
+test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ /* mb_free_blocks will WARN if len is 0 */
+ if (len == 0)
+ return;
+
+ ext4_lock_group(sb, e4b->bd_group);
+ mb_free_blocks(NULL, e4b, start, len);
+ ext4_unlock_group(sb, e4b->bd_group);
+
+ mb_clear_bits(bitmap, start, len);
+ /* bypass bb_free validatoin in ext4_mb_generate_buddy */
+ grp->bb_free += len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+
+}
+
+static void test_mb_free_blocks(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ struct ext4_free_extent ex;
+ int ret;
+ int i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_start = 0;
+ ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb);
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ grp->bb_free = 0;
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
+ memset(bitmap, 0xff, sb->s_blocksize);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_free_blocks_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
+#define COUNT_FOR_ESTIMATE 100000
+static void test_mb_mark_used_cost(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i, j;
+ unsigned long start, end, all = 0;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_group = TEST_GOAL_GROUP;
+ for (j = 0; j < COUNT_FOR_ESTIMATE; j++) {
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ start = jiffies;
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ex.fe_start = ranges[i].start;
+ ex.fe_len = ranges[i].len;
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ end = jiffies;
+ all += (end - start);
+
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_free_blocks(NULL, &e4b, ranges[i].start,
+ ranges[i].len);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ }
+
+ kunit_info(test, "costed jiffies %lu\n", all);
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
+ {
+ .blocksize_bits = 10,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 12,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 16,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+};
+
+static void mbt_show_layout(const struct mbt_ext4_block_layout *layout,
+ char *desc)
+{
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d "
+ "blocks_per_group=%d group_count=%d desc_size=%d\n",
+ layout->blocksize_bits, layout->cluster_bits,
+ layout->blocks_per_group, layout->group_count,
+ layout->desc_size);
+}
+KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
+
+static struct kunit_case mbt_test_cases[] = {
+ KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params,
+ { .speed = KUNIT_SPEED_SLOW }),
+ {}
+};
+
+static struct kunit_suite mbt_test_suite = {
+ .name = "ext4_mballoc_test",
+ .init = mbt_kunit_init,
+ .exit = mbt_kunit_exit,
+ .test_cases = mbt_test_cases,
+};
+
+kunit_test_suites(&mbt_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9083113a8c0..56d50fd3310b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -16,7 +16,9 @@
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
+#include <linux/freezer.h>
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
/*
* MUSTDO:
@@ -96,14 +98,14 @@
* block bitmap and buddy information. The information are stored in the
* inode as:
*
- * { page }
+ * { folio }
* [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
- * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
- * blocksize) blocks. So it can have information regarding groups_per_page
- * which is blocks_per_page/2
+ * take up 2 blocks. A folio can contain blocks_per_folio (folio_size /
+ * blocksize) blocks. So it can have information regarding groups_per_folio
+ * which is blocks_per_folio/2
*
* The buddy cache inode is not stored on disk. The inode is thrown
* away when the filesystem is unmounted.
@@ -124,14 +126,73 @@
* /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
* greater than the default mb_group_prealloc.
*
+ * If "mb_optimize_scan" mount option is set, we maintain in memory group info
+ * structures in two data structures:
+ *
+ * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders)
+ *
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
+ *
+ * This is an array of xarrays where the index in the array represents the
+ * largest free order in the buddy bitmap of the participating group infos of
+ * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total
+ * number of buddy bitmap orders possible) number of xarrays. Group-infos are
+ * placed in appropriate xarrays.
+ *
+ * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size)
+ *
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
+ *
+ * This is an array of xarrays where in the i-th xarray there are groups with
+ * average fragment size >= 2^i and < 2^(i+1). The average fragment size
+ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
+ * Note that we don't bother with a special xarray for completely empty
+ * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed
+ * in appropriate xarrays.
+ *
+ * In xarray, the index is the block group number, the value is the block group
+ * information, and a non-empty value indicates the block group is present in
+ * the current xarray.
+ *
+ * When "mb_optimize_scan" mount option is set, mballoc consults the above data
+ * structures to decide the order in which groups are to be traversed for
+ * fulfilling an allocation request.
+ *
+ * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order
+ * >= the order of the request. We directly look at the largest free order list
+ * in the data structure (1) above where largest_free_order = order of the
+ * request. If that list is empty, we look at remaining list in the increasing
+ * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
+ * lookup in O(1) time.
+ *
+ * At CR_GOAL_LEN_FAST, we only consider groups where
+ * average fragment size > request size. So, we lookup a group which has average
+ * fragment size just above or equal to request size using our average fragment
+ * size group lists (data structure 2) in O(1) time.
+ *
+ * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
+ * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
+ * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
+ * fragment size > goal length. So before falling to the slower
+ * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
+ * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
+ * enough average fragment size. This increases the chances of finding a
+ * suitable block group in O(1) time and results in faster allocation at the
+ * cost of reduced size of allocation.
+ *
+ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
+ * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
+ * CR_GOAL_LEN_FAST phase.
+ *
* The regular allocator (using the buddy cache) supports a few tunables.
*
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
* /sys/fs/ext4/<partition>/mb_order2_req
+ * /sys/fs/ext4/<partition>/mb_max_linear_groups
*
* The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
@@ -149,6 +210,16 @@
* can be used for allocation. ext4_mb_good_group explains how the groups are
* checked.
*
+ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
+ * get traversed linearly. That may result in subsequent allocations being not
+ * close to each other. And so, the underlying device may get filled up in a
+ * non-linear fashion. While that may not matter on non-rotational devices, for
+ * rotational devices that may result in higher seek times. "mb_max_linear_groups"
+ * tells mballoc how many groups mballoc should search linearly before
+ * performing consulting above data structures for more efficient lookups. For
+ * non rotational devices, this value defaults to 0 and for rotational devices
+ * this is set to MB_DEFAULT_LINEAR_LIMIT.
+ *
* Both the prealloc space are getting populated as above. So for the first
* request we will hit the buddy cache which will result in this prealloc
* space getting filled. The prealloc space is then later used for the
@@ -299,6 +370,8 @@
* - bitlock on a group (group)
* - object (inode/locality) (object)
* - per-pa lock (pa)
+ * - cr_power2_aligned lists lock (cr_power2_aligned)
+ * - cr_goal_len_fast lists lock (cr_goal_len_fast)
*
* Paths:
* - new pa
@@ -328,6 +401,9 @@
* group
* object
*
+ * - allocation path (ext4_mb_regular_allocator)
+ * group
+ * cr_power2_aligned/cr_goal_len_fast
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
@@ -347,10 +423,15 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group);
+
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks);
+
/*
* The algorithm using this percpu seq counter goes below:
* 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -488,14 +569,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing block already freed "
"(bit %u)",
first + i);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
}
@@ -601,7 +682,25 @@ do { \
} \
} while (0)
-static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+/*
+ * Perform buddy integrity check with the following steps:
+ *
+ * 1. Top-down validation (from highest order down to order 1, excluding order-0 bitmap):
+ * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block),
+ * at most one of the two corresponding lower-order bits may be clear (free).
+ *
+ * 2. Order-0 (bitmap) validation, performed on bit pairs:
+ * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits
+ * must not be free (0).
+ * - If both bits in a pair are clear (0, free), then exactly one of the corresponding
+ * higher-order bits must be free (0).
+ *
+ * 3. Preallocation (pa) list validation:
+ * For each preallocated block (pa) in the group:
+ * - Verify that pa_pstart falls within the bounds of this block group.
+ * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1).
+ */
+static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
const char *function, int line)
{
struct super_block *sb = e4b->bd_sb;
@@ -619,11 +718,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy;
void *buddy2;
- {
- static int mb_check_counter;
- if (mb_check_counter++ % 100 != 0)
- return 0;
- }
+ if (e4b->bd_info->bb_check_counter++ % 10)
+ return;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -637,26 +733,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (i = 0; i < max; i++) {
if (mb_test_bit(i, buddy)) {
- /* only single bit in buddy2 may be 1 */
+ /* only single bit in buddy2 may be 0 */
if (!mb_test_bit(i << 1, buddy2)) {
MB_CHECK_ASSERT(
mb_test_bit((i<<1)+1, buddy2));
- } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
- MB_CHECK_ASSERT(
- mb_test_bit(i << 1, buddy2));
}
continue;
}
- /* both bits in buddy2 must be 1 */
- MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
- MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
-
- for (j = 0; j < (1 << order); j++) {
- k = (i * (1 << order)) + j;
- MB_CHECK_ASSERT(
- !mb_test_bit(k, e4b->bd_bitmap));
- }
count++;
}
MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
@@ -672,31 +756,40 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
fragments++;
fstart = i;
}
- continue;
+ } else {
+ fstart = -1;
}
- fstart = -1;
- /* check used bits only */
- for (j = 0; j < e4b->bd_blkbits + 1; j++) {
- buddy2 = mb_find_buddy(e4b, j, &max2);
- k = i >> j;
- MB_CHECK_ASSERT(k < max2);
- MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
+ if (!(i & 1)) {
+ int in_use, zero_bit_count = 0;
+
+ in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy);
+ for (j = 1; j < e4b->bd_blkbits + 2; j++) {
+ buddy2 = mb_find_buddy(e4b, j, &max2);
+ k = i >> j;
+ MB_CHECK_ASSERT(k < max2);
+ if (!mb_test_bit(k, buddy2))
+ zero_bit_count++;
+ }
+ MB_CHECK_ASSERT(zero_bit_count == !in_use);
}
}
MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group);
+ if (!grp)
+ return;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+ if (!pa->pa_len)
+ continue;
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
MB_CHECK_ASSERT(groupnr == e4b->bd_group);
for (i = 0; i < pa->pa_len; i++)
MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
}
- return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
@@ -747,6 +840,392 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
+static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
+{
+ int order;
+
+ /*
+ * We don't bother with a special lists groups with only 1 block free
+ * extents and for completely empty groups.
+ */
+ order = fls(len) - 2;
+ if (order < 0)
+ return 0;
+ if (order == MB_NUM_ORDERS(sb))
+ order--;
+ if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
+ order = MB_NUM_ORDERS(sb) - 1;
+ return order;
+}
+
+/* Move group to appropriate avg_fragment_size list */
+static void
+mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int new, old;
+
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN))
+ return;
+
+ old = grp->bb_avg_fragment_size_order;
+ new = grp->bb_fragments == 0 ? -1 :
+ mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments);
+ if (new == old)
+ return;
+
+ if (old >= 0)
+ xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group);
+
+ grp->bb_avg_fragment_size_order = new;
+ if (new >= 0) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fails, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d",
+ grp->bb_group, new, err);
+ }
+}
+
+static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
+ struct xarray *xa,
+ ext4_group_t start, ext4_group_t end)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ enum criteria cr = ac->ac_criteria;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ unsigned long group = start;
+ struct ext4_group_info *grp;
+
+ if (WARN_ON_ONCE(end > ngroups || start >= end))
+ return 0;
+
+ xa_for_each_range(xa, group, grp, start, end - 1) {
+ int err;
+
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
+
+ err = ext4_mb_scan_group(ac, grp->bb_group);
+ if (err || ac->ac_status != AC_STATUS_CONTINUE)
+ return err;
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/*
+ * Find a suitable group of given order from the largest free orders xarray.
+ */
+static inline int
+ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
+{
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order];
+
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
+}
+
+/*
+ * Choose next group by traversing largest_free_order lists. Updates *new_cr if
+ * cr level needs an update.
+ */
+static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int i;
+ int ret = 0;
+ ext4_group_t start, end;
+
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
+ for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
+ }
+
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+
+ /* Increment cr and search again if no group is found */
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
+ return ret;
+}
+
+/*
+ * Find a suitable group of given order from the average fragments xarray.
+ */
+static int
+ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
+{
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order];
+
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
+}
+
+/*
+ * Choose next group by traversing average fragment size list of suitable
+ * order. Updates *new_cr if cr level needs an update.
+ */
+static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int i, ret = 0;
+ ext4_group_t start, end;
+
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
+ i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+ for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
+ }
+
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+ /*
+ * CR_BEST_AVAIL_LEN works based on the concept that we have
+ * a larger normalized goal len request which can be trimmed to
+ * a smaller goal len such that it can still satisfy original
+ * request len. However, allocation request for non-regular
+ * files never gets normalized.
+ * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
+ */
+ if (ac->ac_flags & EXT4_MB_HINT_DATA)
+ ac->ac_criteria = CR_BEST_AVAIL_LEN;
+ else
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
+}
+
+/*
+ * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
+ * order we have and proactively trim the goal request length to that order to
+ * find a suitable group faster.
+ *
+ * This optimizes allocation speed at the cost of slightly reduced
+ * preallocations. However, we make sure that we don't trim the request too
+ * much and fall to CR_GOAL_LEN_SLOW in that case.
+ */
+static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ int ret = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int i, order, min_order;
+ unsigned long num_stripe_clusters = 0;
+ ext4_group_t start, end;
+
+ /*
+ * mb_avg_fragment_size_order() returns order in a way that makes
+ * retrieving back the length using (1 << order) inaccurate. Hence, use
+ * fls() instead since we need to know the actual length while modifying
+ * goal length.
+ */
+ order = fls(ac->ac_g_ex.fe_len) - 1;
+ if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb)))
+ order = MB_NUM_ORDERS(ac->ac_sb);
+ min_order = order - sbi->s_mb_best_avail_max_trim_order;
+ if (min_order < 0)
+ min_order = 0;
+
+ if (sbi->s_stripe > 0) {
+ /*
+ * We are assuming that stripe size is always a multiple of
+ * cluster ratio otherwise __ext4_fill_super exists early.
+ */
+ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
+ if (1 << min_order < num_stripe_clusters)
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
+ }
+
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
+ for (i = order; i >= min_order; i--) {
+ int frag_order;
+ /*
+ * Scale down goal len to make sure we find something
+ * in the free fragments list. Basically, reduce
+ * preallocations.
+ */
+ ac->ac_g_ex.fe_len = 1 << i;
+
+ if (num_stripe_clusters > 0) {
+ /*
+ * Try to round up the adjusted goal length to
+ * stripe size (in cluster units) multiple for
+ * efficiency.
+ */
+ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
+ num_stripe_clusters);
+ }
+
+ frag_order = mb_avg_fragment_size_order(ac->ac_sb,
+ ac->ac_g_ex.fe_len);
+
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
+ }
+
+ /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
+ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
+}
+
+static inline int should_optimize_scan(struct ext4_allocation_context *ac)
+{
+ if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
+ return 0;
+ if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
+ return 0;
+ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
+ return 0;
+ return 1;
+}
+
+/*
+ * next linear group for allocation.
+ */
+static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups)
+{
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ *group = *group + 1 >= ngroups ? 0 : *group + 1;
+}
+
+static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac,
+ ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count)
+{
+ int ret, i;
+ enum criteria cr = ac->ac_criteria;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group = *start;
+
+ for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) {
+ ret = ext4_mb_scan_group(ac, group);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ cond_resched();
+ }
+
+ *start = group;
+ if (count == ngroups)
+ ac->ac_criteria++;
+
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+
+ return 0;
+}
+
+static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
+{
+ int ret = 0;
+ ext4_group_t start;
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = sbi->s_blockfile_groups;
+
+ /* searching for the right group start from the goal value specified */
+ start = ac->ac_g_ex.fe_group;
+ ac->ac_prefetch_grp = start;
+ ac->ac_prefetch_nr = 0;
+
+ if (!should_optimize_scan(ac))
+ return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups);
+
+ /*
+ * Optimized scanning can return non adjacent groups which can cause
+ * seek overhead for rotational disks. So try few linear groups before
+ * trying optimized scan.
+ */
+ if (sbi->s_mb_max_linear_groups)
+ ret = ext4_mb_scan_groups_linear(ac, ngroups, &start,
+ sbi->s_mb_max_linear_groups);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+
+ switch (ac->ac_criteria) {
+ case CR_POWER2_ALIGNED:
+ return ext4_mb_scan_groups_p2_aligned(ac, start);
+ case CR_GOAL_LEN_FAST:
+ return ext4_mb_scan_groups_goal_fast(ac, start);
+ case CR_BEST_AVAIL_LEN:
+ return ext4_mb_scan_groups_best_avail(ac, start);
+ default:
+ /*
+ * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
+ * rb tree sorted by bb_free. But until that happens, we should
+ * never come here.
+ */
+ WARN_ON(1);
+ }
+
+ return 0;
+}
+
/*
* Cache the order of the largest free extent we have available in this block
* group.
@@ -754,25 +1233,44 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
- int i;
- int bits;
-
- grp->bb_largest_free_order = -1; /* uninit */
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int new, old = grp->bb_largest_free_order;
- bits = sb->s_blocksize_bits + 1;
- for (i = bits; i >= 0; i--) {
- if (grp->bb_counters[i] > 0) {
- grp->bb_largest_free_order = i;
+ for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--)
+ if (grp->bb_counters[new] > 0)
break;
- }
+
+ /* No need to move between order lists? */
+ if (new == old)
+ return;
+
+ if (old >= 0) {
+ struct xarray *xa = &sbi->s_mb_largest_free_orders[old];
+
+ if (!xa_empty(xa) && xa_load(xa, grp->bb_group))
+ xa_erase(xa, grp->bb_group);
+ }
+
+ grp->bb_largest_free_order = new;
+ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fails, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_largest_free_orders[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d",
+ grp->bb_group, new, err);
}
}
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
- void *buddy, void *bitmap, ext4_group_t group)
+ void *buddy, void *bitmap, ext4_group_t group,
+ struct ext4_group_info *grp)
{
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
@@ -815,14 +1313,13 @@ void ext4_mb_generate_buddy(struct super_block *sb,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_set_largest_free_order(sb, grp);
+ mb_update_avg_fragment_size(sb, grp);
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
period = get_cycles() - period;
- spin_lock(&sbi->s_bal_lock);
- sbi->s_mb_buddies_generated++;
- sbi->s_mb_generation_time += period;
- spin_unlock(&sbi->s_bal_lock);
+ atomic_inc(&sbi->s_mb_buddies_generated);
+ atomic64_add(period, &sbi->s_mb_generation_time);
}
static void mb_regenerate_buddy(struct ext4_buddy *e4b)
@@ -831,16 +1328,16 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
int order = 1;
void *buddy;
- while ((buddy = mb_find_buddy(e4b, order++, &count))) {
- ext4_set_bits(buddy, 0, count);
- }
+ while ((buddy = mb_find_buddy(e4b, order++, &count)))
+ mb_set_bits(buddy, 0, count);
+
e4b->bd_info->bb_fragments = 0;
memset(e4b->bd_info->bb_counters, 0,
sizeof(*e4b->bd_info->bb_counters) *
(e4b->bd_sb->s_blocksize_bits + 2));
ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
- e4b->bd_bitmap, e4b->bd_group);
+ e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
}
/* The buddy information is attached the buddy cache inode
@@ -849,26 +1346,25 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
* block bitmap and buddy information. The information are
* stored in the inode as
*
- * { page }
+ * { folio }
* [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
- * So for each group we take up 2 blocks. A page can
- * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
- * So it can have information regarding groups_per_page which
- * is blocks_per_page/2
+ * So for each group we take up 2 blocks. A folio can
+ * contain blocks_per_folio (folio_size / blocksize) blocks.
+ * So it can have information regarding groups_per_folio which
+ * is blocks_per_folio/2
*
* Locking note: This routine takes the block group lock of all groups
- * for this page; do not hold this lock when calling this routine!
+ * for this folio; do not hold this lock when calling this routine!
*/
-
-static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
+static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
- int blocksize;
- int blocks_per_page;
- int groups_per_page;
+ unsigned int blocksize;
+ int blocks_per_folio;
+ int groups_per_folio;
int err = 0;
int i;
ext4_group_t first_group, group;
@@ -881,48 +1377,46 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
char *bitmap;
struct ext4_group_info *grinfo;
- inode = page->mapping->host;
+ inode = folio->mapping->host;
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = i_blocksize(inode);
- blocks_per_page = PAGE_SIZE / blocksize;
+ blocks_per_folio = folio_size(folio) / blocksize;
+ WARN_ON_ONCE(!blocks_per_folio);
+ groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2);
- mb_debug(sb, "init page %lu\n", page->index);
-
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
+ mb_debug(sb, "init folio %lu\n", folio->index);
/* allocate buffer_heads to read bitmaps */
- if (groups_per_page > 1) {
- i = sizeof(struct buffer_head *) * groups_per_page;
+ if (groups_per_folio > 1) {
+ i = sizeof(struct buffer_head *) * groups_per_folio;
bh = kzalloc(i, gfp);
- if (bh == NULL) {
- err = -ENOMEM;
- goto out;
- }
+ if (bh == NULL)
+ return -ENOMEM;
} else
bh = &bhs;
- first_group = page->index * blocks_per_page / 2;
-
- /* read all groups the page covers into the cache */
- for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ /* read all groups the folio covers into the cache */
+ first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2;
+ for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
if (group >= ngroups)
break;
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ continue;
/*
- * If page is uptodate then we came here after online resize
+ * If folio is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
- * we must skip all initialized uptodate buddies on the page,
+ * we must skip all initialized uptodate buddies on the folio,
* which may be currently in use by an allocating task.
*/
- if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ if (folio_test_uptodate(folio) &&
+ !EXT4_MB_GRP_NEED_INIT(grinfo)) {
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
@@ -932,7 +1426,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
}
/* wait for I/O completion */
- for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
int err2;
if (!bh[i])
@@ -942,8 +1436,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
err = err2;
}
- first_block = page->index * blocks_per_page;
- for (i = 0; i < blocks_per_page; i++) {
+ first_block = EXT4_PG_TO_LBLK(inode, folio->index);
+ for (i = 0; i < blocks_per_folio; i++) {
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
@@ -963,38 +1457,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
* above
*
*/
- data = page_address(page) + (i * blocksize);
+ data = folio_address(folio) + (i * blocksize);
bitmap = bh[group - first_group]->b_data;
/*
* We place the buddy block and bitmap block
* close together
*/
+ grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
- group, page->index, i * blocksize);
+ mb_debug(sb, "put buddy for group %u in folio %lu/%x\n",
+ group, folio->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
- grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
- (sb->s_blocksize_bits+2));
+ (MB_NUM_ORDERS(sb)));
/*
* incore got set to the group block bitmap below
*/
ext4_lock_group(sb, group);
/* init the buddy */
memset(data, 0xff, blocksize);
- ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
ext4_unlock_group(sb, group);
incore = NULL;
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
- group, page->index, i * blocksize);
+ mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n",
+ group, folio->index, i * blocksize);
trace_ext4_mb_bitmap_load(sb, group);
/* see comments in ext4_mb_put_pa() */
@@ -1003,7 +1501,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
- ext4_mb_generate_from_freelist(sb, data, group);
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
ext4_unlock_group(sb, group);
/* set incore so that the buddy information can be
@@ -1012,11 +1510,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
incore = data;
}
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
out:
if (bh) {
- for (i = 0; i < groups_per_page; i++)
+ for (i = 0; i < groups_per_folio; i++)
brelse(bh[i]);
if (bh != &bhs)
kfree(bh);
@@ -1025,68 +1523,71 @@ out:
}
/*
- * Lock the buddy and bitmap pages. This make sure other parallel init_group
- * on the same buddy page doesn't happen whild holding the buddy page lock.
- * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
- * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
+ * Lock the buddy and bitmap folios. This makes sure other parallel init_group
+ * on the same buddy folio doesn't happen while holding the buddy folio lock.
+ * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap
+ * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0.
*/
-static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+static int ext4_mb_get_buddy_folio_lock(struct super_block *sb,
ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
- int block, pnum, poff;
- int blocks_per_page;
- struct page *page;
+ int block, pnum;
+ struct folio *folio;
- e4b->bd_buddy_page = NULL;
- e4b->bd_bitmap_page = NULL;
+ e4b->bd_buddy_folio = NULL;
+ e4b->bd_bitmap_folio = NULL;
- blocks_per_page = PAGE_SIZE / sb->s_blocksize;
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
* So for each group we need two blocks.
*/
block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (!page)
- return -ENOMEM;
- BUG_ON(page->mapping != inode->i_mapping);
- e4b->bd_bitmap_page = page;
- e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+ pnum = EXT4_LBLK_TO_PG(inode, block);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ BUG_ON(folio->mapping != inode->i_mapping);
+ WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
+ e4b->bd_bitmap_folio = folio;
+ e4b->bd_bitmap = folio_address(folio) +
+ offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
- if (blocks_per_page >= 2) {
- /* buddy and bitmap are on the same page */
+ block++;
+ pnum = EXT4_LBLK_TO_PG(inode, block);
+ if (folio_contains(folio, pnum)) {
+ /* buddy and bitmap are on the same folio */
return 0;
}
- block++;
- pnum = block / blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (!page)
- return -ENOMEM;
- BUG_ON(page->mapping != inode->i_mapping);
- e4b->bd_buddy_page = page;
+ /* we need another folio for the buddy */
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ BUG_ON(folio->mapping != inode->i_mapping);
+ WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
+ e4b->bd_buddy_folio = folio;
return 0;
}
-static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
+static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b)
{
- if (e4b->bd_bitmap_page) {
- unlock_page(e4b->bd_bitmap_page);
- put_page(e4b->bd_bitmap_page);
+ if (e4b->bd_bitmap_folio) {
+ folio_unlock(e4b->bd_bitmap_folio);
+ folio_put(e4b->bd_bitmap_folio);
}
- if (e4b->bd_buddy_page) {
- unlock_page(e4b->bd_buddy_page);
- put_page(e4b->bd_buddy_page);
+ if (e4b->bd_buddy_folio) {
+ folio_unlock(e4b->bd_buddy_folio);
+ folio_put(e4b->bd_buddy_folio);
}
}
/*
* Locking note: This routine calls ext4_mb_init_cache(), which takes the
- * block group lock of all groups for this page; do not hold the BG lock when
+ * block group lock of all groups for this folio; do not hold the BG lock when
* calling this routine!
*/
static noinline_for_stack
@@ -1095,22 +1596,25 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
struct ext4_group_info *this_grp;
struct ext4_buddy e4b;
- struct page *page;
+ struct folio *folio;
int ret = 0;
might_sleep();
mb_debug(sb, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
+ if (!this_grp)
+ return -EFSCORRUPTED;
+
/*
* This ensures that we don't reinit the buddy cache
- * page which map to the group from which we are already
+ * folio which map to the group from which we are already
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
- * would have pinned buddy page to page cache.
- * The call to ext4_mb_get_buddy_page_lock will mark the
- * page accessed.
+ * would have pinned buddy folio to page cache.
+ * The call to ext4_mb_get_buddy_folio_lock will mark the
+ * folio accessed.
*/
- ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
+ ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
@@ -1119,52 +1623,50 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
goto err;
}
- page = e4b.bd_bitmap_page;
- ret = ext4_mb_init_cache(page, NULL, gfp);
+ folio = e4b.bd_bitmap_folio;
+ ret = ext4_mb_init_cache(folio, NULL, gfp);
if (ret)
goto err;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- if (e4b.bd_buddy_page == NULL) {
+ if (e4b.bd_buddy_folio == NULL) {
/*
* If both the bitmap and buddy are in
- * the same page we don't need to force
+ * the same folio we don't need to force
* init the buddy
*/
ret = 0;
goto err;
}
/* init buddy cache */
- page = e4b.bd_buddy_page;
- ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
+ folio = e4b.bd_buddy_folio;
+ ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp);
if (ret)
goto err;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
err:
- ext4_mb_put_buddy_page_lock(&e4b);
+ ext4_mb_put_buddy_folio_lock(&e4b);
return ret;
}
/*
* Locking note: This routine calls ext4_mb_init_cache(), which takes the
- * block group lock of all groups for this page; do not hold the BG lock when
+ * block group lock of all groups for this folio; do not hold the BG lock when
* calling this routine!
*/
static noinline_for_stack int
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b, gfp_t gfp)
{
- int blocks_per_page;
int block;
int pnum;
- int poff;
- struct page *page;
+ struct folio *folio;
int ret;
struct ext4_group_info *grp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1173,15 +1675,16 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
might_sleep();
mb_debug(sb, "load group %u\n", group);
- blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return -EFSCORRUPTED;
e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = grp;
e4b->bd_sb = sb;
e4b->bd_group = group;
- e4b->bd_buddy_page = NULL;
- e4b->bd_bitmap_page = NULL;
+ e4b->bd_buddy_folio = NULL;
+ e4b->bd_bitmap_folio = NULL;
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
/*
@@ -1199,98 +1702,115 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
* So for each group we need two blocks.
*/
block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
-
- /* we could use find_or_create_page(), but it locks page
- * what we'd like to avoid in fast path ... */
- page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
- if (page == NULL || !PageUptodate(page)) {
- if (page)
+ pnum = EXT4_LBLK_TO_PG(inode, block);
+
+ /* Avoid locking the folio in the fast path ... */
+ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (!IS_ERR(folio))
/*
- * drop the page reference and try
- * to get the page with lock. If we
+ * drop the folio reference and try
+ * to get the folio with lock. If we
* are not uptodate that implies
- * somebody just created the page but
- * is yet to initialize the same. So
+ * somebody just created the folio but
+ * is yet to initialize it. So
* wait for it to initialize.
*/
- put_page(page);
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, NULL, gfp);
+ folio_put(folio);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (!IS_ERR(folio)) {
+ if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
+ "ext4: bitmap's mapping != inode->i_mapping\n")) {
+ /* should never happen */
+ folio_unlock(folio);
+ ret = -EINVAL;
+ goto err;
+ }
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_mb_init_cache(folio, NULL, gfp);
if (ret) {
- unlock_page(page);
+ folio_unlock(folio);
goto err;
}
- mb_cmp_bitmaps(e4b, page_address(page) +
- (poff * sb->s_blocksize));
+ mb_cmp_bitmaps(e4b, folio_address(folio) +
+ offset_in_folio(folio,
+ EXT4_LBLK_TO_B(inode, block)));
}
- unlock_page(page);
+ folio_unlock(folio);
}
}
- if (page == NULL) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto err;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- /* Pages marked accessed already */
- e4b->bd_bitmap_page = page;
- e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+ /* Folios marked accessed already */
+ e4b->bd_bitmap_folio = folio;
+ e4b->bd_bitmap = folio_address(folio) +
+ offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
-
- page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
- if (page == NULL || !PageUptodate(page)) {
- if (page)
- put_page(page);
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
+ pnum = EXT4_LBLK_TO_PG(inode, block);
+ /* buddy and bitmap are on the same folio? */
+ if (folio_contains(folio, pnum)) {
+ folio_get(folio);
+ goto update_buddy;
+ }
+
+ /* we need another folio for the buddy */
+ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (!IS_ERR(folio))
+ folio_put(folio);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (!IS_ERR(folio)) {
+ if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
+ "ext4: buddy bitmap's mapping != inode->i_mapping\n")) {
+ /* should never happen */
+ folio_unlock(folio);
+ ret = -EINVAL;
+ goto err;
+ }
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_mb_init_cache(folio, e4b->bd_bitmap,
gfp);
if (ret) {
- unlock_page(page);
+ folio_unlock(folio);
goto err;
}
}
- unlock_page(page);
+ folio_unlock(folio);
}
}
- if (page == NULL) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto err;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- /* Pages marked accessed already */
- e4b->bd_buddy_page = page;
- e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
-
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
+update_buddy:
+ /* Folios marked accessed already */
+ e4b->bd_buddy_folio = folio;
+ e4b->bd_buddy = folio_address(folio) +
+ offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
return 0;
err:
- if (page)
- put_page(page);
- if (e4b->bd_bitmap_page)
- put_page(e4b->bd_bitmap_page);
- if (e4b->bd_buddy_page)
- put_page(e4b->bd_buddy_page);
+ if (!IS_ERR_OR_NULL(folio))
+ folio_put(folio);
+ if (e4b->bd_bitmap_folio)
+ folio_put(e4b->bd_bitmap_folio);
+
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
return ret;
@@ -1304,31 +1824,27 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
- if (e4b->bd_bitmap_page)
- put_page(e4b->bd_bitmap_page);
- if (e4b->bd_buddy_page)
- put_page(e4b->bd_buddy_page);
+ if (e4b->bd_bitmap_folio)
+ folio_put(e4b->bd_bitmap_folio);
+ if (e4b->bd_buddy_folio)
+ folio_put(e4b->bd_buddy_folio);
}
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
- int order = 1;
- int bb_incr = 1 << (e4b->bd_blkbits - 1);
+ int order = 1, max;
void *bb;
BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
- bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
- block = block >> 1;
- if (!mb_test_bit(block, bb)) {
+ bb = mb_find_buddy(e4b, order, &max);
+ if (!mb_test_bit(block >> order, bb)) {
/* this block is part of buddy of order 'order' */
return order;
}
- bb += bb_incr;
- bb_incr >>= 1;
order++;
}
return 0;
@@ -1379,7 +1895,7 @@ static int mb_test_and_clear_bits(void *bm, int cur, int len)
return zero_bit;
}
-void ext4_set_bits(void *bm, int cur, int len)
+void mb_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1397,9 +1913,6 @@ void ext4_set_bits(void *bm, int cur, int len)
}
}
-/*
- * _________________________________________________________________ */
-
static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
{
if (mb_test_bit(*bit + side, bitmap)) {
@@ -1460,7 +1973,8 @@ static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
break;
order++;
- if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+ buddy2 = mb_find_buddy(e4b, order, &max);
+ if (!buddy2) {
mb_clear_bits(buddy, first, last - first + 1);
e4b->bd_info->bb_counters[order - 1] += last - first + 1;
break;
@@ -1491,11 +2005,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
- this_cpu_inc(discard_pa_seq);
- e4b->bd_info->bb_free += count;
- if (first < e4b->bd_info->bb_first_free)
- e4b->bd_info->bb_first_free = first;
-
/* access memory sequentially: check left neighbour,
* clear range and then check right neighbour
*/
@@ -1509,20 +2018,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t blocknr;
+ /*
+ * Fastcommit replay can free already freed blocks which
+ * corrupts allocation info. Regenerate it.
+ */
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ mb_regenerate_buddy(e4b);
+ goto check;
+ }
+
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u); block bitmap corrupt.",
- block);
ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
- mb_regenerate_buddy(e4b);
- goto done;
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0, blocknr,
+ "freeing already freed block (bit %u); block bitmap corrupt.",
+ block);
+ return;
}
+ this_cpu_inc(discard_pa_seq);
+ e4b->bd_info->bb_free += count;
+ if (first < e4b->bd_info->bb_first_free)
+ e4b->bd_info->bb_first_free = first;
+
/* let's maintain fragments counter */
if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
@@ -1547,16 +2067,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first <= last)
mb_buddy_mark_free(e4b, first >> 1, last >> 1);
-done:
mb_set_largest_free_order(sb, e4b->bd_info);
+ mb_update_avg_fragment_size(sb, e4b->bd_info);
+check:
mb_check_buddy(e4b);
}
static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
- int next = block;
- int max, order;
+ int max, order, next;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1574,16 +2094,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
/* find actual order */
order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- ex->fe_len = 1 << order;
- ex->fe_start = block << order;
+ ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
+ ex->fe_start = block;
ex->fe_group = e4b->bd_group;
- /* calc difference from given start */
- next = next - ex->fe_start;
- ex->fe_len -= next;
- ex->fe_start += next;
+ block = block >> order;
while (needed > ex->fe_len &&
mb_find_buddy(e4b, order, &max)) {
@@ -1604,10 +2120,11 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
/* Should never happen! (but apparently sometimes does?!?) */
WARN_ON(1);
- ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
- "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
- block, order, needed, ex->fe_group, ex->fe_start,
- ex->fe_len, ex->fe_logical);
+ ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
+ "corruption or bug in mb_find_extent "
+ "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
+ block, order, needed, ex->fe_group, ex->fe_start,
+ ex->fe_len, ex->fe_logical);
ex->fe_len = 0;
ex->fe_start = 0;
ex->fe_group = 0;
@@ -1620,12 +2137,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
int ord;
int mlen = 0;
int max = 0;
- int cur;
int start = ex->fe_start;
int len = ex->fe_len;
unsigned ret = 0;
int len0 = len;
void *buddy;
+ int ord_start, ord_end;
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
@@ -1669,23 +2186,34 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
if (ret == 0)
ret = len | (ord << 16);
- /* we have to split large buddy */
BUG_ON(ord <= 0);
buddy = mb_find_buddy(e4b, ord, &max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
- ord--;
- cur = (start >> ord) & ~1U;
- buddy = mb_find_buddy(e4b, ord, &max);
- mb_clear_bit(cur, buddy);
- mb_clear_bit(cur + 1, buddy);
- e4b->bd_info->bb_counters[ord]++;
- e4b->bd_info->bb_counters[ord]++;
+ ord_start = (start >> ord) << ord;
+ ord_end = ord_start + (1 << ord);
+ /* first chunk */
+ if (start > ord_start)
+ ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
+ ord_start, start - ord_start,
+ e4b->bd_info);
+
+ /* last chunk */
+ if (start + len < ord_end) {
+ ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
+ start + len,
+ ord_end - (start + len),
+ e4b->bd_info);
+ break;
+ }
+ len = start + len - ord_end;
+ start = ord_end;
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
- ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+ mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
+ mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1716,23 +2244,23 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->ac_buddy = ret >> 16;
/*
- * take the page reference. We want the page to be pinned
+ * take the folio reference. We want the folio to be pinned
* so that we don't get a ext4_mb_init_cache_call for this
* group until we update the bitmap. That would mean we
* double allocate blocks. The reference is dropped
* in ext4_mb_release_context
*/
- ac->ac_bitmap_page = e4b->bd_bitmap_page;
- get_page(ac->ac_bitmap_page);
- ac->ac_buddy_page = e4b->bd_buddy_page;
- get_page(ac->ac_buddy_page);
+ ac->ac_bitmap_folio = e4b->bd_bitmap_folio;
+ folio_get(ac->ac_bitmap_folio);
+ ac->ac_buddy_folio = e4b->bd_buddy_folio;
+ folio_get(ac->ac_buddy_folio);
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- spin_lock(&sbi->s_md_lock);
- sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
- sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
}
+
/*
* As we've just preallocated more space than
* user requested originally, we store allocated
@@ -1743,10 +2271,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
}
-/*
- * regular allocator, for general purposes allocation
- */
-
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b,
int finish_group)
@@ -1754,8 +2278,6 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_free_extent *bex = &ac->ac_b_ex;
struct ext4_free_extent *gex = &ac->ac_g_ex;
- struct ext4_free_extent ex;
- int max;
if (ac->ac_status == AC_STATUS_FOUND)
return;
@@ -1774,17 +2296,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
if (bex->fe_len < gex->fe_len)
return;
- if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
- && bex->fe_group == e4b->bd_group) {
- /* recheck chunk's availability - we don't know
- * when it was found (within this lock-unlock
- * period or not) */
- max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
- if (max >= gex->fe_len) {
- ext4_mb_use_best_found(ac, e4b);
- return;
- }
- }
+ if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
+ ext4_mb_use_best_found(ac, e4b);
}
/*
@@ -1795,6 +2308,20 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
* in the context. Later, the best found extent will be used, if
* mballoc can't find good enough extent.
*
+ * The algorithm used is roughly as follows:
+ *
+ * * If free extent found is exactly as big as goal, then
+ * stop the scan and use it immediately
+ *
+ * * If free extent found is smaller than goal, then keep retrying
+ * upto a max of sbi->s_mb_max_to_scan times (default 200). After
+ * that stop scanning and use whatever we have.
+ *
+ * * If free extent found is bigger than goal, then keep retrying
+ * upto a max of sbi->s_mb_min_to_scan times (default 10) before
+ * stopping the scan and using the extent.
+ *
+ *
* FIXME: real allocation policy is to be designed yet!
*/
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
@@ -1810,6 +2337,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
/*
* The special case - take what you catch first
@@ -1857,7 +2385,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
}
static noinline_for_stack
-int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct ext4_free_extent ex = ac->ac_b_ex;
@@ -1868,9 +2396,12 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
BUG_ON(ex.fe_len <= 0);
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
- return err;
+ return;
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
@@ -1878,10 +2409,9 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
-
- return 0;
}
static noinline_for_stack
@@ -1895,7 +2425,9 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
- if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+ if (!grp)
+ return -EFSCORRUPTED;
+ if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
return 0;
if (grp->bb_free == 0)
return 0;
@@ -1904,21 +2436,19 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
if (err)
return err;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
- ext4_mb_unload_buddy(e4b);
- return 0;
- }
-
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
ex.fe_logical = 0xDEADFA11; /* debug value */
- if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+ if (max >= ac->ac_g_ex.fe_len &&
+ ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) {
ext4_fsblk_t start;
- start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
- ex.fe_start;
+ start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
/* use do_div to get remainder (would be 64-bit modulo) */
if (do_div(start, sbi->s_stripe) == 0) {
ac->ac_found++;
@@ -1942,6 +2472,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
@@ -1964,24 +2495,27 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
int max;
BUG_ON(ac->ac_2order <= 0);
- for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+ for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
if (grp->bb_counters[i] == 0)
continue;
buddy = mb_find_buddy(e4b, i, &max);
- BUG_ON(buddy == NULL);
+ if (WARN_RATELIMIT(buddy == NULL,
+ "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
+ continue;
k = mb_find_next_zero_bit(buddy, max, 0);
if (k >= max) {
+ ext4_mark_group_bitmap_corrupted(ac->ac_sb,
+ e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
"%d free clusters of order %d. But found 0",
grp->bb_counters[i], i);
- ext4_mark_group_bitmap_corrupted(ac->ac_sb,
- e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
ac->ac_b_ex.fe_len = 1 << i;
ac->ac_b_ex.fe_start = k << i;
@@ -2010,7 +2544,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct super_block *sb = ac->ac_sb;
void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
- int i;
+ int i, j, freelen;
int free;
free = e4b->bd_info->bb_free;
@@ -2026,27 +2560,45 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
- * we have free blocks
+ * have free blocks
*/
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But bitmap says 0",
free);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
+ if (!ext4_mb_cr_expensive(ac->ac_criteria)) {
+ /*
+ * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
+ * sure that this group will have a large enough
+ * continuous free extent, so skip over the smaller free
+ * extents
+ */
+ j = mb_find_next_bit(bitmap,
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ freelen = j - i;
+
+ if (freelen < ac->ac_g_ex.fe_len) {
+ i = j;
+ free -= freelen;
+ continue;
+ }
+ }
+
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
if (WARN_ON(ex.fe_len <= 0))
break;
if (free < ex.fe_len) {
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
/*
* The number of free blocks differs. This mostly
* indicate that the bitmap is corrupt. So exit
@@ -2078,7 +2630,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
- ext4_grpblk_t i;
+ ext4_grpblk_t i, stripe;
int max;
BUG_ON(sbi->s_stripe == 0);
@@ -2090,42 +2642,67 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
+ stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe);
+ i = EXT4_B2C(sbi, i);
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
- if (max >= sbi->s_stripe) {
+ max = mb_find_extent(e4b, i, stripe, &ex);
+ if (max >= stripe) {
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
}
}
- i += sbi->s_stripe;
+ i += stripe;
}
}
+static void __ext4_mb_scan_group(struct ext4_allocation_context *ac)
+{
+ bool is_stripe_aligned;
+ struct ext4_sb_info *sbi;
+ enum criteria cr = ac->ac_criteria;
+
+ ac->ac_groups_scanned++;
+ if (cr == CR_POWER2_ALIGNED)
+ return ext4_mb_simple_scan_group(ac, ac->ac_e4b);
+
+ sbi = EXT4_SB(ac->ac_sb);
+ is_stripe_aligned = false;
+ if ((sbi->s_stripe >= sbi->s_cluster_ratio) &&
+ !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe)))
+ is_stripe_aligned = true;
+
+ if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) &&
+ is_stripe_aligned)
+ ext4_mb_scan_aligned(ac, ac->ac_e4b);
+
+ if (ac->ac_status == AC_STATUS_CONTINUE)
+ ext4_mb_complex_scan_group(ac, ac->ac_e4b);
+}
+
/*
* This is also called BEFORE we load the buddy bitmap.
* Returns either 1 or 0 indicating that the group is either suitable
* for the allocation or not.
*/
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
+ ext4_group_t group, enum criteria cr)
{
ext4_grpblk_t free, fragments;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
- BUG_ON(cr < 0 || cr >= 4);
+ BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
- free = grp->bb_free;
- if (free == 0)
- return false;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return false;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ free = grp->bb_free;
+ if (free == 0)
return false;
fragments = grp->bb_fragments;
@@ -2133,7 +2710,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
return false;
switch (cr) {
- case 0:
+ case CR_POWER2_ALIGNED:
BUG_ON(ac->ac_2order == 0);
/* Avoid using the first bg of a flexgroup for data files */
@@ -2142,23 +2719,26 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
((group % flex_size) == 0))
return false;
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
- (free / fragments) >= ac->ac_g_ex.fe_len)
+ if (free < ac->ac_g_ex.fe_len)
+ return false;
+
+ if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
return true;
if (grp->bb_largest_free_order < ac->ac_2order)
return false;
return true;
- case 1:
+ case CR_GOAL_LEN_FAST:
+ case CR_BEST_AVAIL_LEN:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return true;
break;
- case 2:
+ case CR_GOAL_LEN_SLOW:
if (free >= ac->ac_g_ex.fe_len)
return true;
break;
- case 3:
+ case CR_ANY_FREE:
return true;
default:
BUG();
@@ -2171,61 +2751,245 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
* This could return negative error code if something goes wrong
* during ext4_mb_init_group(). This should not be called with
* ext4_lock_group() held.
+ *
+ * Note: because we are conditionally operating with the group lock in
+ * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
+ * function using __acquire and __release. This means we need to be
+ * super careful before messing with the error path handling via "goto
+ * out"!
*/
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
+ ext4_group_t group, enum criteria cr)
{
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
ext4_grpblk_t free;
int ret = 0;
- if (should_lock)
+ if (!grp)
+ return -EFSCORRUPTED;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
+ if (should_lock) {
ext4_lock_group(sb, group);
+ __release(ext4_group_lock_ptr(sb, group));
+ }
free = grp->bb_free;
if (free == 0)
goto out;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ /*
+ * In all criterias except CR_ANY_FREE we try to avoid groups that
+ * can't possibly satisfy the full goal request due to insufficient
+ * free blocks.
+ */
+ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len)
goto out;
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
goto out;
- if (should_lock)
+ if (should_lock) {
+ __acquire(ext4_group_lock_ptr(sb, group));
ext4_unlock_group(sb, group);
+ }
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, group, NULL);
+ int ret;
+
+ /*
+ * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
+ * search to find large good chunks almost for free. If buddy
+ * data is not ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg, since this
+ * gets used for metadata block allocation, and we want to make
+ * sure we locate metadata blocks in the first block group in
+ * the flex_bg if possible.
+ */
+ if (!ext4_mb_cr_expensive(cr) &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
if (ret)
return ret;
}
- if (should_lock)
+ if (should_lock) {
ext4_lock_group(sb, group);
+ __release(ext4_group_lock_ptr(sb, group));
+ }
ret = ext4_mb_good_group(ac, group, cr);
out:
- if (should_lock)
+ if (should_lock) {
+ __acquire(ext4_group_lock_ptr(sb, group));
ext4_unlock_group(sb, group);
+ }
+ return ret;
+}
+
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid getblk() call, which can
+ * be expensive.
+ */
+ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 ) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Batch reads of the block allocation bitmaps to get
+ * multiple READs in flight; limit prefetching at inexpensive
+ * CR, otherwise mballoc can spend a lot of time loading
+ * imperfect groups
+ */
+static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ struct ext4_sb_info *sbi;
+
+ if (ac->ac_prefetch_grp != group)
+ return;
+
+ sbi = EXT4_SB(ac->ac_sb);
+ if (ext4_mb_cr_expensive(ac->ac_criteria) ||
+ ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) {
+ unsigned int nr = sbi->s_mb_prefetch;
+
+ if (ext4_has_feature_flex_bg(ac->ac_sb)) {
+ nr = 1 << sbi->s_log_groups_per_flex;
+ nr -= group & (nr - 1);
+ nr = umin(nr, sbi->s_mb_prefetch);
+ }
+
+ ac->ac_prefetch_nr = nr;
+ ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr,
+ &ac->ac_prefetch_ios);
+ }
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if it was not initiated by
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
+
+ while (nr-- > 0) {
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ grp = ext4_get_group_info(sb, group);
+
+ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
+}
+
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ int ret;
+ struct super_block *sb = ac->ac_sb;
+ enum criteria cr = ac->ac_criteria;
+
+ ext4_mb_might_prefetch(ac, group);
+
+ /* prevent unnecessary buddy loading. */
+ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group)))
+ return 0;
+
+ /* This now checks without needing the buddy folio */
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
+ if (ret <= 0) {
+ if (!ac->ac_first_err)
+ ac->ac_first_err = ret;
+ return 0;
+ }
+
+ ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b);
+ if (ret)
+ return ret;
+
+ /* skip busy group */
+ if (cr >= CR_ANY_FREE)
+ ext4_lock_group(sb, group);
+ else if (!ext4_try_lock_group(sb, group))
+ goto out_unload;
+
+ /* We need to check again after locking the block group. */
+ if (unlikely(!ext4_mb_good_group(ac, group, cr)))
+ goto out_unlock;
+
+ __ext4_mb_scan_group(ac);
+
+out_unlock:
+ ext4_unlock_group(sb, group);
+out_unload:
+ ext4_mb_unload_buddy(ac->ac_e4b);
return ret;
}
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
- int cr = -1;
- int err = 0, first_err = 0;
- struct ext4_sb_info *sbi;
- struct super_block *sb;
+ ext4_group_t i;
+ int err = 0;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- sb = ac->ac_sb;
- sbi = EXT4_SB(sb);
- ngroups = ext4_get_groups_count(sb);
- /* non-extent files are limited to low blocks/groups */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
- ngroups = sbi->s_blockfile_groups;
-
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -2237,9 +3001,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
goto out;
/*
- * ac->ac2_order is set only if the fe_len is a power of 2
- * if ac2_order is set we also set criteria to 0 so that we
- * try exact allocation using buddy.
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
+ * so that we try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
ac->ac_2order = 0;
@@ -2250,89 +3014,41 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* We also support searching for power-of-two requests only for
* requests upto maximum buddy size we have constructed.
*/
- if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
- /*
- * This should tell if fe_len is exactly power of 2
- */
- if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
+ if (is_power_of_2(ac->ac_g_ex.fe_len))
ac->ac_2order = array_index_nospec(i - 1,
- sb->s_blocksize_bits + 2);
+ MB_NUM_ORDERS(sb));
}
/* if stream allocation is enabled, use global goal */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- /* TBD: may be hot point */
- spin_lock(&sbi->s_md_lock);
- ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
- ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
+ ac->ac_g_ex.fe_start = -1;
+ ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
}
- /* Let's just scan groups to find more-less suitable blocks */
- cr = ac->ac_2order ? 0 : 1;
/*
- * cr == 0 try to get exact allocation,
- * cr == 3 try to get anything
+ * Let's just scan groups to find more-less suitable blocks We
+ * start with CR_GOAL_LEN_FAST, unless it is power of 2
+ * aligned, in which case let's do that faster approach first.
*/
-repeat:
- for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
- ac->ac_criteria = cr;
- /*
- * searching for the right group start
- * from the goal value specified
- */
- group = ac->ac_g_ex.fe_group;
-
- for (i = 0; i < ngroups; group++, i++) {
- int ret = 0;
- cond_resched();
- /*
- * Artificially restricted ngroups for non-extent
- * files makes group > ngroups possible on first loop.
- */
- if (group >= ngroups)
- group = 0;
-
- /* This now checks without needing the buddy page */
- ret = ext4_mb_good_group_nolock(ac, group, cr);
- if (ret <= 0) {
- if (!first_err)
- first_err = ret;
- continue;
- }
-
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err)
- goto out;
-
- ext4_lock_group(sb, group);
-
- /*
- * We need to check again after locking the
- * block group
- */
- ret = ext4_mb_good_group(ac, group, cr);
- if (ret == 0) {
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
- continue;
- }
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
+ if (ac->ac_2order)
+ ac->ac_criteria = CR_POWER2_ALIGNED;
- ac->ac_groups_scanned++;
- if (cr == 0)
- ext4_mb_simple_scan_group(ac, &e4b);
- else if (cr == 1 && sbi->s_stripe &&
- !(ac->ac_g_ex.fe_len % sbi->s_stripe))
- ext4_mb_scan_aligned(ac, &e4b);
- else
- ext4_mb_complex_scan_group(ac, &e4b);
-
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
+ ac->ac_e4b = &e4b;
+ ac->ac_prefetch_ios = 0;
+ ac->ac_first_err = 0;
+repeat:
+ while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
+ err = ext4_mb_scan_groups(ac);
+ if (err)
+ goto out;
- if (ac->ac_status != AC_STATUS_CONTINUE)
- break;
- }
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2341,38 +3057,53 @@ repeat:
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
-
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
+ int lost;
+
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
*/
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
- cr = 3;
- atomic_inc(&sbi->s_mb_lost_chunks);
+ ac->ac_criteria = CR_ANY_FREE;
goto repeat;
}
}
+
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
+ ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
+ atomic_inc(&sbi->s_bal_stream_goals);
+ }
out:
- if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
- err = first_err;
+ if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
+ err = ac->ac_first_err;
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
- ac->ac_flags, cr, err);
+ ac->ac_flags, ac->ac_criteria, err);
+
+ if (ac->ac_prefetch_nr)
+ ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
+
return err;
}
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group;
if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
@@ -2383,7 +3114,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group;
++*pos;
@@ -2395,19 +3126,17 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
- int i;
- int err, buddy_loaded = 0;
+ int i, err;
+ char nbuf[16];
struct ext4_buddy e4b;
struct ext4_group_info *grinfo;
unsigned char blocksize_bits = min_t(unsigned char,
sb->s_blocksize_bits,
EXT4_MAX_BLOCK_LOG_SIZE);
- struct sg {
- struct ext4_group_info info;
- ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
- } sg;
+ DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters,
+ EXT4_MAX_BLOCK_LOG_SIZE + 2);
group--;
if (group == 0)
@@ -2415,32 +3144,36 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
" 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
- i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) +
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ return 0;
/* Load the group info in memory only if not already loaded. */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
+ seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf));
return 0;
}
- buddy_loaded = 1;
- }
-
- memcpy(&sg, ext4_get_group_info(sb, group), i);
-
- if (buddy_loaded)
ext4_mb_unload_buddy(&e4b);
+ }
- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
- sg.info.bb_fragments, sg.info.bb_first_free);
+ /*
+ * We care only about free space counters in the group info and
+ * these are safe to access even after the buddy has been unloaded
+ */
+ memcpy(sg, grinfo, i);
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free,
+ sg->bb_fragments, sg->bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
- sg.info.bb_counters[i] : 0);
- seq_printf(seq, " ]\n");
-
+ sg->bb_counters[i] : 0);
+ seq_puts(seq, " ]");
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg))
+ seq_puts(seq, " Block bitmap corrupted!");
+ seq_putc(seq, '\n');
return 0;
}
@@ -2455,6 +3188,180 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
.show = ext4_mb_seq_groups_show,
};
+int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ seq_puts(seq, "mballoc:\n");
+ if (!sbi->s_mb_stats) {
+ seq_puts(seq, "\tmb stats collection turned off.\n");
+ seq_puts(
+ seq,
+ "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ return 0;
+ }
+ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+ seq_printf(seq, "\tgroups_scanned: %u\n",
+ atomic_read(&sbi->s_bal_groups_scanned));
+
+ /* CR_POWER2_ALIGNED stats */
+ seq_puts(seq, "\tcr_p2_aligned_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
+
+ /* CR_GOAL_LEN_FAST stats */
+ seq_puts(seq, "\tcr_goal_fast_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
+
+ /* CR_BEST_AVAIL_LEN stats */
+ seq_puts(seq, "\tcr_best_avail_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
+
+ /* CR_GOAL_LEN_SLOW stats */
+ seq_puts(seq, "\tcr_goal_slow_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));
+
+ /* CR_ANY_FREE stats */
+ seq_puts(seq, "\tcr_any_free_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));
+
+ /* Aggregates */
+ seq_printf(seq, "\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_ex_scanned));
+ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\tstream_goal_hits: %u\n",
+ atomic_read(&sbi->s_bal_stream_goals));
+ seq_printf(seq, "\t\tlen_goal_hits: %u\n",
+ atomic_read(&sbi->s_bal_len_goals));
+ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+ seq_printf(seq, "\tbuddies_generated: %u/%u\n",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ ext4_get_groups_count(sb));
+ seq_printf(seq, "\tbuddies_time_used: %llu\n",
+ atomic64_read(&sbi->s_mb_generation_time));
+ seq_printf(seq, "\tpreallocated: %u\n",
+ atomic_read(&sbi->s_mb_preallocated));
+ seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
+ return 0;
+}
+
+static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
+{
+ struct super_block *sb = pde_data(file_inode(seq->file));
+ unsigned long position;
+
+ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
+ return NULL;
+ position = *pos + 1;
+ return (void *) ((unsigned long) position);
+}
+
+static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct super_block *sb = pde_data(file_inode(seq->file));
+ unsigned long position;
+
+ ++*pos;
+ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
+ return NULL;
+ position = *pos + 1;
+ return (void *) ((unsigned long) position);
+}
+
+static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
+{
+ struct super_block *sb = pde_data(file_inode(seq->file));
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned long position = ((unsigned long) v);
+ struct ext4_group_info *grp;
+ unsigned int count;
+ unsigned long idx;
+
+ position--;
+ if (position >= MB_NUM_ORDERS(sb)) {
+ position -= MB_NUM_ORDERS(sb);
+ if (position == 0)
+ seq_puts(seq, "avg_fragment_size_lists:\n");
+
+ count = 0;
+ xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp)
+ count++;
+ seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+ (unsigned int)position, count);
+ return 0;
+ }
+
+ if (position == 0) {
+ seq_printf(seq, "optimize_scan: %d\n",
+ test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
+ seq_puts(seq, "max_free_order_lists:\n");
+ }
+ count = 0;
+ xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp)
+ count++;
+ seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+ (unsigned int)position, count);
+
+ return 0;
+}
+
+static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
+{
+}
+
+const struct seq_operations ext4_mb_seq_structs_summary_ops = {
+ .start = ext4_mb_seq_structs_summary_start,
+ .next = ext4_mb_seq_structs_summary_next,
+ .stop = ext4_mb_seq_structs_summary_stop,
+ .show = ext4_mb_seq_structs_summary_show,
+};
+
static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
{
int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
@@ -2495,7 +3402,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
if (old_groupinfo)
ext4_kvfree_array_rcu(old_groupinfo);
- ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
sbi->s_group_info_size);
return 0;
}
@@ -2523,7 +3430,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
if (meta_group_info == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate mem "
"for a buddy group");
- goto exit_meta_group_info;
+ return -ENOMEM;
}
rcu_read_lock();
rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
@@ -2558,6 +3465,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
+ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
+ meta_group_info[i]->bb_group = group;
mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
return 0;
@@ -2573,7 +3482,6 @@ exit_group_info:
group_info[idx] = NULL;
rcu_read_unlock();
}
-exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
@@ -2602,6 +3510,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
* this will avoid confusion if it ever shows up during debugging. */
sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+ ext4_set_inode_mapping_order(sbi->s_buddy_cache);
+
for (i = 0; i < ngroups; i++) {
cond_resched();
desc = ext4_get_group_desc(sb, i, NULL);
@@ -2613,12 +3523,45 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO.
+ * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
+ * unsigned integer, so the maximum shift is 32.
+ */
+ if (sbi->s_es->s_log_groups_per_flex >= 32) {
+ ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
+ goto err_freebuddy;
+ }
+ sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
+ BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /*
+ * now many real IOs to prefetch within a single allocation at
+ * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related
+ * optimization we shouldn't try to load too many groups, at some point
+ * we should start to use what we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4
+ */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;
err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- while (i-- > 0)
- kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+ while (i-- > 0) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+
+ if (grp)
+ kmem_cache_free(cachep, grp);
+ }
i = sbi->s_group_info_size;
rcu_read_lock();
group_info = rcu_dereference(sbi->s_group_info);
@@ -2682,6 +3625,80 @@ static int ext4_groupinfo_create_slab(size_t size)
return 0;
}
+static void ext4_discard_work(struct work_struct *work)
+{
+ struct ext4_sb_info *sbi = container_of(work,
+ struct ext4_sb_info, s_discard_work);
+ struct super_block *sb = sbi->s_sb;
+ struct ext4_free_data *fd, *nfd;
+ struct ext4_buddy e4b;
+ LIST_HEAD(discard_list);
+ ext4_group_t grp, load_grp;
+ int err = 0;
+
+ spin_lock(&sbi->s_md_lock);
+ list_splice_init(&sbi->s_discard_list, &discard_list);
+ spin_unlock(&sbi->s_md_lock);
+
+ load_grp = UINT_MAX;
+ list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
+ /*
+ * If filesystem is umounting or no memory or suffering
+ * from no space, give up the discard
+ */
+ if ((sb->s_flags & SB_ACTIVE) && !err &&
+ !atomic_read(&sbi->s_retry_alloc_pending)) {
+ grp = fd->efd_group;
+ if (grp != load_grp) {
+ if (load_grp != UINT_MAX)
+ ext4_mb_unload_buddy(&e4b);
+
+ err = ext4_mb_load_buddy(sb, grp, &e4b);
+ if (err) {
+ kmem_cache_free(ext4_free_data_cachep, fd);
+ load_grp = UINT_MAX;
+ continue;
+ } else {
+ load_grp = grp;
+ }
+ }
+
+ ext4_lock_group(sb, grp);
+ ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
+ fd->efd_start_cluster + fd->efd_count - 1, 1);
+ ext4_unlock_group(sb, grp);
+ }
+ kmem_cache_free(ext4_free_data_cachep, fd);
+ }
+
+ if (load_grp != UINT_MAX)
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi)
+{
+ if (!sbi->s_mb_avg_fragment_size)
+ return;
+
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_avg_fragment_size[i]);
+
+ kfree(sbi->s_mb_avg_fragment_size);
+ sbi->s_mb_avg_fragment_size = NULL;
+}
+
+static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi)
+{
+ if (!sbi->s_mb_largest_free_orders)
+ return;
+
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_largest_free_orders[i]);
+
+ kfree(sbi->s_mb_largest_free_orders);
+ sbi->s_mb_largest_free_orders = NULL;
+}
+
int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2690,7 +3707,7 @@ int ext4_mb_init(struct super_block *sb)
unsigned max;
int ret;
- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
@@ -2698,7 +3715,7 @@ int ext4_mb_init(struct super_block *sb)
goto out;
}
- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
+ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
ret = -ENOMEM;
@@ -2724,18 +3741,43 @@ int ext4_mb_init(struct super_block *sb)
offset_incr = offset_incr >> 1;
max = max >> 1;
i++;
- } while (i <= sb->s_blocksize_bits + 1);
+ } while (i < MB_NUM_ORDERS(sb));
+
+ sbi->s_mb_avg_fragment_size =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
+ GFP_KERNEL);
+ if (!sbi->s_mb_avg_fragment_size) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_avg_fragment_size[i]);
+
+ sbi->s_mb_largest_free_orders =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
+ GFP_KERNEL);
+ if (!sbi->s_mb_largest_free_orders) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_largest_free_orders[i]);
spin_lock_init(&sbi->s_md_lock);
- spin_lock_init(&sbi->s_bal_lock);
- sbi->s_mb_free_pending = 0;
- INIT_LIST_HEAD(&sbi->s_freed_data_list);
+ atomic_set(&sbi->s_mb_free_pending, 0);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
+ INIT_LIST_HEAD(&sbi->s_discard_list);
+ INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
+ atomic_set(&sbi->s_retry_alloc_pending, 0);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
+
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -2760,13 +3802,22 @@ int ext4_mb_init(struct super_block *sb)
*/
if (sbi->s_stripe > 1) {
sbi->s_mb_group_prealloc = roundup(
- sbi->s_mb_group_prealloc, sbi->s_stripe);
+ sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
+ }
+
+ sbi->s_mb_nr_global_goals = umin(num_possible_cpus(),
+ DIV_ROUND_UP(sbi->s_groups_count, 4));
+ sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals,
+ sizeof(ext4_group_t), GFP_KERNEL);
+ if (sbi->s_mb_last_groups == NULL) {
+ ret = -ENOMEM;
+ goto out;
}
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
ret = -ENOMEM;
- goto out;
+ goto out_free_last_groups;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
@@ -2777,6 +3828,10 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
+ if (bdev_nonrot(sb->s_bdev))
+ sbi->s_mb_max_linear_groups = 0;
+ else
+ sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
/* init file for buddy data */
ret = ext4_mb_init_backend(sb);
if (ret != 0)
@@ -2787,7 +3842,12 @@ int ext4_mb_init(struct super_block *sb)
out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
+out_free_last_groups:
+ kfree(sbi->s_mb_last_groups);
+ sbi->s_mb_last_groups = NULL;
out:
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
kfree(sbi->s_mb_maxs);
@@ -2811,7 +3871,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
return count;
}
-int ext4_mb_release(struct super_block *sb)
+void ext4_mb_release(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
@@ -2821,10 +3881,20 @@ int ext4_mb_release(struct super_block *sb)
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
int count;
+ if (test_opt(sb, DISCARD)) {
+ /*
+ * wait the discard work to drain all of ext4_free_data
+ */
+ flush_work(&sbi->s_discard_work);
+ WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
+ }
+
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
cond_resched();
grinfo = ext4_get_group_info(sb, i);
+ if (!grinfo)
+ continue;
mb_group_bb_bitmap_free(grinfo);
ext4_lock_group(sb, i);
count = ext4_mb_cleanup_pa(grinfo);
@@ -2844,6 +3914,8 @@ int ext4_mb_release(struct super_block *sb)
kvfree(group_info);
rcu_read_unlock();
}
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
iput(sbi->s_buddy_cache);
@@ -2854,17 +3926,18 @@ int ext4_mb_release(struct super_block *sb)
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
ext4_msg(sb, KERN_INFO,
- "mballoc: %u extents scanned, %u goal hits, "
+ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
"%u 2^N hits, %u breaks, %u lost",
atomic_read(&sbi->s_bal_ex_scanned),
+ atomic_read(&sbi->s_bal_groups_scanned),
atomic_read(&sbi->s_bal_goals),
atomic_read(&sbi->s_bal_2orders),
atomic_read(&sbi->s_bal_breaks),
atomic_read(&sbi->s_mb_lost_chunks));
ext4_msg(sb, KERN_INFO,
- "mballoc: %lu generated and it took %Lu",
- sbi->s_mb_buddies_generated,
- sbi->s_mb_generation_time);
+ "mballoc: %u generated and it took %llu",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ atomic64_read(&sbi->s_mb_generation_time));
ext4_msg(sb, KERN_INFO,
"mballoc: %u preallocated, %u discarded",
atomic_read(&sbi->s_mb_preallocated),
@@ -2872,13 +3945,11 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
-
- return 0;
+ kfree(sbi->s_mb_last_groups);
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count,
- struct bio **biop)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
ext4_fsblk_t discard_block;
@@ -2887,13 +3958,8 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- if (biop) {
- return __blkdev_issue_discard(sb->s_bdev,
- (sector_t)discard_block << (sb->s_blocksize_bits - 9),
- (sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, 0, biop);
- } else
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
static void ext4_free_data_in_buddy(struct super_block *sb,
@@ -2901,7 +3967,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
{
struct ext4_buddy e4b;
struct ext4_group_info *db;
- int err, count = 0, count2 = 0;
+ int err, count = 0;
mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
@@ -2910,14 +3976,10 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
- spin_lock(&EXT4_SB(sb)->s_md_lock);
- EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
- spin_unlock(&EXT4_SB(sb)->s_md_lock);
-
+ atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
count += entry->efd_count;
- count2++;
ext4_lock_group(sb, entry->efd_group);
/* Take it out of per group rb tree */
rb_erase(&entry->efd_node, &(db->bb_free_root));
@@ -2926,25 +3988,20 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/*
* Clear the trimmed flag for the group so that the next
* ext4_trim_fs can trim it.
- * If the volume is mounted with -o discard, online discard
- * is supported and the free blocks will be trimmed online.
*/
- if (!test_opt(sb, DISCARD))
- EXT4_MB_GRP_CLEAR_TRIMMED(db);
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
if (!db->bb_free_root.rb_node) {
/* No more items in the per group rb tree
* balance refcounts from ext4_mb_free_metadata()
*/
- put_page(e4b.bd_buddy_page);
- put_page(e4b.bd_bitmap_page);
+ folio_put(e4b.bd_buddy_folio);
+ folio_put(e4b.bd_bitmap_folio);
}
ext4_unlock_group(sb, entry->efd_group);
- kmem_cache_free(ext4_free_data_cachep, entry);
ext4_mb_unload_buddy(&e4b);
- mb_debug(sb, "freed %d blocks in %d structures\n", count,
- count2);
+ mb_debug(sb, "freed %d blocks in 1 structures\n", count);
}
/*
@@ -2955,48 +4012,26 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_data *entry, *tmp;
- struct bio *discard_bio = NULL;
- struct list_head freed_data_list;
- struct list_head *cut_pos = NULL;
- int err;
+ LIST_HEAD(freed_data_list);
+ struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1];
+ bool wake;
- INIT_LIST_HEAD(&freed_data_list);
+ list_replace_init(s_freed_head, &freed_data_list);
- spin_lock(&sbi->s_md_lock);
- list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
- if (entry->efd_tid != commit_tid)
- break;
- cut_pos = &entry->efd_list;
- }
- if (cut_pos)
- list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
- cut_pos);
- spin_unlock(&sbi->s_md_lock);
+ list_for_each_entry(entry, &freed_data_list, efd_list)
+ ext4_free_data_in_buddy(sb, entry);
if (test_opt(sb, DISCARD)) {
- list_for_each_entry(entry, &freed_data_list, efd_list) {
- err = ext4_issue_discard(sb, entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count,
- &discard_bio);
- if (err && err != -EOPNOTSUPP) {
- ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%d failed"
- " with %d", entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count, err);
- } else if (err == -EOPNOTSUPP)
- break;
- }
-
- if (discard_bio) {
- submit_bio_wait(discard_bio);
- bio_put(discard_bio);
- }
+ spin_lock(&sbi->s_md_lock);
+ wake = list_empty(&sbi->s_discard_list);
+ list_splice_tail(&freed_data_list, &sbi->s_discard_list);
+ spin_unlock(&sbi->s_md_lock);
+ if (wake)
+ queue_work(system_dfl_wq, &sbi->s_discard_work);
+ } else {
+ list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
-
- list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
- ext4_free_data_in_buddy(sb, entry);
}
int __init ext4_init_mballoc(void)
@@ -3039,6 +4074,111 @@ void ext4_exit_mballoc(void)
ext4_groupinfo_destroy_slabs();
}
+#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
+#define EXT4_MB_SYNC_UPDATE 0x0002
+static int
+ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ int err;
+ unsigned int i, already, changed = len;
+
+ KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
+ handle, sb, state, group, blkoff, len,
+ flags, ret_changed);
+
+ if (ret_changed)
+ *ret_changed = 0;
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh))
+ return PTR_ERR(bitmap_bh);
+
+ if (handle) {
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ goto out_err;
+
+ if (handle) {
+ BUFFER_TRACE(gdp_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, gdp_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+
+ if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
+ already = 0;
+ for (i = 0; i < len; i++)
+ if (mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ state)
+ already++;
+ changed = len - already;
+ }
+
+ if (state) {
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) - changed);
+ } else {
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) + changed);
+ }
+
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_unlock_group(sb, group);
+ if (ret_changed)
+ *ret_changed = changed;
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
+
+ if (state)
+ atomic64_sub(changed, &fg->free_clusters);
+ else
+ atomic64_add(changed, &fg->free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (err)
+ goto out_err;
+ err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+ if (err)
+ goto out_err;
+
+ if (flags & EXT4_MB_SYNC_UPDATE) {
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ }
+
+out_err:
+ brelse(bitmap_bh);
+ return err;
+}
/*
* Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
@@ -3048,13 +4188,13 @@ static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
handle_t *handle, unsigned int reserv_clstrs)
{
- struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block;
int err, len;
+ int flags = 0;
+ ext4_grpblk_t changed;
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(ac->ac_b_ex.fe_len <= 0);
@@ -3062,76 +4202,44 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto out_err;
- }
-
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
- if (err)
- goto out_err;
-
- err = -EIO;
- gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+ gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL);
if (!gdp)
- goto out_err;
-
+ return -EIO;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
ext4_free_group_clusters(sb, gdp));
- BUFFER_TRACE(gdp_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdp_bh);
- if (err)
- goto out_err;
-
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
-
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (!ext4_data_block_valid(sbi, block, len)) {
+ if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata", block, block+len);
/* File system mounted not to panic on error
* Fix the bitmap and return EFSCORRUPTED
* We leak some of the blocks here.
*/
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ err = ext4_mb_mark_context(handle, sb, true,
+ ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len,
+ 0, NULL);
if (!err)
err = -EFSCORRUPTED;
- goto out_err;
+ return err;
}
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
- BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
- bitmap_bh->b_data));
- }
- }
+ flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
- }
- len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_group_clusters_set(sb, gdp, len);
- ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
+ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len,
+ flags, &changed);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+ if (err && changed == 0)
+ return err;
+
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != ac->ac_b_ex.fe_len);
+#endif
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
@@ -3141,22 +4249,56 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
percpu_counter_sub(&sbi->s_dirtyclusters_counter,
reserv_clstrs);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi,
- ac->ac_b_ex.fe_group);
- atomic64_sub(ac->ac_b_ex.fe_len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
+ return err;
+}
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (err)
- goto out_err;
- err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+/*
+ * Idempotent helper for Ext4 fast commit replay path to set the state of
+ * blocks in bitmaps and update counters.
+ */
+void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, bool state)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int err = 0;
+ unsigned int clen, thisgrp_len;
-out_err:
- brelse(bitmap_bh);
- return err;
+ while (len > 0) {
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ * In case of flex_bg, this can happen that (block, len) may
+ * span across more than one group. In that case we need to
+ * get the corresponding group metadata to work with.
+ * For this we have goto again loop.
+ */
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
+
+ if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
+ ext4_error(sb, "Marking blocks in system zone - "
+ "Block = %llu, len = %u",
+ block, thisgrp_len);
+ break;
+ }
+
+ err = ext4_mb_mark_context(NULL, sb, state,
+ group, blkoff, clen,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
+ if (err)
+ break;
+
+ block += thisgrp_len;
+ len -= thisgrp_len;
+ BUG_ON(len < 0);
+ }
}
/*
@@ -3179,6 +4321,197 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
}
/*
+ * This function returns the next element to look at during inode
+ * PA rbtree walk. We assume that we have held the inode PA rbtree lock
+ * (ei->i_prealloc_lock)
+ *
+ * new_start The start of the range we want to compare
+ * cur_start The existing start that we are comparing against
+ * node The node of the rb_tree
+ */
+static inline struct rb_node*
+ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
+{
+ if (new_start < cur_start)
+ return node->rb_left;
+ else
+ return node->rb_right;
+}
+
+static inline void
+ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
+ ext4_lblk_t start, loff_t end)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+ struct ext4_prealloc_space *tmp_pa;
+ ext4_lblk_t tmp_pa_start;
+ loff_t tmp_pa_end;
+ struct rb_node *iter;
+
+ read_lock(&ei->i_prealloc_lock);
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = pa_logical_end(sbi, tmp_pa);
+
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0)
+ BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+ read_unlock(&ei->i_prealloc_lock);
+}
+
+/*
+ * Given an allocation context "ac" and a range "start", "end", check
+ * and adjust boundaries if the range overlaps with any of the existing
+ * preallocatoins stored in the corresponding inode of the allocation context.
+ *
+ * Parameters:
+ * ac allocation context
+ * start start of the new range
+ * end end of the new range
+ */
+static inline void
+ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
+ ext4_lblk_t *start, loff_t *end)
+{
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
+ struct rb_node *iter;
+ ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1;
+ loff_t new_end, tmp_pa_end, left_pa_end = -1;
+
+ new_start = *start;
+ new_end = *end;
+
+ /*
+ * Adjust the normalized range so that it doesn't overlap with any
+ * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
+ * so it doesn't change underneath us.
+ */
+ read_lock(&ei->i_prealloc_lock);
+
+ /* Step 1: find any one immediate neighboring PA of the normalized range */
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = pa_logical_end(sbi, tmp_pa);
+
+ /* PA must not overlap original request */
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0)
+ BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
+ ac->ac_o_ex.fe_logical < tmp_pa_start));
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+
+ /*
+ * Step 2: check if the found PA is left or right neighbor and
+ * get the other neighbor
+ */
+ if (tmp_pa) {
+ if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
+ struct rb_node *tmp;
+
+ left_pa = tmp_pa;
+ tmp = rb_next(&left_pa->pa_node.inode_node);
+ if (tmp) {
+ right_pa = rb_entry(tmp,
+ struct ext4_prealloc_space,
+ pa_node.inode_node);
+ }
+ } else {
+ struct rb_node *tmp;
+
+ right_pa = tmp_pa;
+ tmp = rb_prev(&right_pa->pa_node.inode_node);
+ if (tmp) {
+ left_pa = rb_entry(tmp,
+ struct ext4_prealloc_space,
+ pa_node.inode_node);
+ }
+ }
+ }
+
+ /* Step 3: get the non deleted neighbors */
+ if (left_pa) {
+ for (iter = &left_pa->pa_node.inode_node;;
+ iter = rb_prev(iter)) {
+ if (!iter) {
+ left_pa = NULL;
+ break;
+ }
+
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ left_pa = tmp_pa;
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0) {
+ spin_unlock(&tmp_pa->pa_lock);
+ break;
+ }
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+ }
+
+ if (right_pa) {
+ for (iter = &right_pa->pa_node.inode_node;;
+ iter = rb_next(iter)) {
+ if (!iter) {
+ right_pa = NULL;
+ break;
+ }
+
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ right_pa = tmp_pa;
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0) {
+ spin_unlock(&tmp_pa->pa_lock);
+ break;
+ }
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+ }
+
+ if (left_pa) {
+ left_pa_end = pa_logical_end(sbi, left_pa);
+ BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
+ }
+
+ if (right_pa) {
+ right_pa_start = right_pa->pa_lstart;
+ BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
+ }
+
+ /* Step 4: trim our normalized range to not overlap with the neighbors */
+ if (left_pa) {
+ if (left_pa_end > new_start)
+ new_start = left_pa_end;
+ }
+
+ if (right_pa) {
+ if (right_pa_start < new_end)
+ new_end = right_pa_start;
+ }
+ read_unlock(&ei->i_prealloc_lock);
+
+ /* XXX: extra loop to check we really don't overlap preallocations */
+ ext4_mb_pa_assert_overlap(ac, new_start, new_end);
+
+ *start = new_start;
+ *end = new_end;
+}
+
+/*
* Normalization means making request better in terms of
* size and alignment
*/
@@ -3187,13 +4520,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_super_block *es = sbi->s_es;
int bsbits, max;
- ext4_lblk_t end;
- loff_t size, start_off;
+ loff_t size, start_off, end;
loff_t orig_size __maybe_unused;
ext4_lblk_t start;
- struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
- struct ext4_prealloc_space *pa;
/* do normalize only data requests, metadata requests
do not need preallocation */
@@ -3218,7 +4549,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ size = extent_logical_end(sbi, &ac->ac_o_ex);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -3255,19 +4586,32 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(22 - bsbits)) << 22;
size = 4 * 1024 * 1024;
- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+ } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
(8<<20)>>bsbits, max, 8 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(23 - bsbits)) << 23;
size = 8 * 1024 * 1024;
} else {
start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
- size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
+ size = (loff_t) EXT4_C2B(sbi,
ac->ac_o_ex.fe_len) << bsbits;
}
size = size >> bsbits;
start = start_off >> bsbits;
+ /*
+ * For tiny groups (smaller than 8MB) the chosen allocation
+ * alignment may be larger than group size. Make sure the
+ * alignment does not move allocation to a different group which
+ * makes mballoc fail assertions later.
+ */
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+
+ /* avoid unnecessary preallocation that may trigger assertions */
+ if (start + size > EXT_MAX_BLOCKS)
+ size = EXT_MAX_BLOCKS - start;
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -3285,62 +4629,26 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
end = start + size;
- /* check we don't cross already preallocated blocks */
- rcu_read_lock();
- list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- ext4_lblk_t pa_end;
-
- if (pa->pa_deleted)
- continue;
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
+ ext4_mb_pa_adjust_overlap(ac, &start, &end);
- pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
- pa->pa_len);
-
- /* PA must not overlap original request */
- BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
- ac->ac_o_ex.fe_logical < pa->pa_lstart));
-
- /* skip PAs this normalized request doesn't overlap with */
- if (pa->pa_lstart >= end || pa_end <= start) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
- BUG_ON(pa->pa_lstart <= start && pa_end >= end);
-
- /* adjust start or end to be adjacent to this pa */
- if (pa_end <= ac->ac_o_ex.fe_logical) {
- BUG_ON(pa_end < start);
- start = pa_end;
- } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
- BUG_ON(pa->pa_lstart > end);
- end = pa->pa_lstart;
- }
- spin_unlock(&pa->pa_lock);
- }
- rcu_read_unlock();
size = end - start;
- /* XXX: extra loop to check we really don't overlap preallocations */
- rcu_read_lock();
- list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- ext4_lblk_t pa_end;
-
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
- pa->pa_len);
- BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
- }
- spin_unlock(&pa->pa_lock);
- }
- rcu_read_unlock();
-
- if (start + size <= ac->ac_o_ex.fe_logical &&
+ /*
+ * In this function "start" and "size" are normalized for better
+ * alignment and length such that we could preallocate more blocks.
+ * This normalization is done such that original request of
+ * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
+ * "size" boundaries.
+ * (Note fe_len can be relaxed since FS block allocation API does not
+ * provide gurantee on number of contiguous blocks allocation since that
+ * depends upon free space left, etc).
+ * In case of inode pa, later we use the allocated blocks
+ * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
+ * range of goal/best blocks [start, size] to put it at the
+ * ac_o_ex.fe_logical extent of this inode.
+ * (See ext4_mb_use_inode_pa() for more details)
+ */
+ if (start + size <= ac->ac_o_ex.fe_logical ||
start > ac->ac_o_ex.fe_logical) {
ext4_msg(ac->ac_sb, KERN_ERR,
"start %lu, size %lu, fe_logical %lu",
@@ -3356,20 +4664,24 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
+ ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
/* define goal start in order to merge */
- if (ar->pright && (ar->lright == (start + size))) {
+ if (ar->pright && (ar->lright == (start + size)) &&
+ ar->pright >= size &&
+ ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
/* merge to the right */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- if (ar->pleft && (ar->lleft + 1 == start)) {
+ if (ar->pleft && (ar->lleft + 1 == start) &&
+ ar->pleft + 1 < ext4_blocks_count(es)) {
/* merge to the left */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
@@ -3381,15 +4693,25 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
+
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+ for (int i=0; i<EXT4_MB_NUM_CRS; i++) {
+ atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
+ }
+
+ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
atomic_inc(&sbi->s_bal_goals);
+ /* did we allocate as much as normalizer originally wanted? */
+ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
+ atomic_inc(&sbi->s_bal_len_goals);
+
if (ac->ac_found > sbi->s_mb_max_to_scan)
atomic_inc(&sbi->s_bal_breaks);
}
@@ -3416,15 +4738,14 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
if (ac->ac_f_ex.fe_len == 0)
return;
err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
- if (err) {
+ if (WARN_RATELIMIT(err,
+ "ext4: mb_load_buddy failed (%d)", err))
/*
* This should never happen since we pin the
- * pages in the ext4_allocation_context so
+ * folios in the ext4_allocation_context so
* ext4_mb_load_buddy() should never fail.
*/
- WARN(1, "mb_load_buddy failed (%d)", err);
return;
- }
ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
ac->ac_f_ex.fe_len);
@@ -3432,8 +4753,11 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
ext4_mb_unload_buddy(&e4b);
return;
}
- if (pa->pa_type == MB_INODE_PA)
+ if (pa->pa_type == MB_INODE_PA) {
+ spin_lock(&pa->pa_lock);
pa->pa_free += ac->ac_b_ex.fe_len;
+ spin_unlock(&pa->pa_lock);
+ }
}
/*
@@ -3461,6 +4785,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(start < pa->pa_pstart);
BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
+ BUG_ON(ac->ac_b_ex.fe_len <= 0);
pa->pa_free -= len;
mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
@@ -3481,14 +4806,14 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
ac->ac_status = AC_STATUS_FOUND;
ac->ac_pa = pa;
- /* we don't correct pa_pstart or pa_plen here to avoid
+ /* we don't correct pa_pstart or pa_len here to avoid
* possible race when the group is being loaded concurrently
* instead we correct pa later, after blocks are marked
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
- pa->pa_lstart-len, len, pa);
+ pa->pa_lstart, len, pa);
}
/*
@@ -3521,6 +4846,37 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
}
/*
+ * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
+ */
+static bool
+ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_fsblk_t start;
+
+ if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
+ return true;
+
+ /*
+ * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
+ * in ext4_mb_normalize_request and will keep same with ac_o_ex
+ * from ext4_mb_initialize_context. Choose ac_g_ex here to keep
+ * consistent with ext4_mb_find_by_goal.
+ */
+ start = pa->pa_pstart +
+ (ac->ac_g_ex.fe_logical - pa->pa_lstart);
+ if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
+ return false;
+
+ if (ac->ac_g_ex.fe_len > pa->pa_len -
+ EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
+ return false;
+
+ return true;
+}
+
+/*
* search goal blocks in preallocated space
*/
static noinline_for_stack bool
@@ -3530,43 +4886,158 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
- struct ext4_prealloc_space *pa, *cpa = NULL;
+ struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
+ struct rb_node *iter;
ext4_fsblk_t goal_block;
/* only data can be preallocated */
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return false;
- /* first, try per-file preallocation */
- rcu_read_lock();
- list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+ /*
+ * first, try per-file preallocation by searching the inode pa rbtree.
+ *
+ * Here, we can't do a direct traversal of the tree because
+ * ext4_mb_discard_group_preallocation() can paralelly mark the pa
+ * deleted and that can cause direct traversal to skip some entries.
+ */
+ read_lock(&ei->i_prealloc_lock);
- /* all fields in this condition don't change,
- * so we can skip locking for them */
- if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
- EXT4_C2B(sbi, pa->pa_len)))
- continue;
+ if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
+ goto try_group_pa;
+ }
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
- EXT4_MAX_BLOCK_FILE_PHYS))
- continue;
+ /*
+ * Step 1: Find a pa with logical start immediately adjacent to the
+ * original logical start. This could be on the left or right.
+ *
+ * (tmp_pa->pa_lstart never changes so we can skip locking for it).
+ */
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa->pa_lstart, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ }
- /* found preallocated blocks, use them */
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0 && pa->pa_free) {
- atomic_inc(&pa->pa_count);
- ext4_mb_use_inode_pa(ac, pa);
- spin_unlock(&pa->pa_lock);
- ac->ac_criteria = 10;
- rcu_read_unlock();
- return true;
+ /*
+ * Step 2: The adjacent pa might be to the right of logical start, find
+ * the left adjacent pa. After this step we'd have a valid tmp_pa whose
+ * logical start is towards the left of original request's logical start
+ */
+ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ struct rb_node *tmp;
+ tmp = rb_prev(&tmp_pa->pa_node.inode_node);
+
+ if (tmp) {
+ tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ } else {
+ /*
+ * If there is no adjacent pa to the left then finding
+ * an overlapping pa is not possible hence stop searching
+ * inode pa tree
+ */
+ goto try_group_pa;
}
- spin_unlock(&pa->pa_lock);
}
- rcu_read_unlock();
+
+ BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+
+ /*
+ * Step 3: If the left adjacent pa is deleted, keep moving left to find
+ * the first non deleted adjacent pa. After this step we should have a
+ * valid tmp_pa which is guaranteed to be non deleted.
+ */
+ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
+ if (!iter) {
+ /*
+ * no non deleted left adjacent pa, so stop searching
+ * inode pa tree
+ */
+ goto try_group_pa;
+ }
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0) {
+ /*
+ * We will keep holding the pa_lock from
+ * this point on because we don't want group discard
+ * to delete this pa underneath us. Since group
+ * discard is anyways an ENOSPC operation it
+ * should be okay for it to wait a few more cycles.
+ */
+ break;
+ } else {
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+ }
+
+ BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+ BUG_ON(tmp_pa->pa_deleted == 1);
+
+ /*
+ * Step 4: We now have the non deleted left adjacent pa. Only this
+ * pa can possibly satisfy the request hence check if it overlaps
+ * original logical start and stop searching if it doesn't.
+ */
+ if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) {
+ spin_unlock(&tmp_pa->pa_lock);
+ goto try_group_pa;
+ }
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+ (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS)) {
+ /*
+ * Since PAs don't overlap, we won't find any other PA to
+ * satisfy this.
+ */
+ spin_unlock(&tmp_pa->pa_lock);
+ goto try_group_pa;
+ }
+
+ if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
+ atomic_inc(&tmp_pa->pa_count);
+ ext4_mb_use_inode_pa(ac, tmp_pa);
+ spin_unlock(&tmp_pa->pa_lock);
+ read_unlock(&ei->i_prealloc_lock);
+ return true;
+ } else {
+ /*
+ * We found a valid overlapping pa but couldn't use it because
+ * it had no free blocks. This should ideally never happen
+ * because:
+ *
+ * 1. When a new inode pa is added to rbtree it must have
+ * pa_free > 0 since otherwise we won't actually need
+ * preallocation.
+ *
+ * 2. An inode pa that is in the rbtree can only have it's
+ * pa_free become zero when another thread calls:
+ * ext4_mb_new_blocks
+ * ext4_mb_use_preallocated
+ * ext4_mb_use_inode_pa
+ *
+ * 3. Further, after the above calls make pa_free == 0, we will
+ * immediately remove it from the rbtree in:
+ * ext4_mb_new_blocks
+ * ext4_mb_release_context
+ * ext4_mb_put_pa
+ *
+ * 4. Since the pa_free becoming 0 and pa_free getting removed
+ * from tree both happen in ext4_mb_new_blocks, which is always
+ * called with i_data_sem held for data allocations, we can be
+ * sure that another process will never see a pa in rbtree with
+ * pa_free == 0.
+ */
+ WARN_ON_ONCE(tmp_pa->pa_free == 0);
+ }
+ spin_unlock(&tmp_pa->pa_lock);
+try_group_pa:
+ read_unlock(&ei->i_prealloc_lock);
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
@@ -3588,52 +5059,27 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
*/
for (i = order; i < PREALLOC_TB_SIZE; i++) {
rcu_read_lock();
- list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
- pa_inode_list) {
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0 &&
- pa->pa_free >= ac->ac_o_ex.fe_len) {
+ list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
+ pa_node.lg_list) {
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0 &&
+ tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {
cpa = ext4_mb_check_group_pa(goal_block,
- pa, cpa);
+ tmp_pa, cpa);
}
- spin_unlock(&pa->pa_lock);
+ spin_unlock(&tmp_pa->pa_lock);
}
rcu_read_unlock();
}
if (cpa) {
ext4_mb_use_group_pa(ac, cpa);
- ac->ac_criteria = 20;
return true;
}
return false;
}
/*
- * the function goes through all block freed in the group
- * but not yet committed and marks them used in in-core bitmap.
- * buddy must be generated from this bitmap
- * Need to be called with the ext4 group lock held
- */
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group)
-{
- struct rb_node *n;
- struct ext4_group_info *grp;
- struct ext4_free_data *entry;
-
- grp = ext4_get_group_info(sb, group);
- n = rb_first(&(grp->bb_free_root));
-
- while (n) {
- entry = rb_entry(n, struct ext4_free_data, efd_node);
- ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
- n = rb_next(n);
- }
- return;
-}
-
-/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
* Need to be called with ext4 group lock held
@@ -3650,6 +5096,9 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
int preallocated = 0;
int len;
+ if (!grp)
+ return;
+
/* all form of preallocation discards first load group,
* so the only competing code is preallocation use.
* we don't need any locking here
@@ -3668,22 +5117,48 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- ext4_set_bits(bitmap, start, len);
+ mb_set_bits(bitmap, start, len);
preallocated += len;
}
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
}
-static void ext4_mb_pa_callback(struct rcu_head *head)
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+ struct ext4_prealloc_space *pa)
{
- struct ext4_prealloc_space *pa;
- pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+ struct ext4_inode_info *ei;
+
+ if (pa->pa_deleted) {
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+ pa->pa_len);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+ if (pa->pa_type == MB_INODE_PA) {
+ ei = EXT4_I(pa->pa_inode);
+ atomic_dec(&ei->i_prealloc_active);
+ }
+}
+
+static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
+{
+ BUG_ON(!pa);
BUG_ON(atomic_read(&pa->pa_count));
BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+ struct ext4_prealloc_space *pa;
+
+ pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+ ext4_mb_pa_free(pa);
+}
+
/*
* drops a reference to preallocated space descriptor
* if this was the last reference and the space is consumed
@@ -3693,6 +5168,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
{
ext4_group_t grp;
ext4_fsblk_t grp_blk;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
@@ -3706,7 +5182,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
return;
}
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
@@ -3737,11 +5213,42 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
list_del(&pa->pa_group_list);
ext4_unlock_group(sb, grp);
- spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
- spin_unlock(pa->pa_obj_lock);
+ if (pa->pa_type == MB_INODE_PA) {
+ write_lock(pa->pa_node_lock.inode_lock);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
+ ext4_mb_pa_free(pa);
+ } else {
+ spin_lock(pa->pa_node_lock.lg_lock);
+ list_del_rcu(&pa->pa_node.lg_list);
+ spin_unlock(pa->pa_node_lock.lg_lock);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+}
+
+static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
+{
+ struct rb_node **iter = &root->rb_node, *parent = NULL;
+ struct ext4_prealloc_space *iter_pa, *new_pa;
+ ext4_lblk_t iter_start, new_start;
+
+ while (*iter) {
+ iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ new_pa = rb_entry(new, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ iter_start = iter_pa->pa_lstart;
+ new_start = new_pa->pa_lstart;
+
+ parent = *iter;
+ if (new_start < iter_start)
+ iter = &((*iter)->rb_left);
+ else
+ iter = &((*iter)->rb_right);
+ }
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ rb_link_node(new, parent, iter);
+ rb_insert_color(new, root);
}
/*
@@ -3764,50 +5271,59 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa = ac->ac_pa;
- if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
- int winl;
- int wins;
- int win;
- int offs;
+ if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
+ struct ext4_free_extent ex = {
+ .fe_logical = ac->ac_g_ex.fe_logical,
+ .fe_len = ac->ac_orig_goal_len,
+ };
+ loff_t orig_goal_end = extent_logical_end(sbi, &ex);
+ loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);
- /* we can't allocate as much as normalizer wants.
- * so, found space must get proper lstart
- * to cover original request */
+ /*
+ * We can't allocate as much as normalizer wants, so we try
+ * to get proper lstart to cover the original request, except
+ * when the goal doesn't cover the original request as below:
+ *
+ * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
+ * best_ex:0/200(200) -> adjusted: 1848/2048(200)
+ */
BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
- /* we're limited by original request in that
- * logical block must be covered any way
- * winl is window we can move our chunk within */
- winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+ /*
+ * Use the below logic for adjusting best extent as it keeps
+ * fragmentation in check while ensuring logical range of best
+ * extent doesn't overflow out of goal extent:
+ *
+ * 1. Check if best ex can be kept at end of goal (before
+ * cr_best_avail trimmed it) and still cover original start
+ * 2. Else, check if best ex can be kept at start of goal and
+ * still cover original end
+ * 3. Else, keep the best ex at start of original request.
+ */
+ ex.fe_len = ac->ac_b_ex.fe_len;
- /* also, we should cover whole original request */
- wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
+ ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
+ if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
+ goto adjust_bex;
- /* the smallest one defines real window */
- win = min(winl, wins);
+ ex.fe_logical = ac->ac_g_ex.fe_logical;
+ if (o_ex_end <= extent_logical_end(sbi, &ex))
+ goto adjust_bex;
- offs = ac->ac_o_ex.fe_logical %
- EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (offs && offs < win)
- win = offs;
+ ex.fe_logical = ac->ac_o_ex.fe_logical;
+adjust_bex:
+ ac->ac_b_ex.fe_logical = ex.fe_logical;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
- EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
- BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+ BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
}
- /* preallocation can change ac_b_ex, thus we store actually
- * allocated blocks for history */
- ac->ac_f_ex = ac->ac_b_ex;
-
pa->pa_lstart = ac->ac_b_ex.fe_logical;
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
spin_lock_init(&pa->pa_lock);
- INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
@@ -3816,20 +5332,23 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
- ext4_mb_use_inode_pa(ac, pa);
atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
+ ext4_mb_use_inode_pa(ac, pa);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
- pa->pa_obj_lock = &ei->i_prealloc_lock;
+ pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode;
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- spin_lock(pa->pa_obj_lock);
- list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
- spin_unlock(pa->pa_obj_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
+ atomic_inc(&ei->i_prealloc_active);
}
/*
@@ -3851,16 +5370,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa = ac->ac_pa;
- /* preallocation can change ac_b_ex, thus we store actually
- * allocated blocks for history */
- ac->ac_f_ex = ac->ac_b_ex;
-
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_lstart = pa->pa_pstart;
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
spin_lock_init(&pa->pa_lock);
- INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_node.lg_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
@@ -3873,10 +5388,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
lg = ac->ac_lg;
BUG_ON(lg == NULL);
- pa->pa_obj_lock = &lg->lg_prealloc_lock;
+ pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;
pa->pa_inode = NULL;
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
@@ -3903,7 +5420,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
* the caller MUST hold group/inode locks.
* TODO: optimize the case when there are no in-core structures yet
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
struct ext4_prealloc_space *pa)
{
@@ -3953,11 +5470,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
*/
}
atomic_add(free, &sbi->s_mb_discarded);
-
- return 0;
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_group_pa(struct ext4_buddy *e4b,
struct ext4_prealloc_space *pa)
{
@@ -3968,12 +5483,14 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
+ ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
+ e4b->bd_group, group, pa->pa_pstart);
+ return;
+ }
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
-
- return 0;
}
/*
@@ -3987,17 +5504,19 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
*/
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
- ext4_group_t group, int needed)
+ ext4_group_t group, int *busy)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct buffer_head *bitmap_bh = NULL;
struct ext4_prealloc_space *pa, *tmp;
- struct list_head list;
+ LIST_HEAD(list);
struct ext4_buddy e4b;
+ struct ext4_inode_info *ei;
int err;
- int busy = 0;
int free = 0;
+ if (!grp)
+ return 0;
mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
goto out_dbg;
@@ -4019,19 +5538,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
goto out_dbg;
}
- if (needed == 0)
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
-
- INIT_LIST_HEAD(&list);
-repeat:
ext4_lock_group(sb, group);
- this_cpu_inc(discard_pa_seq);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
spin_unlock(&pa->pa_lock);
- busy = 1;
+ *busy = 1;
continue;
}
if (pa->pa_deleted) {
@@ -4040,7 +5553,10 @@ repeat:
}
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
+
+ if (!free)
+ this_cpu_inc(discard_pa_seq);
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -4051,40 +5567,32 @@ repeat:
list_add(&pa->u.pa_tmp_list, &list);
}
- /* if we still need more blocks and some PAs were used, try again */
- if (free < needed && busy) {
- busy = 0;
- ext4_unlock_group(sb, group);
- cond_resched();
- goto repeat;
- }
-
- /* found anything to free? */
- if (list_empty(&list)) {
- BUG_ON(free != 0);
- mb_debug(sb, "Someone else may have freed PA for this group %u\n",
- group);
- goto out;
- }
-
/* now free all selected PAs */
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
/* remove from object (inode or locality group) */
- spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
- spin_unlock(pa->pa_obj_lock);
+ if (pa->pa_type == MB_GROUP_PA) {
+ spin_lock(pa->pa_node_lock.lg_lock);
+ list_del_rcu(&pa->pa_node.lg_list);
+ spin_unlock(pa->pa_node_lock.lg_lock);
+ } else {
+ write_lock(pa->pa_node_lock.inode_lock);
+ ei = EXT4_I(pa->pa_inode);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
+ }
+
+ list_del(&pa->u.pa_tmp_list);
- if (pa->pa_type == MB_GROUP_PA)
+ if (pa->pa_type == MB_GROUP_PA) {
ext4_mb_release_group_pa(&e4b, pa);
- else
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ } else {
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
-
- list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
+ }
}
-out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
@@ -4110,34 +5618,37 @@ void ext4_discard_preallocations(struct inode *inode)
struct buffer_head *bitmap_bh = NULL;
struct ext4_prealloc_space *pa, *tmp;
ext4_group_t group = 0;
- struct list_head list;
+ LIST_HEAD(list);
struct ext4_buddy e4b;
+ struct rb_node *iter;
int err;
- if (!S_ISREG(inode->i_mode)) {
- /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- }
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
- trace_ext4_discard_preallocations(inode);
-
- INIT_LIST_HEAD(&list);
+ trace_ext4_discard_preallocations(inode,
+ atomic_read(&ei->i_prealloc_active));
repeat:
/* first, collect all pa's in the inode */
- spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list)) {
- pa = list_entry(ei->i_prealloc_list.next,
- struct ext4_prealloc_space, pa_inode_list);
- BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+ write_lock(&ei->i_prealloc_lock);
+ for (iter = rb_first(&ei->i_prealloc_node); iter;
+ iter = rb_next(iter)) {
+ pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);
+
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/* this shouldn't happen often - nobody should
* use preallocation while we're discarding it */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
ext4_msg(sb, KERN_ERR,
"uh-oh! used pa while discarding");
WARN_ON(1);
@@ -4146,16 +5657,16 @@ repeat:
}
if (pa->pa_deleted == 0) {
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
- list_del_rcu(&pa->pa_inode_list);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
list_add(&pa->u.pa_tmp_list, &list);
continue;
}
/* someone is deleting pa right now */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
/* we have to wait here because pa_deleted
* doesn't mean pa is already unlinked from
@@ -4172,7 +5683,7 @@ repeat:
schedule_timeout_uninterruptible(HZ);
goto repeat;
}
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
@@ -4204,7 +5715,7 @@ repeat:
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
}
}
@@ -4221,14 +5732,20 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
return 0;
}
-static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
BUG_ON(!pa);
ac->ac_pa = NULL;
WARN_ON(!atomic_dec_and_test(&pa->pa_count));
- kmem_cache_free(ext4_pspace_cachep, pa);
+ /*
+ * current function is only called due to an error or due to
+ * len of found blocks < len of requested blocks hence the PA has not
+ * been added to grp->bb_prealloc_list. So we don't need to lock it
+ */
+ pa->pa_deleted = 1;
+ ext4_mb_pa_free(pa);
}
#ifdef CONFIG_EXT4_DEBUG
@@ -4236,7 +5753,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
- if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_emergency_state(sb))
return;
ngroups = ext4_get_groups_count(sb);
@@ -4246,6 +5763,9 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
struct list_head *cur;
+
+ if (!grp)
+ continue;
ext4_lock_group(sb, i);
list_for_each(cur, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space,
@@ -4267,7 +5787,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_emergency_state(sb))
return;
mb_debug(sb, "Can't allocate:"
@@ -4291,17 +5811,19 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
mb_debug(sb, "%u found", ac->ac_found);
+ mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa));
+ if (ac->ac_pa)
+ mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
+ "group pa" : "inode pa");
ext4_mb_show_pa(sb);
}
#else
static inline void ext4_mb_show_pa(struct super_block *sb)
{
- return;
}
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
ext4_mb_show_pa(ac->ac_sb);
- return;
}
#endif
@@ -4317,6 +5839,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits = ac->ac_sb->s_blocksize_bits;
loff_t size, isize;
+ bool inode_pa_eligible, group_pa_eligible;
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
@@ -4324,25 +5847,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
- size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ group_pa_eligible = sbi->s_mb_group_prealloc > 0;
+ inode_pa_eligible = true;
+ size = extent_logical_end(sbi, &ac->ac_o_ex);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
+ /* No point in using inode preallocation for closed files */
if ((size == isize) && !ext4_fs_is_busy(sbi) &&
- !inode_is_open_for_write(ac->ac_inode)) {
- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
- return;
- }
+ !inode_is_open_for_write(ac->ac_inode))
+ inode_pa_eligible = false;
- if (sbi->s_mb_group_prealloc <= 0) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
- return;
- }
-
- /* don't use group allocation for large files */
size = max(size, isize);
- if (size > sbi->s_mb_stream_request) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ /* Don't use group allocation for large files */
+ if (size > sbi->s_mb_stream_request)
+ group_pa_eligible = false;
+
+ if (!group_pa_eligible) {
+ if (inode_pa_eligible)
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ else
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
@@ -4361,7 +5886,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
mutex_lock(&ac->ac_lg->lg_mutex);
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
@@ -4397,9 +5922,10 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
ac->ac_g_ex = ac->ac_o_ex;
+ ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
ac->ac_flags = ar->flags;
- /* we have to define context: we'll we work with a file or
+ /* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
@@ -4410,8 +5936,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
inode_is_open_for_write(ar->inode) ? "" : "non-");
- return 0;
-
}
static noinline_for_stack void
@@ -4421,16 +5945,14 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
{
ext4_group_t group = 0;
struct ext4_buddy e4b;
- struct list_head discard_list;
+ LIST_HEAD(discard_list);
struct ext4_prealloc_space *pa, *tmp;
mb_debug(sb, "discard locality group preallocation\n");
- INIT_LIST_HEAD(&discard_list);
-
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
- pa_inode_list,
+ pa_node.lg_list,
lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
@@ -4450,10 +5972,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_del_rcu(&pa->pa_node.lg_list);
list_add(&pa->u.pa_tmp_list, &discard_list);
total_entries--;
@@ -4514,7 +6036,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* Add the prealloc space to lg */
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
- pa_inode_list,
+ pa_node.lg_list,
lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
@@ -4523,8 +6045,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
}
if (!added && pa->pa_free < tmp_pa->pa_free) {
/* Add to the tail of the previous entry */
- list_add_tail_rcu(&pa->pa_inode_list,
- &tmp_pa->pa_inode_list);
+ list_add_tail_rcu(&pa->pa_node.lg_list,
+ &tmp_pa->pa_node.lg_list);
added = 1;
/*
* we want to count the total
@@ -4535,23 +6057,20 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
lg_prealloc_count++;
}
if (!added)
- list_add_tail_rcu(&pa->pa_inode_list,
+ list_add_tail_rcu(&pa->pa_node.lg_list,
&lg->lg_prealloc_list[order]);
spin_unlock(&lg->lg_prealloc_lock);
/* Now trim the list to be not more than 8 elements */
- if (lg_prealloc_count > 8) {
+ if (lg_prealloc_count > 8)
ext4_mb_discard_lg_preallocations(sb, lg,
order, lg_prealloc_count);
- return;
- }
- return ;
}
/*
* release all resource we used in allocation
*/
-static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+static void ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
@@ -4564,44 +6083,54 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
+
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_node_lock.lg_lock);
+ list_del_rcu(&pa->pa_node.lg_list);
+ spin_unlock(pa->pa_node_lock.lg_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- }
- if (pa) {
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
- spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
- spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
- }
+
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
- if (ac->ac_bitmap_page)
- put_page(ac->ac_bitmap_page);
- if (ac->ac_buddy_page)
- put_page(ac->ac_buddy_page);
+ if (ac->ac_bitmap_folio)
+ folio_put(ac->ac_bitmap_folio);
+ if (ac->ac_buddy_folio)
+ folio_put(ac->ac_buddy_folio);
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
- return 0;
}
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
- int freed = 0;
+ int freed = 0, busy = 0;
+ int retry = 0;
trace_ext4_mb_discard_preallocations(sb, needed);
+
+ if (needed == 0)
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+ repeat:
for (i = 0; i < ngroups && needed > 0; i++) {
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
freed += ret;
needed -= ret;
+ cond_resched();
+ }
+
+ if (needed > 0 && busy && ++retry < 3) {
+ busy = 0;
+ goto repeat;
}
return freed;
@@ -4627,11 +6156,79 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
}
out_dbg:
- mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret));
return ret;
}
/*
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
+ * linearly starting at the goal block and also excludes the blocks which
+ * are going to be in use after fast commit replay.
+ */
+static ext4_fsblk_t
+ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = ar->inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group, nr;
+ ext4_grpblk_t blkoff;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_fsblk_t goal, block;
+ struct ext4_super_block *es = sbi->s_es;
+
+ goal = ar->goal;
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+
+ ar->len = 0;
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
+ for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ *errp = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return 0;
+ }
+
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
+ blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, i))) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i < max)
+ break;
+
+ if (++group >= ext4_get_groups_count(sb))
+ group = 0;
+
+ blkoff = 0;
+ }
+
+ if (i >= max) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+
+ block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
+ ext4_mb_mark_bb(sb, block, 1, true);
+ ar->len = 1;
+
+ *errp = 0;
+ return block;
+}
+
+/*
* Main entry point into mballoc to allocate blocks
* it tries to use preallocation first, then falls back
* to usual allocation
@@ -4645,6 +6242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ int retries = 0;
u64 seq;
might_sleep();
@@ -4652,6 +6250,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ if (sbi->s_mount_state & EXT4_FC_REPLAY)
+ return ext4_mb_new_blocks_simple(ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
@@ -4701,14 +6301,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out;
}
- *errp = ext4_mb_initialize_context(ac, ar);
- if (*errp) {
- ar->len = 0;
- goto out;
- }
+ ext4_mb_initialize_context(ac, ar);
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
- seq = *this_cpu_ptr(&discard_pa_seq);
+ seq = this_cpu_read(discard_pa_seq);
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
@@ -4727,13 +6323,13 @@ repeat:
* So we have to free this pa here itself.
*/
if (*errp) {
- ext4_mb_pa_free(ac);
+ ext4_mb_pa_put_free(ac);
ext4_discard_allocated_blocks(ac);
goto errout;
}
if (ac->ac_status == AC_STATUS_FOUND &&
ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
- ext4_mb_pa_free(ac);
+ ext4_mb_pa_put_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4745,26 +6341,26 @@ repeat:
ar->len = ac->ac_b_ex.fe_len;
}
} else {
- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
+ if (++retries < 3 &&
+ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
goto repeat;
/*
* If block allocation fails then the pa allocated above
* needs to be freed here itself.
*/
- ext4_mb_pa_free(ac);
+ ext4_mb_pa_put_free(ac);
*errp = -ENOSPC;
}
-errout:
if (*errp) {
+errout:
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
ext4_mb_show_ac(ac);
}
ext4_mb_release_context(ac);
+ kmem_cache_free(ext4_ac_cachep, ac);
out:
- if (ac)
- kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
@@ -4784,47 +6380,83 @@ out:
* are contiguous, AND the extents were freed by the same transaction,
* AND the blocks are associated with the same group.
*/
-static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
- struct ext4_free_data *entry,
- struct ext4_free_data *new_entry,
- struct rb_root *entry_rb_root)
+static inline bool
+ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
{
- if ((entry->efd_tid != new_entry->efd_tid) ||
- (entry->efd_group != new_entry->efd_group))
- return;
- if (entry->efd_start_cluster + entry->efd_count ==
- new_entry->efd_start_cluster) {
- new_entry->efd_start_cluster = entry->efd_start_cluster;
- new_entry->efd_count += entry->efd_count;
- } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
- entry->efd_start_cluster) {
- new_entry->efd_count += entry->efd_count;
- } else
- return;
+ if (entry1->efd_tid != entry2->efd_tid)
+ return false;
+ if (entry1->efd_start_cluster + entry1->efd_count !=
+ entry2->efd_start_cluster)
+ return false;
+ if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group))
+ return false;
+ return true;
+}
+
+static inline void
+ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ entry1->efd_count += entry2->efd_count;
spin_lock(&sbi->s_md_lock);
- list_del(&entry->efd_list);
+ list_del(&entry2->efd_list);
spin_unlock(&sbi->s_md_lock);
- rb_erase(&entry->efd_node, entry_rb_root);
- kmem_cache_free(ext4_free_data_cachep, entry);
+ rb_erase(&entry2->efd_node, root);
+ kmem_cache_free(ext4_free_data_cachep, entry2);
}
-static noinline_for_stack int
+static inline void
+ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *prev;
+ struct rb_node *node;
+
+ node = rb_prev(&entry->efd_node);
+ if (!node)
+ return;
+
+ prev = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(prev, entry))
+ ext4_merge_freed_extents(sbi, root, prev, entry);
+}
+
+static inline void
+ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *next;
+ struct rb_node *node;
+
+ node = rb_next(&entry->efd_node);
+ if (!node)
+ return;
+
+ next = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(entry, next))
+ ext4_merge_freed_extents(sbi, root, entry, next);
+}
+
+static noinline_for_stack void
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_free_data *new_entry)
{
ext4_group_t group = e4b->bd_group;
ext4_grpblk_t cluster;
ext4_grpblk_t clusters = new_entry->efd_count;
- struct ext4_free_data *entry;
+ struct ext4_free_data *entry = NULL;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_root *root = &db->bb_free_root;
+ struct rb_node **n = &root->rb_node;
struct rb_node *parent = NULL, *new_node;
BUG_ON(!ext4_handle_valid(handle));
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
+ BUG_ON(e4b->bd_bitmap_folio == NULL);
+ BUG_ON(e4b->bd_buddy_folio == NULL);
new_node = &new_entry->efd_node;
cluster = new_entry->efd_start_cluster;
@@ -4835,8 +6467,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* otherwise we'll refresh it from
* on-disk bitmap and lose not-yet-available
* blocks */
- get_page(e4b->bd_buddy_page);
- get_page(e4b->bd_bitmap_page);
+ folio_get(e4b->bd_buddy_folio);
+ folio_get(e4b->bd_bitmap_folio);
}
while (*n) {
parent = *n;
@@ -4850,137 +6482,94 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_first_block_no(sb, group) +
EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
- return 0;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ return;
}
}
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &db->bb_free_root);
+ atomic_add(clusters, &sbi->s_mb_free_pending);
+ if (!entry)
+ goto insert;
- /* Now try to see the extent can be merged to left and right */
- node = rb_prev(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ /* Now try to see the extent can be merged to prev and next */
+ if (ext4_freed_extents_can_be_merged(new_entry, entry)) {
+ entry->efd_start_cluster = cluster;
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_prev(sbi, root, entry);
+ return;
}
-
- node = rb_next(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ if (ext4_freed_extents_can_be_merged(entry, new_entry)) {
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_next(sbi, root, entry);
+ return;
}
+insert:
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, root);
spin_lock(&sbi->s_md_lock);
- list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
- sbi->s_mb_free_pending += clusters;
+ list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
spin_unlock(&sbi->s_md_lock);
- return 0;
+}
+
+static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
+ unsigned long count)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+ ext4_mb_mark_context(NULL, sb, false, group, blkoff, count,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
}
/**
- * ext4_free_blocks() -- Free given blocks and update quota
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
+ * Used by ext4_free_blocks()
* @handle: handle for this transaction
* @inode: inode
- * @bh: optional buffer of the block to be freed
* @block: starting physical block to be freed
* @count: number of blocks to be freed
* @flags: flags used by ext4_free_blocks
*/
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block,
- unsigned long count, int flags)
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count,
+ int flags)
{
- struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
- struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
unsigned int overflow;
ext4_grpblk_t bit;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
struct ext4_buddy e4b;
unsigned int count_clusters;
int err = 0;
- int ret;
-
- might_sleep();
- if (bh) {
- if (block)
- BUG_ON(block != bh->b_blocknr);
- else
- block = bh->b_blocknr;
- }
+ int mark_flags = 0;
+ ext4_grpblk_t changed;
sbi = EXT4_SB(sb);
- if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_data_block_valid(sbi, block, count)) {
- ext4_error(sb, "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
- goto error_return;
- }
-
- ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, flags);
-
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- BUG_ON(count > 1);
-
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block);
- }
-
- /*
- * If the extent to be freed does not begin on a cluster
- * boundary, we need to deal with partial clusters at the
- * beginning and end of the extent. Normally we will free
- * blocks at the beginning or the end unless we are explicitly
- * requested to avoid doing so.
- */
- overflow = EXT4_PBLK_COFF(sbi, block);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
- overflow = sbi->s_cluster_ratio - overflow;
- block += overflow;
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else {
- block -= overflow;
- count += overflow;
- }
- }
- overflow = EXT4_LBLK_COFF(sbi, count);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else
- count += sbi->s_cluster_ratio - overflow;
- }
-
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- int i;
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
- for (i = 0; i < count; i++) {
- cond_resched();
- if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
- ext4_forget(handle, is_metadata, inode, bh, block + i);
- }
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
+ goto error_out;
}
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
- ext4_get_group_info(sb, block_group))))
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return;
/*
@@ -4991,61 +6580,39 @@ do_more:
overflow = EXT4_C2B(sbi, bit) + count -
EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
- }
- gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!gdp) {
- err = -EIO;
- goto error_return;
- }
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
- in_range(block, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group)) {
+ /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
+ if (err)
+ goto error_out;
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < count_clusters; i++)
- BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
- }
+ mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ count_clusters, mark_flags, &changed);
- /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
- err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
- GFP_NOFS|__GFP_NOFAIL);
- if (err)
- goto error_return;
+
+ if (err && changed == 0)
+ goto error_clean;
+
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != count_clusters);
+#endif
/*
* We need to make sure we don't reuse the freed block until after the
@@ -5069,42 +6636,32 @@ do_more:
new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
- /* need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count,
- NULL);
- if (err && err != -EOPNOTSUPP)
+ err = ext4_issue_discard(sb, block_group, bit,
+ count_clusters);
+ /*
+ * Ignore EOPNOTSUPP error. This is consistent with
+ * what happens when using journal.
+ */
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%lu failed"
+ " group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
- } else
- EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+ }
+
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
- ext4_free_group_clusters_set(sb, gdp, ret);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(count_clusters,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
/*
* on a bigalloc file system, defer the s_freeclusters_counter
* update to the caller (ext4_remove_space and friends) so they
@@ -5117,28 +6674,122 @@ do_more:
count_clusters);
}
- ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
if (overflow && !err) {
block += count;
count = overflow;
- put_bh(bitmap_bh);
+ ext4_mb_unload_buddy(&e4b);
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
-error_return:
- brelse(bitmap_bh);
+
+error_clean:
+ ext4_mb_unload_buddy(&e4b);
+error_out:
ext4_std_error(sb, err);
- return;
+}
+
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
+ * @flags: flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int overflow;
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
+ return;
+ }
+
+ might_sleep();
+
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ return;
+ }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
+
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
+ }
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block_nonatomic(inode->i_sb,
+ block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
}
/**
@@ -5153,23 +6804,19 @@ error_return:
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count)
{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
- unsigned int i;
- struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int err = 0, ret, free_clusters_count;
- ext4_grpblk_t clusters_freed;
+ int err = 0;
ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
unsigned long cluster_count = last_cluster - first_cluster + 1;
+ ext4_grpblk_t changed;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
- if (count == 0)
+ if (cluster_count == 0)
return 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -5181,102 +6828,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_warning(sb, "too many blocks added to group %u",
block_group);
err = -EINVAL;
- goto error_return;
+ goto error_out;
}
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
- }
-
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc) {
- err = -EIO;
- goto error_return;
- }
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_out;
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
err = -EINVAL;
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
-
- for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- clusters_freed++;
- }
- }
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
+ &changed);
+ if (err && changed == 0)
+ goto error_clean;
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
+ if (changed != cluster_count)
+ ext4_error(sb, "bit already cleared in group %u", block_group);
- /*
- * need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
mb_free_blocks(NULL, &e4b, bit, cluster_count);
- free_clusters_count = clusters_freed +
- ext4_free_group_clusters(sb, desc);
- ext4_free_group_clusters_set(sb, desc, free_clusters_count);
- ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- clusters_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(clusters_freed,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
+ changed);
+error_clean:
ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
-error_return:
- brelse(bitmap_bh);
+error_out:
ext4_std_error(sb, err);
return err;
}
@@ -5286,19 +6870,19 @@ error_return:
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
* @count: number of blocks to TRIM
- * @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
*
* Trim "count" blocks starting at "start" in the "group". To assure that no
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+ int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
struct ext4_free_extent ex;
+ ext4_group_t group = e4b->bd_group;
int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -5315,12 +6899,92 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count, NULL);
+ ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
}
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ unsigned long nr_clusters_in_group;
+
+ if (grp < (ext4_get_groups_count(sb) - 1))
+ nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
+ else
+ nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp))
+ >> EXT4_CLUSTER_BITS(sb);
+
+ return nr_clusters_in_group - 1;
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
+__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
+__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
+{
+ ext4_grpblk_t next, count, free_count, last, origin_start;
+ bool set_trimmed = false;
+ void *bitmap;
+
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return 0;
+
+ last = ext4_last_grp_cluster(sb, e4b->bd_group);
+ bitmap = e4b->bd_bitmap;
+ if (start == 0 && max >= last)
+ set_trimmed = true;
+ origin_start = start;
+ start = max(e4b->bd_info->bb_first_free, start);
+ count = 0;
+ free_count = 0;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
+ break;
+
+ next = mb_find_next_bit(bitmap, last + 1, start);
+ if (origin_start == 0 && next >= last)
+ set_trimmed = true;
+
+ if ((next - start) >= minblocks) {
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
+ if (ret && ret != -EOPNOTSUPP)
+ return count;
+ count += next - start;
+ }
+ free_count += next - start;
+ start = next + 1;
+
+ if (ext4_trim_interrupted())
+ return count;
+
+ if (need_resched()) {
+ ext4_unlock_group(sb, e4b->bd_group);
+ cond_resched();
+ ext4_lock_group(sb, e4b->bd_group);
+ }
+
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
+ break;
+ }
+
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
+ return count;
+}
+
/**
* ext4_trim_all_free -- function to trim all free space in alloc. group
* @sb: super block for file system
@@ -5329,25 +6993,18 @@ __acquires(bitlock)
* @max: last group block to examine
* @minblocks: minimum extent block count
*
- * ext4_trim_all_free walks through group's buddy bitmap searching for free
- * extents. When the free block is found, ext4_trim_extent is called to TRIM
- * the extent.
- *
- *
* ext4_trim_all_free walks through group's block bitmap searching for free
* extents. When the free extent is found, mark it as used in group buddy
* bitmap. Then issue a TRIM command on this extent and free the extent in
- * the group buddy bitmap. This is done until whole group is scanned.
+ * the group buddy bitmap.
*/
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
ext4_grpblk_t minblocks)
{
- void *bitmap;
- ext4_grpblk_t next, count = 0, free_count = 0;
struct ext4_buddy e4b;
- int ret = 0;
+ int ret;
trace_ext4_trim_all_free(sb, group, start, max);
@@ -5357,58 +7014,20 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ret, group);
return ret;
}
- bitmap = e4b.bd_bitmap;
ext4_lock_group(sb, group);
- if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
- minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
- goto out;
-
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
-
- while (start <= max) {
- start = mb_find_next_zero_bit(bitmap, max + 1, start);
- if (start > max)
- break;
- next = mb_find_next_bit(bitmap, max + 1, start);
-
- if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
- if (ret && ret != -EOPNOTSUPP)
- break;
- ret = 0;
- count += next - start;
- }
- free_count += next - start;
- start = next + 1;
-
- if (fatal_signal_pending(current)) {
- count = -ERESTARTSYS;
- break;
- }
-
- if (need_resched()) {
- ext4_unlock_group(sb, group);
- cond_resched();
- ext4_lock_group(sb, group);
- }
- if ((e4b.bd_info->bb_free - free_count) < minblocks)
- break;
- }
+ if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
+ ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+ else
+ ret = 0;
- if (!ret) {
- ret = count;
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
- }
-out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
- count, group);
+ ret, group);
return ret;
}
@@ -5427,6 +7046,7 @@ out:
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
+ unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -5445,7 +7065,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start >= max_blks ||
range->len < sb->s_blocksize)
return -EINVAL;
- if (end >= max_blks)
+ /* No point to try to trim less than discard granularity */
+ if (range->minlen < discard_granularity) {
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ discard_granularity >> sb->s_blocksize_bits);
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
+ goto out;
+ }
+ if (end >= max_blks - 1)
end = max_blks - 1;
if (end <= first_data_blk)
goto out;
@@ -5462,7 +7089,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
+ if (ext4_trim_interrupted())
+ break;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ continue;
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
ret = ext4_mb_init_group(sb, group, GFP_NOFS);
@@ -5478,10 +7109,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
*/
if (group == last_group)
end = last_cluster;
-
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
@@ -5497,7 +7127,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
if (!ret)
- atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+ EXT4_SB(sb)->s_last_trim_minblks = minlen;
out:
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
@@ -5509,13 +7139,14 @@ int
ext4_mballoc_query_range(
struct super_block *sb,
ext4_group_t group,
- ext4_grpblk_t start,
+ ext4_grpblk_t first,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv)
{
void *bitmap;
- ext4_grpblk_t next;
+ ext4_grpblk_t start, next;
struct ext4_buddy e4b;
int error;
@@ -5526,11 +7157,19 @@ ext4_mballoc_query_range(
ext4_lock_group(sb, group);
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ start = max(e4b.bd_info->bb_first_free, first);
if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
-
+ if (meta_formatter && start != first) {
+ if (start > end)
+ start = end;
+ ext4_unlock_group(sb, group);
+ error = meta_formatter(sb, group, first, start - first,
+ priv);
+ if (error)
+ goto out_unload;
+ ext4_lock_group(sb, group);
+ }
while (start <= end) {
start = mb_find_next_zero_bit(bitmap, end + 1, start);
if (start > end)
@@ -5552,3 +7191,7 @@ out_unload:
return error;
}
+
+#ifdef CONFIG_EXT4_KUNIT_TESTS
+#include "mballoc-test.c"
+#endif
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6b4d17c2935d..15a049f05d04 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -49,7 +49,7 @@
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
- * with 'ext4_mb_stats' allocator will collect stats that will be
+ * with 's_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
#define MB_DEFAULT_STATS 0
@@ -59,7 +59,7 @@
* by the stream allocator, which purpose is to pack requests
* as close each to other as possible to produce smooth I/O traffic
* We use locality group prealloc space for stream request.
- * We can tune the same via /proc/fs/ext4/<parition>/stream_req
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
*/
#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
@@ -73,6 +73,29 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
+/*
+ * Number of groups to search linearly before performing group scanning
+ * optimization.
+ */
+#define MB_DEFAULT_LINEAR_LIMIT 4
+
+/*
+ * Minimum number of groups that should be present in the file system to perform
+ * group scanning optimizations.
+ */
+#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
+
+/*
+ * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular
+ * allocation request. Example, if we have an order 7 request and max trim order
+ * of 3, we can trim this request upto order 4.
+ */
+#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3
+
+/*
+ * Number of valid buddy orders
+ */
+#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2)
struct ext4_free_data {
/* this links the free block information from sb_info */
@@ -93,7 +116,10 @@ struct ext4_free_data {
};
struct ext4_prealloc_space {
- struct list_head pa_inode_list;
+ union {
+ struct rb_node inode_node; /* for inode PA rbtree */
+ struct list_head lg_list; /* for lg PAs */
+ } pa_node;
struct list_head pa_group_list;
union {
struct list_head pa_tmp_list;
@@ -107,8 +133,11 @@ struct ext4_prealloc_space {
ext4_grpblk_t pa_len; /* len of preallocated chunk */
ext4_grpblk_t pa_free; /* how many blocks are free */
unsigned short pa_type; /* pa type. inode or group */
- spinlock_t *pa_obj_lock;
- struct inode *pa_inode; /* hack, for history only */
+ union {
+ rwlock_t *inode_lock; /* locks the rbtree holding this PA */
+ spinlock_t *lg_lock; /* locks the lg list holding this PA */
+ } pa_node_lock;
+ struct inode *pa_inode; /* used to get the inode during group discard */
};
enum {
@@ -157,18 +186,33 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
+ /*
+ * goal len can change in CR_BEST_AVAIL_LEN, so save the original len.
+ * This is used while adjusting the PA window and for accounting.
+ */
+ ext4_grpblk_t ac_orig_goal_len;
+
+ ext4_group_t ac_prefetch_grp;
+ unsigned int ac_prefetch_ios;
+ unsigned int ac_prefetch_nr;
+
+ int ac_first_err;
+
+ __u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
__u16 ac_found;
+ __u16 ac_cX_found[EXT4_MB_NUM_CRS];
__u16 ac_tail;
__u16 ac_buddy;
- __u16 ac_flags; /* allocation hints */
__u8 ac_status;
__u8 ac_criteria;
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
- struct page *ac_bitmap_page;
- struct page *ac_buddy_page;
+
+ struct ext4_buddy *ac_e4b;
+ struct folio *ac_bitmap_folio;
+ struct folio *ac_buddy_folio;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
@@ -178,9 +222,9 @@ struct ext4_allocation_context {
#define AC_STATUS_BREAK 3
struct ext4_buddy {
- struct page *bd_buddy_page;
+ struct folio *bd_buddy_folio;
void *bd_buddy;
- struct page *bd_bitmap_page;
+ struct folio *bd_bitmap_folio;
void *bd_bitmap;
struct ext4_group_info *bd_info;
struct super_block *bd_sb;
@@ -195,6 +239,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
+static inline loff_t extent_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_free_extent *fex)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len);
+}
+
+static inline loff_t pa_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_prealloc_space *pa)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
+}
+
typedef int (*ext4_mballoc_query_range_fn)(
struct super_block *sb,
ext4_group_t agno,
@@ -208,6 +266,7 @@ ext4_mballoc_query_range(
ext4_group_t agno,
ext4_grpblk_t start,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index c5e3fc998211..1b0dfd963d3f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -32,19 +32,18 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- /* Locking only for convinience since we are operating on temp inode */
+ /* Locking only for convenience since we are operating on temp inode */
down_write(&EXT4_I(inode)->i_data_sem);
path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
- path = NULL;
goto err_out;
}
/*
* Calculate the credit needed to inserting this extent
- * Since we are doing this in loop we may accumalate extra
- * credit. But below we try to not accumalate too much
+ * Since we are doing this in loop we may accumulate extra
+ * credit. But below we try to not accumulate too much
* of them by restarting the journal.
*/
needed = ext4_ext_calc_credits_for_single_extent(inode,
@@ -53,11 +52,12 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
if (retval < 0)
goto err_out;
- retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
+ path = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+ if (IS_ERR(path))
+ retval = PTR_ERR(path);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
lb->first_pblock = 0;
return retval;
}
@@ -409,7 +409,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
int ext4_ext_migrate(struct inode *inode)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
@@ -417,15 +416,17 @@ int ext4_ext_migrate(struct inode *inode)
struct inode *tmp_inode = NULL;
struct migrate_struct lb;
unsigned long max_entries;
- __u32 goal;
+ __u32 goal, tmp_csum_seed;
uid_t owner[2];
+ int alloc_ctx;
/*
* If the filesystem does not support extents, or the inode
* already is extent-based, error out.
*/
if (!ext4_has_feature_extents(inode->i_sb) ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ ext4_has_inline_data(inode))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -434,15 +435,15 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
- percpu_down_write(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
/*
- * Worst case we can touch the allocation bitmaps, a bgd
- * block, and a block to link in the orphan list. We do need
- * need to worry about credits for modifying the quota inode.
+ * Worst case we can touch the allocation bitmaps and a block
+ * group descriptor block. We do need to worry about
+ * credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
- 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+ 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
@@ -459,6 +460,14 @@ int ext4_ext_migrate(struct inode *inode)
ext4_journal_stop(handle);
goto out_unlock;
}
+ /*
+ * Use the correct seed for checksum (i.e. the seed from 'inode'). This
+ * is so that the metadata blocks will have the correct checksum after
+ * the migration.
+ */
+ ei = EXT4_I(inode);
+ tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
+ EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
* Set the i_nlink to zero so it will be deleted later
@@ -467,7 +476,6 @@ int ext4_ext_migrate(struct inode *inode)
clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
- ext4_orphan_add(handle, tmp_inode);
ext4_journal_stop(handle);
/*
@@ -479,7 +487,7 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block
+ * Even though we take i_rwsem we can still cause block
* allocation via mmap write to holes. If we have allocated
* new blocks we fail migrate. New block allocation will
* clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
@@ -492,17 +500,10 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
- /*
- * It is impossible to update on-disk structures without
- * a handle, so just rollback in-core changes and live other
- * work to orphan_list_cleanup()
- */
- ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
goto out_tmp_inode;
}
- ei = EXT4_I(inode);
i_data = ei->i_data;
memset(&lb, 0, sizeof(lb));
@@ -576,6 +577,7 @@ err_out:
* the inode is not visible to user space.
*/
tmp_inode->i_blocks = 0;
+ EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
@@ -585,7 +587,7 @@ out_tmp_inode:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
out_unlock:
- percpu_up_write(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
return retval;
}
@@ -604,6 +606,7 @@ int ext4_ind_migrate(struct inode *inode)
ext4_fsblk_t blk;
handle_t *handle;
int ret, ret2 = 0;
+ int alloc_ctx;
if (!ext4_has_feature_extents(inode->i_sb) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
@@ -620,7 +623,7 @@ int ext4_ind_migrate(struct inode *inode)
if (test_opt(inode->i_sb, DELALLOC))
ext4_alloc_da_blocks(inode);
- percpu_down_write(&sbi->s_writepages_rwsem);
+ alloc_ctx = ext4_writepages_down_write(inode->i_sb);
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
@@ -661,9 +664,9 @@ int ext4_ind_migrate(struct inode *inode)
if (unlikely(ret2 && !ret))
ret = ret2;
errout:
- ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
out_unlock:
- percpu_up_write(&sbi->s_writepages_rwsem);
+ ext4_writepages_up_write(inode->i_sb, alloc_ctx);
return ret;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d34cb8c46655..6f57c181ff77 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -14,14 +14,14 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
int offset = offsetof(struct mmp_struct, mmp_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+ csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset);
return cpu_to_le32(csum);
}
static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
@@ -39,28 +39,32 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
* Write the MMP block using REQ_SYNC to try to get the block on-disk
* faster.
*/
-static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+static int write_mmp_block_thawed(struct super_block *sb,
+ struct buffer_head *bh)
{
struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
- /*
- * We protect against freezing so that we don't create dirty buffers
- * on frozen filesystem.
- */
- sb_start_write(sb);
ext4_mmp_csum_set(sb, mmp);
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
- sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
- return 1;
-
+ return -EIO;
return 0;
}
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+{
+ /*
+ * We protect against freezing so that we don't create dirty buffers
+ * on frozen filesystem.
+ */
+ scoped_guard(super_write, sb)
+ return write_mmp_block_thawed(sb, bh);
+}
+
/*
* Read the MMP block. It _must_ be read from disk and hence we clear the
* uptodate flag on the buffer.
@@ -85,15 +89,11 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
}
- get_bh(*bh);
lock_buffer(*bh);
- (*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
- ret = -EIO;
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false);
+ if (ret)
goto warn_exit;
- }
+
mmp = (struct mmp_struct *)((*bh)->b_data);
if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
ret = -EFSCORRUPTED;
@@ -131,9 +131,9 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
*/
static int kmmpd(void *data)
{
- struct super_block *sb = ((struct mmpd_data *) data)->sb;
- struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+ struct super_block *sb = data;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
struct mmp_struct *mmp;
ext4_fsblk_t mmp_block;
u32 seq = 0;
@@ -142,7 +142,7 @@ static int kmmpd(void *data)
unsigned mmp_check_interval;
unsigned long last_update_time;
unsigned long diff;
- int retval;
+ int retval = 0;
mmp_block = le64_to_cpu(es->s_mmp_block);
mmp = (struct mmp_struct *)(bh->b_data);
@@ -154,13 +154,16 @@ static int kmmpd(void *data)
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
- BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
- bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop()) {
+ while (!kthread_should_stop() && !ext4_emergency_state(sb)) {
+ if (!ext4_has_feature_mmp(sb)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ goto wait_to_exit;
+ }
if (++seq > EXT4_MMP_SEQ_MAX)
seq = 1;
@@ -181,16 +184,6 @@ static int kmmpd(void *data)
failed_writes++;
}
- if (!(le32_to_cpu(es->s_feature_incompat) &
- EXT4_FEATURE_INCOMPAT_MMP)) {
- ext4_warning(sb, "kmmpd being stopped since MMP feature"
- " has been disabled.");
- goto exit_thread;
- }
-
- if (sb_rdonly(sb))
- break;
-
diff = jiffies - last_update_time;
if (diff < mmp_update_interval * HZ)
schedule_timeout_interruptible(mmp_update_interval *
@@ -211,7 +204,7 @@ static int kmmpd(void *data)
ext4_error_err(sb, -retval,
"error reading MMP data: %d",
retval);
- goto exit_thread;
+ goto wait_to_exit;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -225,7 +218,7 @@ static int kmmpd(void *data)
ext4_error_err(sb, EBUSY, "abort");
put_bh(bh_check);
retval = -EBUSY;
- goto exit_thread;
+ goto wait_to_exit;
}
put_bh(bh_check);
}
@@ -234,9 +227,9 @@ static int kmmpd(void *data)
* Adjust the mmp_check_interval depending on how much time
* it took for the MMP block to be written.
*/
- mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
- EXT4_MMP_MAX_CHECK_INTERVAL),
- EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ,
+ EXT4_MMP_MIN_CHECK_INTERVAL,
+ EXT4_MMP_MAX_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
}
@@ -248,26 +241,32 @@ static int kmmpd(void *data)
retval = write_mmp_block(sb, bh);
-exit_thread:
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(data);
- brelse(bh);
+wait_to_exit:
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop())
+ schedule();
+ }
+ set_current_state(TASK_RUNNING);
return retval;
}
+void ext4_stop_mmpd(struct ext4_sb_info *sbi)
+{
+ if (sbi->s_mmp_tsk) {
+ kthread_stop(sbi->s_mmp_tsk);
+ brelse(sbi->s_mmp_bh);
+ sbi->s_mmp_tsk = NULL;
+ }
+}
+
/*
* Get a random new sequence number but make sure it is not greater than
* EXT4_MMP_SEQ_MAX.
*/
static unsigned int mmp_new_seq(void)
{
- u32 new_seq;
-
- do {
- new_seq = prandom_u32();
- } while (new_seq > EXT4_MMP_SEQ_MAX);
-
- return new_seq;
+ return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1);
}
/*
@@ -279,7 +278,6 @@ int ext4_multi_mount_protect(struct super_block *sb,
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = NULL;
struct mmp_struct *mmp = NULL;
- struct mmpd_data *mmpd_data;
u32 seq;
unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned int wait_time = 0;
@@ -288,6 +286,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
mmp_block >= ext4_blocks_count(es)) {
ext4_warning(sb, "Invalid MMP block in superblock");
+ retval = -EINVAL;
goto failed;
}
@@ -313,6 +312,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (seq == EXT4_MMP_SEQ_FSCK) {
dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+ retval = -EBUSY;
goto failed;
}
@@ -326,6 +326,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ retval = -ETIMEDOUT;
goto failed;
}
@@ -336,6 +337,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
+ retval = -EBUSY;
goto failed;
}
@@ -346,7 +348,11 @@ skip:
seq = mmp_new_seq();
mmp->mmp_seq = cpu_to_le32(seq);
- retval = write_mmp_block(sb, bh);
+ /*
+ * On mount / remount we are protected against fs freezing (by s_umount
+ * semaphore) and grabbing freeze protection upsets lockdep
+ */
+ retval = write_mmp_block_thawed(sb, bh);
if (retval)
goto failed;
@@ -355,6 +361,7 @@ skip:
*/
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount");
+ retval = -ETIMEDOUT;
goto failed;
}
@@ -365,29 +372,27 @@ skip:
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
+ retval = -EBUSY;
goto failed;
}
- mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL);
- if (!mmpd_data) {
- ext4_warning(sb, "not enough memory for mmpd_data");
- goto failed;
- }
- mmpd_data->sb = sb;
- mmpd_data->bh = bh;
+ EXT4_SB(sb)->s_mmp_bh = bh;
+
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
+ snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname),
+ "%pg", bh->b_bdev);
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
(int)sizeof(mmp->mmp_bdevname),
- bdevname(bh->b_bdev,
- mmp->mmp_bdevname));
+ mmp->mmp_bdevname);
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(mmpd_data);
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
+ retval = -ENOMEM;
goto failed;
}
@@ -395,7 +400,5 @@ skip:
failed:
brelse(bh);
- return 1;
+ return retval;
}
-
-
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1ed86fb6c302..0550fd30fd10 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -8,37 +8,19 @@
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
-/**
- * get_ext_path() - Find an extent path for designated logical block number.
- * @inode: inode to be searched
- * @lblock: logical block number to find an extent path
- * @ppath: pointer to an extent path pointer (for output)
- *
- * ext4_find_extent wrapper. Return 0 on success, or a negative error value
- * on failure.
- */
-static inline int
-get_ext_path(struct inode *inode, ext4_lblk_t lblock,
- struct ext4_ext_path **ppath)
-{
- struct ext4_ext_path *path;
-
- path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
- if (IS_ERR(path))
- return PTR_ERR(path);
- if (path[ext_depth(inode)].p_ext == NULL) {
- ext4_ext_drop_refs(path);
- kfree(path);
- *ppath = NULL;
- return -ENODATA;
- }
- *ppath = path;
- return 0;
-}
+#include <trace/events/ext4.h>
+
+struct mext_data {
+ struct inode *orig_inode; /* Origin file inode */
+ struct inode *donor_inode; /* Donor file inode */
+ struct ext4_map_blocks orig_map;/* Origin file's move mapping */
+ ext4_lblk_t donor_lblk; /* Start block of the donor file */
+};
/**
* ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem
@@ -56,7 +38,6 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
} else {
down_write(&EXT4_I(second)->i_data_sem);
down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
-
}
}
@@ -75,59 +56,14 @@ ext4_double_up_write_data_sem(struct inode *orig_inode,
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
-/**
- * mext_check_coverage - Check that all extents in range has the same type
- *
- * @inode: inode in question
- * @from: block offset of inode
- * @count: block count to be checked
- * @unwritten: extents expected to be unwritten
- * @err: pointer to save error value
- *
- * Return 1 if all extents in range has expected type, and zero otherwise.
- */
-static int
-mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
- int unwritten, int *err)
-{
- struct ext4_ext_path *path = NULL;
- struct ext4_extent *ext;
- int ret = 0;
- ext4_lblk_t last = from + count;
- while (from < last) {
- *err = get_ext_path(inode, from, &path);
- if (*err)
- goto out;
- ext = path[ext_depth(inode)].p_ext;
- if (unwritten != ext4_ext_is_unwritten(ext))
- goto out;
- from += ext4_ext_get_actual_len(ext);
- ext4_ext_drop_refs(path);
- }
- ret = 1;
-out:
- ext4_ext_drop_refs(path);
- kfree(path);
- return ret;
-}
-
-/**
- * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
- *
- * @inode1: the inode structure
- * @inode2: the inode structure
- * @index1: page index
- * @index2: page index
- * @page: result page vector
- *
- * Grab two locked pages for inode's by inode order
- */
-static int
-mext_page_double_lock(struct inode *inode1, struct inode *inode2,
- pgoff_t index1, pgoff_t index2, struct page *page[2])
+/* Grab and lock folio on both @inode1 and @inode2 by inode order. */
+static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index1, pgoff_t index2, size_t len,
+ struct folio *folio[2])
{
struct address_space *mapping[2];
- unsigned fl = AOP_FLAG_NOFS;
+ unsigned int flags;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
BUG_ON(!inode1 || !inode2);
if (inode1 < inode2) {
@@ -139,357 +75,425 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
mapping[1] = inode1->i_mapping;
}
- page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
- if (!page[0])
- return -ENOMEM;
+ flags = memalloc_nofs_save();
+ fgp_flags |= fgf_set_order(len);
+ folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags,
+ mapping_gfp_mask(mapping[0]));
+ if (IS_ERR(folio[0])) {
+ memalloc_nofs_restore(flags);
+ return PTR_ERR(folio[0]);
+ }
- page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
- if (!page[1]) {
- unlock_page(page[0]);
- put_page(page[0]);
- return -ENOMEM;
+ folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags,
+ mapping_gfp_mask(mapping[1]));
+ memalloc_nofs_restore(flags);
+ if (IS_ERR(folio[1])) {
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
+ return PTR_ERR(folio[1]);
}
/*
- * grab_cache_page_write_begin() may not wait on page's writeback if
+ * __filemap_get_folio() may not wait on folio's writeback if
* BDI not demand that. But it is reasonable to be very conservative
- * here and explicitly wait on page's writeback
+ * here and explicitly wait on folio's writeback
*/
- wait_on_page_writeback(page[0]);
- wait_on_page_writeback(page[1]);
+ folio_wait_writeback(folio[0]);
+ folio_wait_writeback(folio[1]);
if (inode1 > inode2)
- swap(page[0], page[1]);
+ swap(folio[0], folio[1]);
return 0;
}
-/* Force page buffers uptodate w/o dropping page's lock */
-static int
-mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+static void mext_folio_double_unlock(struct folio *folio[2])
+{
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
+ folio_unlock(folio[1]);
+ folio_put(folio[1]);
+}
+
+/* Force folio buffers uptodate w/o dropping folio's lock */
+static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
sector_t block;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ struct buffer_head *bh, *head;
unsigned int blocksize, block_start, block_end;
- int i, err, nr = 0, partial = 0;
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
+ int nr = 0;
+ bool partial = false;
- if (PageUptodate(page))
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
+
+ if (folio_test_uptodate(folio))
return 0;
blocksize = i_blocksize(inode);
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
-
- head = page_buffers(page);
- block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits);
- for (bh = head, block_start = 0; bh != head || !block_start;
- block++, block_start = block_end, bh = bh->b_this_page) {
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+
+ block = folio_pos(folio) >> inode->i_blkbits;
+ block_end = 0;
+ bh = head;
+ do {
+ block_start = block_end;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
- partial = 1;
+ partial = true;
continue;
}
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
- err = ext4_get_block(inode, block, bh, 0);
- if (err) {
- SetPageError(page);
+ int err = ext4_get_block(inode, block, bh, 0);
+ if (err)
return err;
- }
if (!buffer_mapped(bh)) {
- zero_user(page, block_start, blocksize);
+ folio_zero_range(folio, block_start, blocksize);
set_buffer_uptodate(bh);
continue;
}
}
- BUG_ON(nr >= MAX_BUF_PER_PAGE);
- arr[nr++] = bh;
- }
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+ ext4_read_bh_nowait(bh, 0, NULL, false);
+ nr++;
+ } while (block++, (bh = bh->b_this_page) != head);
+
/* No io required */
if (!nr)
goto out;
- for (i = 0; i < nr; i++) {
- bh = arr[i];
- if (!bh_uptodate_or_lock(bh)) {
- err = bh_submit_read(bh);
- if (err)
- return err;
- }
- }
+ bh = head;
+ do {
+ if (bh_offset(bh) + blocksize <= from)
+ continue;
+ if (bh_offset(bh) >= to)
+ break;
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ continue;
+ return -EIO;
+ } while ((bh = bh->b_this_page) != head);
out:
if (!partial)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
return 0;
}
-/**
- * move_extent_per_page - Move extent data per page
- *
- * @o_filp: file structure of original file
- * @donor_inode: donor inode
- * @orig_page_offset: page index on original file
- * @donor_page_offset: page index on donor file
- * @data_offset_in_page: block index where data swapping starts
- * @block_len_in_page: the number of blocks to be swapped
- * @unwritten: orig extent is unwritten or not
- * @err: pointer to save return value
- *
- * Save the data in original inode blocks and replace original inode extents
- * with donor inode extents by calling ext4_swap_extents().
- * Finally, write out the saved data in new original inode blocks. Return
- * replaced block count.
+enum mext_move_type {MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, MEXT_COPY_DATA};
+
+/*
+ * Start to move extent between the origin inode and the donor inode,
+ * hold one folio for each inode and check the candidate moving extent
+ * mapping status again.
*/
-static int
-move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
- pgoff_t orig_page_offset, pgoff_t donor_page_offset,
- int data_offset_in_page,
- int block_len_in_page, int unwritten, int *err)
+static int mext_move_begin(struct mext_data *mext, struct folio *folio[2],
+ enum mext_move_type *move_type)
{
- struct inode *orig_inode = file_inode(o_filp);
- struct page *pagep[2] = {NULL, NULL};
- handle_t *handle;
- ext4_lblk_t orig_blk_offset, donor_blk_offset;
- unsigned long blocksize = orig_inode->i_sb->s_blocksize;
- unsigned int tmp_data_size, data_size, replaced_size;
- int i, err2, jblocks, retries = 0;
- int replaced_count = 0;
- int from = data_offset_in_page << orig_inode->i_blkbits;
- int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
- struct super_block *sb = orig_inode->i_sb;
- struct buffer_head *bh = NULL;
+ struct inode *orig_inode = mext->orig_inode;
+ struct inode *donor_inode = mext->donor_inode;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ struct ext4_map_blocks donor_map = {0};
+ loff_t orig_pos, donor_pos;
+ size_t move_len;
+ int ret;
+
+ orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits;
+ donor_pos = ((loff_t)mext->donor_lblk) << blkbits;
+ ret = mext_folio_double_lock(orig_inode, donor_inode,
+ orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT,
+ ((size_t)mext->orig_map.m_len) << blkbits, folio);
+ if (ret)
+ return ret;
/*
- * It needs twice the amount of ordinary journal buffers because
- * inode and donor_inode may change each different metadata blocks.
+ * Check the origin inode's mapping information again under the
+ * folio lock, as we do not hold the i_data_sem at all times, and
+ * it may change during the concurrent write-back operation.
*/
-again:
- *err = 0;
- jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
- handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+ if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) {
+ ret = -ESTALE;
+ goto error;
+ }
+
+ /* Adjust the moving length according to the length of shorter folio. */
+ move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos,
+ folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos);
+ move_len >>= blkbits;
+ if (move_len < mext->orig_map.m_len)
+ mext->orig_map.m_len = move_len;
+
+ donor_map.m_lblk = mext->donor_lblk;
+ donor_map.m_len = mext->orig_map.m_len;
+ donor_map.m_flags = 0;
+ ret = ext4_map_blocks(NULL, donor_inode, &donor_map, 0);
+ if (ret < 0)
+ goto error;
+
+ /* Adjust the moving length according to the donor mapping length. */
+ mext->orig_map.m_len = donor_map.m_len;
+
+ /* Skip moving if the donor range is a hole or a delalloc extent. */
+ if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)))
+ *move_type = MEXT_SKIP_EXTENT;
+ /* If both mapping ranges are unwritten, no need to copy data. */
+ else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) &&
+ (donor_map.m_flags & EXT4_MAP_UNWRITTEN))
+ *move_type = MEXT_MOVE_EXTENT;
+ else
+ *move_type = MEXT_COPY_DATA;
+
+ return 0;
+error:
+ mext_folio_double_unlock(folio);
+ return ret;
+}
+
+/*
+ * Re-create the new moved mapping buffers of the original inode and commit
+ * the entire written range.
+ */
+static int mext_folio_mkwrite(struct inode *inode, struct folio *folio,
+ size_t from, size_t to)
+{
+ unsigned int blocksize = i_blocksize(inode);
+ struct buffer_head *bh, *head;
+ size_t block_start, block_end;
+ sector_t block;
+ int ret;
+
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+
+ block = folio_pos(folio) >> inode->i_blkbits;
+ block_end = 0;
+ bh = head;
+ do {
+ block_start = block_end;
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to)
+ continue;
+
+ ret = ext4_get_block(inode, block, bh, 0);
+ if (ret)
+ return ret;
+ } while (block++, (bh = bh->b_this_page) != head);
+
+ block_commit_write(folio, from, to);
+ return 0;
+}
+
+/*
+ * Save the data in original inode extent blocks and replace one folio size
+ * aligned original inode extent with one or one partial donor inode extent,
+ * and then write out the saved data in new original inode blocks. Pass out
+ * the replaced block count through m_len. Return 0 on success, and an error
+ * code otherwise.
+ */
+static int mext_move_extent(struct mext_data *mext, u64 *m_len)
+{
+ struct inode *orig_inode = mext->orig_inode;
+ struct inode *donor_inode = mext->donor_inode;
+ struct ext4_map_blocks *orig_map = &mext->orig_map;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ struct folio *folio[2] = {NULL, NULL};
+ loff_t from, length;
+ enum mext_move_type move_type = 0;
+ handle_t *handle;
+ u64 r_len = 0;
+ unsigned int credits;
+ int ret, ret2;
+
+ *m_len = 0;
+ trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode,
+ mext->donor_lblk);
+ credits = ext4_chunk_trans_extent(orig_inode, 0) * 2;
+ handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits);
if (IS_ERR(handle)) {
- *err = PTR_ERR(handle);
- return 0;
+ ret = PTR_ERR(handle);
+ goto out;
}
- orig_blk_offset = orig_page_offset * blocks_per_page +
- data_offset_in_page;
-
- donor_blk_offset = donor_page_offset * blocks_per_page +
- data_offset_in_page;
-
- /* Calculate data_size */
- if ((orig_blk_offset + block_len_in_page - 1) ==
- ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
- /* Replace the last block */
- tmp_data_size = orig_inode->i_size & (blocksize - 1);
- /*
- * If data_size equal zero, it shows data_size is multiples of
- * blocksize. So we set appropriate value.
- */
- if (tmp_data_size == 0)
- tmp_data_size = blocksize;
-
- data_size = tmp_data_size +
- ((block_len_in_page - 1) << orig_inode->i_blkbits);
- } else
- data_size = block_len_in_page << orig_inode->i_blkbits;
-
- replaced_size = data_size;
-
- *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
- donor_page_offset, pagep);
- if (unlikely(*err < 0))
- goto stop_journal;
+ ret = mext_move_begin(mext, folio, &move_type);
+ if (ret)
+ goto stop_handle;
+
+ if (move_type == MEXT_SKIP_EXTENT)
+ goto unlock;
+
/*
- * If orig extent was unwritten it can become initialized
- * at any time after i_data_sem was dropped, in order to
- * serialize with delalloc we have recheck extent while we
- * hold page's lock, if it is still the case data copy is not
- * necessary, just swap data blocks between orig and donor.
+ * Copy the data. First, read the original inode data into the page
+ * cache. Then, release the existing mapping relationships and swap
+ * the extent. Finally, re-establish the new mapping relationships
+ * and dirty the page cache.
*/
- if (unwritten) {
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- /* If any of extents in range became initialized we have to
- * fallback to data copying */
- unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
- block_len_in_page, 1, err);
- if (*err)
- goto drop_data_sem;
-
- unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
- block_len_in_page, 1, err);
- if (*err)
- goto drop_data_sem;
-
- if (!unwritten) {
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- goto data_copy;
- }
- if ((page_has_private(pagep[0]) &&
- !try_to_release_page(pagep[0], 0)) ||
- (page_has_private(pagep[1]) &&
- !try_to_release_page(pagep[1], 0))) {
- *err = -EBUSY;
- goto drop_data_sem;
- }
- replaced_count = ext4_swap_extents(handle, orig_inode,
- donor_inode, orig_blk_offset,
- donor_blk_offset,
- block_len_in_page, 1, err);
- drop_data_sem:
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- goto unlock_pages;
+ if (move_type == MEXT_COPY_DATA) {
+ from = offset_in_folio(folio[0],
+ ((loff_t)orig_map->m_lblk) << blkbits);
+ length = ((loff_t)orig_map->m_len) << blkbits;
+
+ ret = mext_folio_mkuptodate(folio[0], from, from + length);
+ if (ret)
+ goto unlock;
}
-data_copy:
- *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
- if (*err)
- goto unlock_pages;
-
- /* At this point all buffers in range are uptodate, old mapping layout
- * is no longer required, try to drop it now. */
- if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
- (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
- *err = -EBUSY;
- goto unlock_pages;
+
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
+ ret = -EBUSY;
+ goto unlock;
}
+
+ /* Move extent */
ext4_double_down_write_data_sem(orig_inode, donor_inode);
- replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
- orig_blk_offset, donor_blk_offset,
- block_len_in_page, 1, err);
+ *m_len = ext4_swap_extents(handle, orig_inode, donor_inode,
+ orig_map->m_lblk, mext->donor_lblk,
+ orig_map->m_len, 1, &ret);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
- if (*err) {
- if (replaced_count) {
- block_len_in_page = replaced_count;
- replaced_size =
- block_len_in_page << orig_inode->i_blkbits;
- } else
- goto unlock_pages;
- }
- /* Perform all necessary steps similar write_begin()/write_end()
- * but keeping in mind that i_size will not change */
- if (!page_has_buffers(pagep[0]))
- create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
- bh = page_buffers(pagep[0]);
- for (i = 0; i < data_offset_in_page; i++)
- bh = bh->b_this_page;
- for (i = 0; i < block_len_in_page; i++) {
- *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
- if (*err < 0)
- break;
- bh = bh->b_this_page;
- }
- if (!*err)
- *err = block_commit_write(pagep[0], from, from + replaced_size);
- if (unlikely(*err < 0))
- goto repair_branches;
+ /* A short-length swap cannot occur after a successful swap extent. */
+ if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len)))
+ ret = -EIO;
- /* Even in case of data=writeback it is reasonable to pin
- * inode to transaction, to prevent unexpected data loss */
- *err = ext4_jbd2_inode_add_write(handle, orig_inode,
- (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
-
-unlock_pages:
- unlock_page(pagep[0]);
- put_page(pagep[0]);
- unlock_page(pagep[1]);
- put_page(pagep[1]);
-stop_journal:
- ext4_journal_stop(handle);
- if (*err == -ENOSPC &&
- ext4_should_retry_alloc(sb, &retries))
- goto again;
- /* Buffer was busy because probably is pinned to journal transaction,
- * force transaction commit may help to free it. */
- if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
- jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
- goto again;
- return replaced_count;
+ if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT))
+ goto unlock;
-repair_branches:
+ /* Copy data */
+ length = (*m_len) << blkbits;
+ ret2 = mext_folio_mkwrite(orig_inode, folio[0], from, from + length);
+ if (ret2) {
+ if (!ret)
+ ret = ret2;
+ goto repair_branches;
+ }
/*
- * This should never ever happen!
- * Extents are swapped already, but we are not able to copy data.
- * Try to swap extents to it's original places
+ * Even in case of data=writeback it is reasonable to pin
+ * inode to transaction, to prevent unexpected data loss.
*/
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
- orig_blk_offset, donor_blk_offset,
- block_len_in_page, 0, &err2);
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- if (replaced_count != block_len_in_page) {
- ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
- EIO, "Unable to copy data block,"
- " data will be lost.");
- *err = -EIO;
+ ret2 = ext4_jbd2_inode_add_write(handle, orig_inode,
+ ((loff_t)orig_map->m_lblk) << blkbits, length);
+ if (!ret)
+ ret = ret2;
+unlock:
+ mext_folio_double_unlock(folio);
+stop_handle:
+ ext4_journal_stop(handle);
+out:
+ trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode,
+ mext->donor_lblk, orig_map->m_len, *m_len,
+ move_type, ret);
+ return ret;
+
+repair_branches:
+ ret2 = 0;
+ r_len = ext4_swap_extents(handle, donor_inode, orig_inode,
+ mext->donor_lblk, orig_map->m_lblk,
+ *m_len, 0, &ret2);
+ if (ret2 || r_len != *m_len) {
+ ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk),
+ EIO, "Unable to copy data block, data will be lost!");
+ ret = -EIO;
}
- replaced_count = 0;
- goto unlock_pages;
+ *m_len = 0;
+ goto unlock;
}
-/**
- * mext_check_arguments - Check whether move extent can be done
- *
- * @orig_inode: original inode
- * @donor_inode: donor inode
- * @orig_start: logical start offset in block for orig
- * @donor_start: logical start offset in block for donor
- * @len: the number of blocks to be moved
- *
- * Check the arguments of ext4_move_extents() whether the files can be
- * exchanged with each other.
- * Return 0 on success, or a negative error value on failure.
+/*
+ * Check the validity of the basic filesystem environment and the
+ * inodes' support status.
*/
-static int
-mext_check_arguments(struct inode *orig_inode,
- struct inode *donor_inode, __u64 orig_start,
- __u64 donor_start, __u64 *len)
+static int mext_check_validity(struct inode *orig_inode,
+ struct inode *donor_inode)
{
- __u64 orig_eof, donor_eof;
- unsigned int blkbits = orig_inode->i_blkbits;
- unsigned int blocksize = 1 << blkbits;
+ struct super_block *sb = orig_inode->i_sb;
- orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
- donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+ /* origin and donor should be different inodes */
+ if (orig_inode == donor_inode) {
+ ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+ /* origin and donor should belone to the same filesystem */
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
- if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
- ext4_debug("ext4 move extent: suid or sgid is set"
- " to donor file [ino:orig %lu, donor %lu]\n",
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
- return -EPERM;
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
- /* Ext4 move extent does not support swapfile */
- if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be swapfile [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ if (IS_DAX(orig_inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with DAX");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * TODO: it's not obvious how to swap blocks for inodes with full
+ * journaling enabled.
+ */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with data journaling");
+ return -EOPNOTSUPP;
}
- if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be quota files [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported for encrypted files");
+ return -EOPNOTSUPP;
}
/* Ext4 move extent supports only extent based file */
- if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
- ext4_debug("ext4 move extent: orig file is not extents "
- "based file [ino:orig %lu]\n", orig_inode->i_ino);
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) ||
+ !(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported for non-extent files");
return -EOPNOTSUPP;
- } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
- ext4_debug("ext4 move extent: donor file is not extents "
- "based file [ino:donor %lu]\n", donor_inode->i_ino);
+ }
+
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) {
+ ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EPERM;
+ }
+
+ /* Ext4 move extent does not support swap files */
+ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -ETXTBSY;
+ }
+
+ if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
return -EOPNOTSUPP;
}
@@ -498,12 +502,25 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
+ return 0;
+}
+
+/*
+ * Check the moving range of ext4_move_extents() whether the files can be
+ * exchanged with each other, and adjust the length to fit within the file
+ * size. Return 0 on success, or a negative error value on failure.
+ */
+static int mext_check_adjust_range(struct inode *orig_inode,
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
+{
+ __u64 orig_eof, donor_eof;
+
/* Start offset should be same */
if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
(donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
- ext4_debug("ext4 move extent: orig and donor's start "
- "offsets are not aligned [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
+ ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -512,11 +529,14 @@ mext_check_arguments(struct inode *orig_inode,
(*len > EXT_MAX_BLOCKS) ||
(donor_start + *len >= EXT_MAX_BLOCKS) ||
(orig_start + *len >= EXT_MAX_BLOCKS)) {
- ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
- "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
- orig_inode->i_ino, donor_inode->i_ino);
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n",
+ EXT_MAX_BLOCKS,
+ orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
+
+ orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode));
+ donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode));
if (orig_eof <= orig_start)
*len = 0;
else if (orig_eof < orig_start + *len - 1)
@@ -526,9 +546,8 @@ mext_check_arguments(struct inode *orig_inode,
else if (donor_eof < donor_start + *len - 1)
*len = donor_eof - donor_start;
if (!*len) {
- ext4_debug("ext4 move extent: len should not be 0 "
- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
- donor_inode->i_ino);
+ ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -547,142 +566,82 @@ mext_check_arguments(struct inode *orig_inode,
*
* This function returns 0 and moved block length is set in moved_len
* if succeed, otherwise returns error value.
- *
*/
-int
-ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
- __u64 donor_blk, __u64 len, __u64 *moved_len)
+int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+ __u64 donor_blk, __u64 len, __u64 *moved_len)
{
struct inode *orig_inode = file_inode(o_filp);
struct inode *donor_inode = file_inode(d_filp);
- struct ext4_ext_path *path = NULL;
- int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
- ext4_lblk_t o_end, o_start = orig_blk;
- ext4_lblk_t d_start = donor_blk;
+ struct mext_data mext;
+ struct super_block *sb = orig_inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int retries = 0;
+ u64 m_len;
int ret;
- if (orig_inode->i_sb != donor_inode->i_sb) {
- ext4_debug("ext4 move extent: The argument files "
- "should be in same FS [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- /* orig and donor should be different inodes */
- if (orig_inode == donor_inode) {
- ext4_debug("ext4 move extent: The argument files should not "
- "be same inode [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- /* Regular file check */
- if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
- ext4_debug("ext4 move extent: The argument files should be "
- "regular file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- /* TODO: it's not obvious how to swap blocks for inodes with full
- journaling enabled */
- if (ext4_should_journal_data(orig_inode) ||
- ext4_should_journal_data(donor_inode)) {
- ext4_msg(orig_inode->i_sb, KERN_ERR,
- "Online defrag not supported with data journaling");
- return -EOPNOTSUPP;
- }
-
- if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
- ext4_msg(orig_inode->i_sb, KERN_ERR,
- "Online defrag not supported for encrypted files");
- return -EOPNOTSUPP;
- }
+ *moved_len = 0;
/* Protect orig and donor inodes against a truncate */
lock_two_nondirectories(orig_inode, donor_inode);
+ ret = mext_check_validity(orig_inode, donor_inode);
+ if (ret)
+ goto out;
+
/* Wait for all existing dio workers */
inode_dio_wait(orig_inode);
inode_dio_wait(donor_inode);
- /* Protect extent tree against block allocations via delalloc */
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- /* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
- donor_blk, &len);
+ /* Check and adjust the specified move_extent range. */
+ ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk,
+ donor_blk, &len);
if (ret)
goto out;
- o_end = o_start + len;
- while (o_start < o_end) {
- struct ext4_extent *ex;
- ext4_lblk_t cur_blk, next_blk;
- pgoff_t orig_page_index, donor_page_index;
- int offset_in_page;
- int unwritten, cur_len;
+ mext.orig_inode = orig_inode;
+ mext.donor_inode = donor_inode;
+ while (len) {
+ mext.orig_map.m_lblk = orig_blk;
+ mext.orig_map.m_len = len;
+ mext.orig_map.m_flags = 0;
+ mext.donor_lblk = donor_blk;
- ret = get_ext_path(orig_inode, o_start, &path);
- if (ret)
+ ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0);
+ if (ret < 0)
goto out;
- ex = path[path->p_depth].p_ext;
- next_blk = ext4_ext_next_allocated_block(path);
- cur_blk = le32_to_cpu(ex->ee_block);
- cur_len = ext4_ext_get_actual_len(ex);
- /* Check hole before the start pos */
- if (cur_blk + cur_len - 1 < o_start) {
- if (next_blk == EXT_MAX_BLOCKS) {
- o_start = o_end;
- ret = -ENODATA;
- goto out;
+
+ /* Skip moving if it is a hole or a delalloc extent. */
+ if (mext.orig_map.m_flags &
+ (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) {
+ ret = mext_move_extent(&mext, &m_len);
+ *moved_len += m_len;
+ if (!ret)
+ goto next;
+
+ /* Move failed or partially failed. */
+ if (m_len) {
+ orig_blk += m_len;
+ donor_blk += m_len;
+ len -= m_len;
}
- d_start += next_blk - o_start;
- o_start = next_blk;
- continue;
- /* Check hole after the start pos */
- } else if (cur_blk > o_start) {
- /* Skip hole */
- d_start += cur_blk - o_start;
- o_start = cur_blk;
- /* Extent inside requested range ?*/
- if (cur_blk >= o_end)
- goto out;
- } else { /* in_range(o_start, o_blk, o_len) */
- cur_len += cur_blk - o_start;
+ if (ret == -ESTALE)
+ continue;
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(sb, &retries))
+ continue;
+ if (ret == -EBUSY &&
+ sbi->s_journal && retries++ < 4 &&
+ jbd2_journal_force_commit_nested(sbi->s_journal))
+ continue;
+
+ goto out;
}
- unwritten = ext4_ext_is_unwritten(ex);
- if (o_end - o_start < cur_len)
- cur_len = o_end - o_start;
-
- orig_page_index = o_start >> (PAGE_SHIFT -
- orig_inode->i_blkbits);
- donor_page_index = d_start >> (PAGE_SHIFT -
- donor_inode->i_blkbits);
- offset_in_page = o_start % blocks_per_page;
- if (cur_len > blocks_per_page- offset_in_page)
- cur_len = blocks_per_page - offset_in_page;
- /*
- * Up semaphore to avoid following problems:
- * a. transaction deadlock among ext4_journal_start,
- * ->write_begin via pagefault, and jbd2_journal_commit
- * b. racing with ->readpage, ->write_begin, and ext4_get_block
- * in move_extent_per_page
- */
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
- /* Swap original branches with new branches */
- move_extent_per_page(o_filp, donor_inode,
- orig_page_index, donor_page_index,
- offset_in_page, cur_len,
- unwritten, &ret);
- ext4_double_down_write_data_sem(orig_inode, donor_inode);
- if (ret < 0)
- break;
- o_start += cur_len;
- d_start += cur_len;
+next:
+ orig_blk += mext.orig_map.m_len;
+ donor_blk += mext.orig_map.m_len;
+ len -= mext.orig_map.m_len;
+ retries = 0;
}
- *moved_len = o_start - orig_blk;
- if (*moved_len > len)
- *moved_len = len;
out:
if (*moved_len) {
@@ -690,10 +649,6 @@ out:
ext4_discard_preallocations(donor_inode);
}
- ext4_ext_drop_refs(path);
- kfree(path);
- ext4_double_up_write_data_sem(orig_inode, donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
-
return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 56738b538ddf..c4b5e252af0e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
ext4_lblk_t *block)
{
+ struct ext4_map_blocks map;
struct buffer_head *bh;
int err;
@@ -63,20 +64,41 @@ static struct buffer_head *ext4_append(handle_t *handle,
return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ map.m_lblk = *block;
+ map.m_len = 1;
+
+ /*
+ * We're appending new directory block. Make sure the block is not
+ * allocated yet, otherwise we will end up corrupting the
+ * directory.
+ */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "Logical block already allocated");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
if (IS_ERR(bh))
return bh;
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ goto out;
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- ext4_std_error(inode->i_sb, err);
- return ERR_PTR(err);
- }
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
return bh;
+
+out:
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
}
static int ext4_dx_csum_verify(struct inode *inode,
@@ -109,6 +131,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
+ if (block >= inode->i_size >> inode->i_blkbits) {
+ ext4_error_inode(inode, func, line, block,
+ "Attempting to read directory block (%u) that is past i_size (%llu)",
+ block, inode->i_size);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
bh = ERR_PTR(-EIO);
else
@@ -122,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
- if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
+ /* The first directory block must not be a hole. */
+ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) {
ext4_error_inode(inode, func, line, block,
- "Directory hole found for htree %s block",
- (type == INDEX) ? "index" : "leaf");
+ "Directory hole found for htree %s block %u",
+ (type == INDEX) ? "index" : "leaf", block);
return ERR_PTR(-EFSCORRUPTED);
}
if (!bh)
@@ -146,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
brelse(bh);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!ext4_has_metadata_csum(inode->i_sb) ||
+ if (!ext4_has_feature_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
return bh;
@@ -182,10 +212,6 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
-#ifndef assert
-#define assert(test) J_ASSERT(test)
-#endif
-
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
@@ -265,34 +291,6 @@ struct dx_tail {
__le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
};
-static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
-static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash(struct dx_entry *entry);
-static void dx_set_hash(struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count(struct dx_entry *entries);
-static unsigned dx_get_limit(struct dx_entry *entries);
-static void dx_set_count(struct dx_entry *entries, unsigned value);
-static void dx_set_limit(struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(struct ext4_filename *fname,
- struct inode *dir,
- struct dx_hash_info *hinfo,
- struct dx_frame *frame);
-static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
- struct dx_map_entry map[]);
-static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
- struct dx_map_entry *offsets, int count, unsigned blocksize);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
-static void dx_insert_block(struct dx_frame *frame,
- u32 hash, ext4_lblk_t block);
-static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
- __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
@@ -316,17 +314,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
struct ext4_dir_entry *d, *top;
d = (struct ext4_dir_entry *)bh->b_data;
top = (struct ext4_dir_entry *)(bh->b_data +
- (EXT4_BLOCK_SIZE(inode->i_sb) -
- sizeof(struct ext4_dir_entry_tail)));
- while (d < top && d->rec_len)
+ (blocksize - sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
d = (struct ext4_dir_entry *)(((void *)d) +
- le16_to_cpu(d->rec_len));
+ ext4_rec_len_from_disk(d->rec_len, blocksize));
if (d != top)
return NULL;
@@ -337,7 +335,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
#endif
if (t->det_reserved_zero1 ||
- le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
+ sizeof(struct ext4_dir_entry_tail)) ||
t->det_reserved_zero2 ||
t->det_reserved_ft != EXT4_FT_DIR_CSUM)
return NULL;
@@ -347,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
return cpu_to_le32(csum);
}
@@ -369,7 +367,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
t = get_dirent_tail(inode, bh);
@@ -390,7 +388,7 @@ static void ext4_dirblock_csum_set(struct inode *inode,
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
t = get_dirent_tail(inode, bh);
@@ -418,13 +416,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
struct ext4_dir_entry *dp;
struct dx_root_info *root;
int count_offset;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
+ unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
- if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ if (rlen == blocksize)
count_offset = 8;
- else if (le16_to_cpu(dirent->rec_len) == 12) {
+ else if (rlen == 12) {
dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
- if (le16_to_cpu(dp->rec_len) !=
- EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
return NULL;
root = (struct dx_root_info *)(((void *)dp + 12));
if (root->reserved_zero ||
@@ -442,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int count_offset, int count, struct dx_tail *t)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
int size;
@@ -450,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int offset = offsetof(struct dx_tail, dt_checksum);
size = count_offset + (count * sizeof(struct dx_entry));
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(csum, (__u8 *)t, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
return cpu_to_le32(csum);
}
@@ -464,7 +462,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -493,7 +491,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -578,19 +576,21 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
- EXT4_DIR_REC_LEN(2) - infosize;
+ unsigned int entry_space = dir->i_sb->s_blocksize -
+ ext4_dir_rec_len(1, NULL) -
+ ext4_dir_rec_len(2, NULL) - infosize;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit(struct inode *dir)
{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+ unsigned int entry_space = dir->i_sb->s_blocksize -
+ ext4_dir_rec_len(0, dir);
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -643,15 +643,9 @@ static struct stats dx_show_leaf(struct inode *dir,
name = de->name;
len = de->name_len;
- if (IS_ENCRYPTED(dir))
- res = fscrypt_get_encryption_info(dir);
- if (res) {
- printk(KERN_WARNING "Error setting up"
- " fname crypto: %d\n", res);
- }
- if (!fscrypt_has_encryption_key(dir)) {
+ if (!IS_ENCRYPTED(dir)) {
/* Directory is not encrypted */
- ext4fs_dirhash(dir, de->name,
+ (void) ext4fs_dirhash(dir, de->name,
de->name_len, &h);
printk("%*.s:(U)%x.%u ", len,
name, h.hash,
@@ -663,8 +657,7 @@ static struct stats dx_show_leaf(struct inode *dir,
/* Directory is encrypted */
res = fscrypt_fname_alloc_buffer(
- dir, len,
- &fname_crypto_str);
+ len, &fname_crypto_str);
if (res)
printk(KERN_WARNING "Error "
"allocating crypto "
@@ -684,8 +677,12 @@ static struct stats dx_show_leaf(struct inode *dir,
name = fname_crypto_str.name;
len = fname_crypto_str.len;
}
- ext4fs_dirhash(dir, de->name,
- de->name_len, &h);
+ if (IS_CASEFOLDED(dir))
+ h.hash = EXT4_DIRENT_HASH(de);
+ else
+ (void) ext4fs_dirhash(dir,
+ de->name,
+ de->name_len, &h);
printk("%*.s:(E)%x.%u ", len, name,
h.hash, (unsigned) ((char *) de
- base));
@@ -695,12 +692,13 @@ static struct stats dx_show_leaf(struct inode *dir,
#else
int len = de->name_len;
char *name = de->name;
- ext4fs_dirhash(dir, de->name, de->name_len, &h);
+ (void) ext4fs_dirhash(dir, de->name,
+ de->name_len, &h);
printk("%*.s:%x.%u ", len, name, h.hash,
(unsigned) ((char *) de - base));
#endif
}
- space += EXT4_DIR_REC_LEN(de->name_len);
+ space += ext4_dir_rec_len(de->name_len, dir);
names++;
}
de = ext4_next_entry(de, size);
@@ -742,6 +740,29 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
(space/bcount)*100/blocksize);
return (struct stats) { names, space, bcount};
}
+
+/*
+ * Linear search cross check
+ */
+static inline void htree_rep_invariant_check(struct dx_entry *at,
+ struct dx_entry *target,
+ u32 hash, unsigned int n)
+{
+ while (n--) {
+ dxtrace(printk(KERN_CONT ","));
+ if (dx_get_hash(++at) > hash) {
+ at--;
+ break;
+ }
+ }
+ ASSERT(at == target - 1);
+}
+#else /* DX_DEBUG */
+static inline void htree_rep_invariant_check(struct dx_entry *at,
+ struct dx_entry *target,
+ u32 hash, unsigned int n)
+{
+}
#endif /* DX_DEBUG */
/*
@@ -757,12 +778,14 @@ static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
- unsigned count, indirect;
+ unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
struct dx_root *root;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
+ ext4_lblk_t block;
+ ext4_lblk_t blocks[EXT4_HTREE_LEVEL];
memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
@@ -772,19 +795,41 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
root = (struct dx_root *) frame->bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
- root->info.hash_version != DX_HASH_LEGACY) {
+ root->info.hash_version != DX_HASH_LEGACY &&
+ root->info.hash_version != DX_HASH_SIPHASH) {
ext4_warning_inode(dir, "Unrecognised inode hash code %u",
root->info.hash_version);
goto fail;
}
+ if (ext4_hash_in_dirent(dir)) {
+ if (root->info.hash_version != DX_HASH_SIPHASH) {
+ ext4_warning_inode(dir,
+ "Hash in dirent, but hash is not SIPHASH");
+ goto fail;
+ }
+ } else {
+ if (root->info.hash_version == DX_HASH_SIPHASH) {
+ ext4_warning_inode(dir,
+ "Hash code is SIPHASH, but hash not in dirent");
+ goto fail;
+ }
+ }
if (fname)
hinfo = &fname->hinfo;
hinfo->hash_version = root->info.hash_version;
if (hinfo->hash_version <= DX_HASH_TEA)
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- if (fname && fname_name(fname))
- ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
+ /* hash is already computed for encrypted casefolded directory */
+ if (fname && fname_name(fname) &&
+ !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) {
+ int ret = ext4fs_dirhash(dir, fname_name(fname),
+ fname_len(fname), hinfo);
+ if (ret < 0) {
+ ret_err = ERR_PTR(ret);
+ goto fail;
+ }
+ }
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
@@ -818,6 +863,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
dxtrace(printk("Look up %x", hash));
+ level = 0;
+ blocks[0] = 0;
while (1) {
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
@@ -838,20 +885,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
p = m + 1;
}
- if (0) { // linear search cross check
- unsigned n = count - 1;
- at = entries;
- while (n--)
- {
- dxtrace(printk(KERN_CONT ","));
- if (dx_get_hash(++at) > hash)
- {
- at--;
- break;
- }
- }
- assert (at == p - 1);
- }
+ htree_rep_invariant_check(entries, p, hash, count - 1);
at = p - 1;
dxtrace(printk(KERN_CONT " %x->%u\n",
@@ -859,15 +893,27 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
- if (!indirect--)
+
+ block = dx_get_block(at);
+ for (i = 0; i <= level; i++) {
+ if (blocks[i] == block) {
+ ext4_warning_inode(dir,
+ "dx entry: tree cycle block %u points back to block %u",
+ blocks[level], block);
+ goto fail;
+ }
+ }
+ if (++level > indirect)
return frame;
+ blocks[level] = block;
frame++;
- frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ frame->bh = ext4_read_dirblock(dir, block, INDEX);
if (IS_ERR(frame->bh)) {
ret_err = (struct dx_frame *) frame->bh;
frame->bh = NULL;
goto fail;
}
+
entries = ((struct dx_node *) frame->bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit(dir)) {
@@ -957,7 +1003,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
* If the hash is 1, then continue only if the next page has a
* continuation hash of any value. This is used for readdir
* handling. Otherwise, check to see if the hash matches the
- * desired contiuation hash. If it doesn't, return since
+ * desired continuation hash. If it doesn't, return since
* there's no point to read in the successive index pages.
*/
bhash = dx_get_hash(p->at);
@@ -998,6 +1044,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
+ int csum = ext4_has_feature_metadata_csum(dir->i_sb);
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -1006,18 +1053,20 @@ static int htree_dirblock_to_tree(struct file *dir_file,
return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
+ /* csum entries are not larger in the casefolded encrypted case */
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
- EXT4_DIR_REC_LEN(0));
+ ext4_dir_rec_len(0,
+ csum ? NULL : dir));
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
- err = fscrypt_get_encryption_info(dir);
+ err = fscrypt_prepare_readdir(dir);
if (err < 0) {
brelse(bh);
return err;
}
- err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN,
- &fname_crypto_str);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
+ &fname_crypto_str);
if (err < 0) {
brelse(bh);
return err;
@@ -1027,12 +1076,27 @@ static int htree_dirblock_to_tree(struct file *dir_file,
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size,
- (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ EXT4_LBLK_TO_B(dir, block)
+ ((char *)de - bh->b_data))) {
/* silently ignore the rest of the block */
break;
}
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (ext4_hash_in_dirent(dir)) {
+ if (de->name_len && de->inode) {
+ hinfo->hash = EXT4_DIRENT_HASH(de);
+ hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ hinfo->hash = 0;
+ hinfo->minor_hash = 0;
+ }
+ } else {
+ err = ext4fs_dirhash(dir, de->name,
+ de->name_len, hinfo);
+ if (err < 0) {
+ count = err;
+ goto errout;
+ }
+ }
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1101,7 +1165,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
start_hash, start_minor_hash));
dir = file_inode(dir_file);
if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
- hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (ext4_hash_in_dirent(dir))
+ hinfo.hash_version = DX_HASH_SIPHASH;
+ else
+ hinfo.hash_version =
+ EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
@@ -1209,25 +1277,41 @@ static inline int search_dirblock(struct buffer_head *bh,
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
{
int count = 0;
- char *base = (char *) de;
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data;
+ unsigned int buflen = bh->b_size;
+ char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
+ int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
+
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
+ buflen -= sizeof(struct ext4_dir_entry_tail);
- while ((char *) de < base + blocksize) {
+ while ((char *) de < base + buflen) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen,
+ ((char *)de) - base))
+ return -EFSCORRUPTED;
if (de->name_len && de->inode) {
- ext4fs_dirhash(dir, de->name, de->name_len, &h);
+ if (ext4_hash_in_dirent(dir))
+ h.hash = EXT4_DIRENT_HASH(de);
+ else {
+ int err = ext4fs_dirhash(dir, de->name,
+ de->name_len, &h);
+ if (err < 0)
+ return err;
+ }
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
- map_tail->size = le16_to_cpu(de->rec_len);
+ map_tail->size = ext4_rec_len_from_disk(de->rec_len,
+ blocksize);
count++;
cond_resched();
}
- /* XXX: do we need to check rec_len == 0 case? -Chris */
de = ext4_next_entry(de, blocksize);
}
return count;
@@ -1266,75 +1350,50 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
struct dx_entry *old = frame->at, *new = old + 1;
int count = dx_get_count(entries);
- assert(count < dx_get_limit(entries));
- assert(old < entries + count);
+ ASSERT(count < dx_get_limit(entries));
+ ASSERT(old < entries + count);
memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
dx_set_hash(new, hash);
dx_set_block(new, block);
dx_set_count(entries, count + 1);
}
-#ifdef CONFIG_UNICODE
-/*
- * Test whether a case-insensitive directory entry matches the filename
- * being searched for. If quick is set, assume the name being looked up
- * is already in the casefolded form.
- *
- * Returns: 0 if the directory entry matches, more than 0 if it
- * doesn't match or less than zero on error.
- */
-int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
- const struct qstr *entry, bool quick)
-{
- const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
- const struct unicode_map *um = sbi->s_encoding;
- int ret;
-
- if (quick)
- ret = utf8_strncasecmp_folded(um, name, entry);
- else
- ret = utf8_strncasecmp(um, name, entry);
-
- if (ret < 0) {
- /* Handle invalid character sequence as either an error
- * or as an opaque byte sequence.
- */
- if (ext4_has_strict_mode(sbi))
- return -EINVAL;
-
- if (name->len != entry->len)
- return 1;
-
- return !!memcmp(name->name, entry->name, name->len);
- }
-
- return ret;
-}
-
-void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
- struct fscrypt_str *cf_name)
+#if IS_ENABLED(CONFIG_UNICODE)
+int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+ struct ext4_filename *name)
{
+ struct qstr *cf_name = &name->cf_name;
+ unsigned char *buf;
+ struct dx_hash_info *hinfo = &name->hinfo;
int len;
- if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
+ if (!IS_CASEFOLDED(dir) ||
+ (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) {
cf_name->name = NULL;
- return;
+ return 0;
}
- cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
- if (!cf_name->name)
- return;
+ buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
- len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
- iname, cf_name->name,
- EXT4_NAME_LEN);
+ len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN);
if (len <= 0) {
- kfree(cf_name->name);
- cf_name->name = NULL;
- return;
+ kfree(buf);
+ buf = NULL;
}
+ cf_name->name = buf;
cf_name->len = (unsigned) len;
+ if (!IS_ENCRYPTED(dir))
+ return 0;
+
+ hinfo->hash_version = DX_HASH_SIPHASH;
+ hinfo->seed = NULL;
+ if (cf_name->name)
+ return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo);
+ else
+ return ext4fs_dirhash(dir, iname->name, iname->len, hinfo);
}
#endif
@@ -1343,14 +1402,11 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
*
* Return: %true if the directory entry matches, otherwise %false.
*/
-static inline bool ext4_match(const struct inode *parent,
+static bool ext4_match(struct inode *parent,
const struct ext4_filename *fname,
- const struct ext4_dir_entry_2 *de)
+ struct ext4_dir_entry_2 *de)
{
struct fscrypt_name f;
-#ifdef CONFIG_UNICODE
- const struct qstr entry = {.name = de->name, .len = de->name_len};
-#endif
if (!de->inode)
return false;
@@ -1361,15 +1417,33 @@ static inline bool ext4_match(const struct inode *parent,
f.crypto_buf = fname->crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
- if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
- if (fname->cf_name.name) {
- struct qstr cf = {.name = fname->cf_name.name,
- .len = fname->cf_name.len};
- return !ext4_ci_compare(parent, &cf, &entry, true);
- }
- return !ext4_ci_compare(parent, fname->usr_fname, &entry,
- false);
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (IS_CASEFOLDED(parent) &&
+ (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
+ /*
+ * Just checking IS_ENCRYPTED(parent) below is not
+ * sufficient to decide whether one can use the hash for
+ * skipping the string comparison, because the key might
+ * have been added right after
+ * ext4_fname_setup_ci_filename(). In this case, a hash
+ * mismatch will be a false negative. Therefore, make
+ * sure cf_name was properly initialized before
+ * considering the calculated hash.
+ */
+ if (sb_no_casefold_compat_fallback(parent->i_sb) &&
+ IS_ENCRYPTED(parent) && fname->cf_name.name &&
+ (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
+ fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
+ return false;
+ /*
+ * Treat comparison errors as not a match. The
+ * only case where it happens is on a disk
+ * corruption or ENOMEM.
+ */
+
+ return generic_ci_match(parent, fname->usr_fname,
+ &fname->cf_name, de->name,
+ de->name_len) > 0;
}
#endif
@@ -1377,7 +1451,7 @@ static inline bool ext4_match(const struct inode *parent,
}
/*
- * Returns 0 if not found, -1 on failure, and 1 on success
+ * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success
*/
int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct inode *dir, struct ext4_filename *fname,
@@ -1389,16 +1463,16 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
- bh->b_size, offset))
- return -1;
+ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
+ buf_size, offset))
+ return -EFSCORRUPTED;
*res_dir = de;
return 1;
}
@@ -1406,7 +1480,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
if (de_len <= 0)
- return -1;
+ return -EFSCORRUPTED;
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
@@ -1467,11 +1541,10 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
int has_inline_data = 1;
ret = ext4_find_inline_entry(dir, fname, res_dir,
&has_inline_data);
- if (has_inline_data) {
- if (inlined)
- *inlined = 1;
+ if (inlined)
+ *inlined = has_inline_data;
+ if (has_inline_data || IS_ERR(ret))
goto cleanup_and_exit;
- }
}
if ((namelen <= 2) && (name[0] == '.') &&
@@ -1491,10 +1564,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
+ if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR)
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
+ else if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
+ *res_dir == NULL && IS_CASEFOLDED(dir))
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold "
+ "failed, falling back\n"));
+ else
goto cleanup_and_exit;
- dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
- "falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -1552,15 +1630,17 @@ restart:
}
set_buffer_verified(bh);
i = search_dirblock(bh, dir, fname,
- block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
+ EXT4_LBLK_TO_B(dir, block), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
ret = bh;
goto cleanup_and_exit;
} else {
brelse(bh);
- if (i < 0)
+ if (i < 0) {
+ ret = ERR_PTR(i);
goto cleanup_and_exit;
+ }
}
next:
if (++block >= nblocks)
@@ -1630,7 +1710,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir)
{
- struct super_block * sb = dir->i_sb;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct buffer_head *bh;
ext4_lblk_t block;
@@ -1641,7 +1720,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
#endif
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
- return (struct buffer_head *) frame;
+ return ERR_CAST(frame);
do {
block = dx_get_block(frame->at);
bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
@@ -1649,12 +1728,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
goto errout;
retval = search_dirblock(bh, dir, fname,
- block << EXT4_BLOCK_SIZE_BITS(sb),
- res_dir);
+ EXT4_LBLK_TO_B(dir, block), res_dir);
if (retval == 1)
goto success;
brelse(bh);
- if (retval == -1) {
+ if (retval < 0) {
bh = ERR_PTR(ERR_BAD_DX_DIR);
goto errout;
}
@@ -1682,7 +1760,7 @@ success:
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
- struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_2 *de = NULL;
struct buffer_head *bh;
if (dentry->d_name.len > EXT4_NAME_LEN)
@@ -1722,8 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
}
-#ifdef CONFIG_UNICODE
- if (!inode && IS_CASEFOLDED(dir)) {
+ if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
* well. For now, prevent the negative dentry
@@ -1731,7 +1808,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
*/
return NULL;
}
-#endif
+
return d_splice_alias(inode, dentry);
}
@@ -1739,11 +1816,10 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct dentry *ext4_get_parent(struct dentry *child)
{
__u32 ino;
- static const struct qstr dotdot = QSTR_INIT("..", 2);
- struct ext4_dir_entry_2 * de;
+ struct ext4_dir_entry_2 * de = NULL;
struct buffer_head *bh;
- bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
+ bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
if (IS_ERR(bh))
return ERR_CAST(bh);
if (!bh)
@@ -1765,7 +1841,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
* Returns pointer to last entry moved.
*/
static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+dx_move_dirents(struct inode *dir, char *from, char *to,
+ struct dx_map_entry *map, int count,
unsigned blocksize)
{
unsigned rec_len = 0;
@@ -1773,11 +1850,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
while (count--) {
struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
(from + (map->offs<<2));
- rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ rec_len = ext4_dir_rec_len(de->name_len, dir);
+
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
ext4_rec_len_to_disk(rec_len, blocksize);
+
+ /* wipe dir_entry excluding the rec_len field */
de->inode = 0;
+ memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len,
+ blocksize) -
+ offsetof(struct ext4_dir_entry_2,
+ name_len));
+
map++;
to += rec_len;
}
@@ -1788,7 +1873,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
* Compact each dir entry in the range to the minimal rec_len.
* Returns pointer to last entry in range.
*/
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
+ unsigned int blocksize)
{
struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
unsigned rec_len = 0;
@@ -1797,7 +1883,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
while ((char*)de < base + blocksize) {
next = ext4_next_entry(de, blocksize);
if (de->inode && de->name_len) {
- rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ rec_len = ext4_dir_rec_len(de->name_len, dir);
if (de > to)
memmove(to, de, rec_len);
to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
@@ -1819,7 +1905,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
struct dx_hash_info *hinfo)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
+ unsigned continued;
+ int count;
struct buffer_head *bh2;
ext4_lblk_t newblock;
u32 hash2;
@@ -1830,23 +1917,25 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err = 0, i;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
bh2 = ext4_append(handle, dir, &newblock);
if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- return (struct ext4_dir_entry_2 *) bh2;
+ return ERR_CAST(bh2);
}
BUFFER_TRACE(*bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, *bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, *bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, frame->bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -1854,11 +1943,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
- blocksize, hinfo, map);
+ count = dx_make_map(dir, *bh, hinfo, map);
+ if (count < 0) {
+ err = count;
+ goto journal_error;
+ }
map -= count;
dx_sort_map(map, count);
- /* Split the existing block in the middle, size-wise */
+ /* Ensure that neither split block is over half full */
size = 0;
move = 0;
for (i = count-1; i >= 0; i--) {
@@ -1868,8 +1960,27 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
size += map[i].size;
move++;
}
- /* map index at which we will split */
- split = count - move;
+ /*
+ * map index at which we will split
+ *
+ * If the sum of active entries didn't exceed half the block size, just
+ * split it in half by count; each resulting block will have at least
+ * half the space free.
+ */
+ if (i >= 0)
+ split = count - move;
+ else
+ split = count/2;
+
+ if (WARN_ON_ONCE(split == 0)) {
+ /* Should never happen, but avoid out-of-bounds access below */
+ ext4_error_inode_block(dir, (*bh)->b_blocknr, 0,
+ "bad indexed directory? hash=%08x:%08x count=%d move=%u",
+ hinfo->hash, hinfo->minor_hash, count, move);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
@@ -1877,9 +1988,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split,
+ de2 = dx_move_dirents(dir, data1, data2, map + split, count - split,
blocksize);
- de = dx_pack_dirents(data1, blocksize);
+ de = dx_pack_dirents(dir, data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
blocksize);
@@ -1913,26 +2024,26 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
return de;
journal_error:
+ ext4_std_error(dir->i_sb, err);
+out:
brelse(*bh);
brelse(bh2);
*bh = NULL;
- ext4_std_error(dir->i_sb, err);
return ERR_PTR(err);
}
-int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de)
{
struct ext4_dir_entry_2 *de;
- unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
+ unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir);
int nlen, rlen;
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -1940,7 +2051,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
return -EFSCORRUPTED;
if (ext4_match(dir, fname, de))
return -EEXIST;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
+ nlen = ext4_dir_rec_len(de->name_len, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
break;
@@ -1954,7 +2065,8 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
return 0;
}
-void ext4_insert_dentry(struct inode *inode,
+void ext4_insert_dentry(struct inode *dir,
+ struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
struct ext4_filename *fname)
@@ -1962,7 +2074,7 @@ void ext4_insert_dentry(struct inode *inode,
int nlen, rlen;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
+ nlen = ext4_dir_rec_len(de->name_len, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if (de->inode) {
struct ext4_dir_entry_2 *de1 =
@@ -1976,6 +2088,13 @@ void ext4_insert_dentry(struct inode *inode,
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
de->name_len = fname_len(fname);
memcpy(de->name, fname_name(fname), fname_len(fname));
+ if (ext4_hash_in_dirent(dir)) {
+ struct dx_hash_info *hinfo = &fname->hinfo;
+
+ EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash);
+ EXT4_DIRENT_HASHES(de)->minor_hash =
+ cpu_to_le32(hinfo->minor_hash);
+ }
}
/*
@@ -1995,24 +2114,25 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
int csum_size = 0;
int err, err2;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
- err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+ err = ext4_find_dest_de(dir, bh, bh->b_data,
blocksize - csum_size, fname, &de);
if (err)
return err;
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(dir->i_sb, err);
return err;
}
/* By now the buffer is marked for journaling */
- ext4_insert_dentry(inode, de, blocksize, fname);
+ ext4_insert_dentry(dir, inode, de, blocksize, fname);
/*
* XXX shouldn't update any times until successful
@@ -2025,7 +2145,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
@@ -2036,6 +2156,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err ? err : err2;
}
+static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
+{
+ struct fake_dirent *fde;
+ const char *error_msg;
+ unsigned int rlen;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ char *blockend = (char *)root + dir->i_sb->s_blocksize;
+
+ fde = &root->dot;
+ if (unlikely(fde->name_len != 1)) {
+ error_msg = "invalid name_len for '.'";
+ goto corrupted;
+ }
+ if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
+ error_msg = "invalid name for '.'";
+ goto corrupted;
+ }
+ rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+ if (unlikely((char *)fde + rlen >= blockend)) {
+ error_msg = "invalid rec_len for '.'";
+ goto corrupted;
+ }
+
+ fde = &root->dotdot;
+ if (unlikely(fde->name_len != 2)) {
+ error_msg = "invalid name_len for '..'";
+ goto corrupted;
+ }
+ if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
+ error_msg = "invalid name for '..'";
+ goto corrupted;
+ }
+ rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+ if (unlikely((char *)fde + rlen >= blockend)) {
+ error_msg = "invalid rec_len for '..'";
+ goto corrupted;
+ }
+
+ return true;
+
+corrupted:
+ EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended",
+ error_msg);
+ return false;
+}
+
/*
* This converts a one block unindexed directory to a 3 block indexed
* directory, and adds the dentry to the indexed directory.
@@ -2057,29 +2223,30 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
struct fake_dirent *fde;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
BUFFER_TRACE(bh, "get_write_access");
- retval = ext4_journal_get_write_access(handle, bh);
+ retval = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (retval) {
ext4_std_error(dir->i_sb, retval);
brelse(bh);
return retval;
}
+
root = (struct dx_root *) bh->b_data;
+ if (!ext4_check_dx_root(dir, root)) {
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
/* The 0th block becomes the root, move the dirents out */
fde = &root->dotdot;
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
- if ((char *) de >= (((char *) root) + blocksize)) {
- EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
- brelse(bh);
- return -EFSCORRUPTED;
- }
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
@@ -2092,10 +2259,18 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
data2 = bh2->b_data;
memcpy(data2, de, len);
+ memset(de, 0, len); /* wipe old data */
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
- while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+ (char *)de - data2)) {
+ brelse(bh2);
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
de = de2;
+ }
de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
(char *) de, blocksize);
@@ -2104,11 +2279,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
- blocksize);
+ de->rec_len = ext4_rec_len_to_disk(
+ blocksize - ext4_dir_rec_len(2, NULL), blocksize);
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (ext4_hash_in_dirent(dir))
+ root->info.hash_version = DX_HASH_SIPHASH;
+ else
+ root->info.hash_version =
+ EXT4_SB(dir->i_sb)->s_def_hash_version;
+
entries = root->entries;
dx_set_block(entries, 1);
dx_set_count(entries, 1);
@@ -2119,8 +2299,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
if (fname->hinfo.hash_version <= DX_HASH_TEA)
fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo);
+ /* casefolded encrypted hashes are computed on fname setup */
+ if (!ext4_hash_in_dirent(dir)) {
+ int err = ext4fs_dirhash(dir, fname_name(fname),
+ fname_len(fname), &fname->hinfo);
+ if (err < 0) {
+ brelse(bh2);
+ brelse(bh);
+ return err;
+ }
+ }
memset(frames, 0, sizeof(frames));
frame = frames;
frame->entries = entries;
@@ -2129,10 +2318,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (retval)
- goto out_frames;
+ goto out_frames;
retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
if (retval)
- goto out_frames;
+ goto out_frames;
de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
if (IS_ERR(de)) {
@@ -2171,9 +2360,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
-#ifdef CONFIG_UNICODE
- struct ext4_sb_info *sbi;
-#endif
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@@ -2181,20 +2367,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block, blocks;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
blocksize = sb->s_blocksize;
- if (!dentry->d_name.len)
- return -EINVAL;
-#ifdef CONFIG_UNICODE
- sbi = EXT4_SB(sb);
- if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
- sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
+ if (fscrypt_is_nokey_name(dentry))
+ return -ENOKEY;
+
+ if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
return -EINVAL;
-#endif
retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
if (retval)
@@ -2215,7 +2398,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
/* Can we just ignore htree data? */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
EXT4_ERROR_INODE(dir,
"Directory has corrupted htree index.");
retval = -EFSCORRUPTED;
@@ -2306,7 +2489,7 @@ again:
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -2363,9 +2546,12 @@ again:
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, frame->bh);
- if (err)
+ err = ext4_journal_get_write_access(handle, sb, frame->bh,
+ EXT4_JTR_NONE);
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
@@ -2373,10 +2559,13 @@ again:
icount1, icount2));
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
- err = ext4_journal_get_write_access(handle,
- (frame - 1)->bh);
- if (err)
+ err = ext4_journal_get_write_access(handle, sb,
+ (frame - 1)->bh,
+ EXT4_JTR_NONE);
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
memcpy((char *) entries2, (char *) (entries + icount1),
icount2 * sizeof(struct dx_entry));
@@ -2386,7 +2575,7 @@ again:
/* Which index block gets the new entry? */
if (at - entries >= icount1) {
- frame->at = at = at - entries - icount1 + entries2;
+ frame->at = at - entries - icount1 + entries2;
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
@@ -2395,18 +2584,19 @@ again:
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
brelse (bh2);
err = ext4_handle_dirty_dx_node(handle, dir,
(frame - 1)->bh);
if (err)
goto journal_error;
- if (restart) {
- err = ext4_handle_dirty_dx_node(handle, dir,
- frame->bh);
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ frame->bh);
+ if (restart || err)
goto journal_error;
- }
} else {
struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
@@ -2422,8 +2612,10 @@ again:
"Creating %d level index...\n",
dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
brelse(bh2);
restart = 1;
@@ -2455,8 +2647,7 @@ cleanup:
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
*/
-int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2469,21 +2660,33 @@ int ext4_generic_delete_entry(handle_t *handle,
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *)entry_buf;
+ de = entry_buf;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data, bh->b_size, i))
+ entry_buf, buf_size, i))
return -EFSCORRUPTED;
if (de == de_del) {
- if (pde)
+ if (pde) {
pde->rec_len = ext4_rec_len_to_disk(
ext4_rec_len_from_disk(pde->rec_len,
blocksize) +
ext4_rec_len_from_disk(de->rec_len,
blocksize),
blocksize);
- else
+
+ /* wipe entire dir_entry */
+ memset(de, 0, ext4_rec_len_from_disk(de->rec_len,
+ blocksize));
+ } else {
+ /* wipe dir_entry excluding the rec_len field */
de->inode = 0;
+ memset(&de->name_len, 0,
+ ext4_rec_len_from_disk(de->rec_len,
+ blocksize) -
+ offsetof(struct ext4_dir_entry_2,
+ name_len));
+ }
+
inode_inc_iversion(dir);
return 0;
}
@@ -2509,16 +2712,16 @@ static int ext4_delete_entry(handle_t *handle,
return err;
}
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(err))
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del,
- bh, bh->b_data,
+ err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
dir->i_sb->s_blocksize, csum_size);
if (err)
goto out;
@@ -2546,7 +2749,7 @@ out:
* for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
* on regular files) and to avoid creating huge/slow non-HTREE directories.
*/
-static void ext4_inc_count(handle_t *handle, struct inode *inode)
+static void ext4_inc_count(struct inode *inode)
{
inc_nlink(inode);
if (is_dx(inode) &&
@@ -2558,7 +2761,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
* If a directory had nlink == 1, then we should let it be 1. This indicates
* directory has >EXT4_LINK_MAX subdirs.
*/
-static void ext4_dec_count(handle_t *handle, struct inode *inode)
+static void ext4_dec_count(struct inode *inode)
{
if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
drop_nlink(inode);
@@ -2586,6 +2789,7 @@ static int ext4_add_nondir(handle_t *handle,
return err;
}
drop_nlink(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
return err;
@@ -2599,8 +2803,8 @@ static int ext4_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int ext4_create(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
handle_t *handle;
struct inode *inode;
@@ -2613,8 +2817,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
- NULL, EXT4_HT_DIR, credits);
+ inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -2622,6 +2826,8 @@ retry:
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, &inode);
+ if (!err)
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@@ -2632,8 +2838,8 @@ retry:
return err;
}
-static int ext4_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
@@ -2646,14 +2852,16 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
- NULL, EXT4_HT_DIR, credits);
+ inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
err = ext4_add_nondir(handle, dentry, &inode);
+ if (!err)
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@@ -2664,7 +2872,8 @@ retry:
return err;
}
-static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
+ struct file *file, umode_t mode)
{
handle_t *handle;
struct inode *inode;
@@ -2675,10 +2884,10 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
return err;
retry:
- inode = ext4_new_inode_start_handle(dir, mode,
+ inode = ext4_new_inode_start_handle(idmap, dir, mode,
NULL, 0, NULL,
EXT4_HT_DIR,
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) +
4 + EXT4_XATTR_TRANS_BLOCKS);
handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
@@ -2686,7 +2895,7 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
err = ext4_orphan_add(handle, inode);
if (err)
goto err_unlock_inode;
@@ -2697,54 +2906,66 @@ retry:
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return finish_open_simple(file, err);
err_unlock_inode:
ext4_journal_stop(handle);
unlock_new_inode(inode);
return err;
}
-struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len)
+int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, unsigned int parent_ino,
+ void *inline_buf, int inline_size)
{
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data;
+ size_t blocksize = bh->b_size;
+ int csum_size = 0, header_size;
+
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
blocksize);
- strcpy(de->name, ".");
+ memcpy(de->name, ".", 2);
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(parent_ino);
de->name_len = 2;
- if (!dotdot_real_len)
+ memcpy(de->name, "..", 3);
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ if (inline_buf) {
+ de->rec_len = ext4_rec_len_to_disk(
+ ext4_dir_rec_len(de->name_len, NULL),
+ blocksize);
+ de = ext4_next_entry(de, blocksize);
+ header_size = (char *)de - bh->b_data;
+ memcpy((void *)de, inline_buf, inline_size);
+ ext4_update_final_de(bh->b_data, inline_size + header_size,
+ blocksize - csum_size);
+ } else {
de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + EXT4_DIR_REC_LEN(1)),
+ (csum_size + ext4_dir_rec_len(1, NULL)),
blocksize);
- else
- de->rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(de->name_len), blocksize);
- strcpy(de->name, "..");
- ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ }
- return ext4_next_entry(de, blocksize);
+ if (csum_size)
+ ext4_initialize_dirent_tail(bh, blocksize);
+ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+ set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
+ return ext4_handle_dirty_dirblock(handle, inode, bh);
}
-static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
- struct ext4_dir_entry_2 *de;
ext4_lblk_t block = 0;
- unsigned int blocksize = dir->i_sb->s_blocksize;
- int csum_size = 0;
int err;
- if (ext4_has_metadata_csum(dir->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
err = ext4_try_create_inline_dir(handle, dir, inode);
if (err < 0 && err != -ENOSPC)
@@ -2753,43 +2974,35 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
goto out;
}
+ set_nlink(inode, 2);
inode->i_size = 0;
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
- de = (struct ext4_dir_entry_2 *)dir_block->b_data;
- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
- set_nlink(inode, 2);
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block, blocksize);
-
- BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
- if (err)
- goto out;
- set_buffer_verified(dir_block);
+ err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0);
out:
brelse(dir_block);
return err;
}
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
int err, err2 = 0, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
- return -EMLINK;
+ return ERR_PTR(-EMLINK);
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+ inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode,
&dentry->d_name,
0, NULL, EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
@@ -2817,12 +3030,14 @@ out_clear_inode:
iput(inode);
goto out_retry;
}
- ext4_inc_count(handle, dir);
+ ext4_inc_count(dir);
+
ext4_update_dx_flag(dir);
err = ext4_mark_inode_dirty(handle, dir);
if (err)
goto out_clear_inode;
d_instantiate_new(dentry, inode);
+ ext4_fc_track_create(handle, dentry);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
@@ -2832,7 +3047,7 @@ out_stop:
out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return ERR_PTR(err);
}
/*
@@ -2855,33 +3070,33 @@ bool ext4_empty_dir(struct inode *inode)
}
sb = inode->i_sb;
- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+ if (inode->i_size < ext4_dir_rec_len(1, NULL) +
+ ext4_dir_rec_len(2, NULL)) {
EXT4_ERROR_INODE(inode, "invalid size");
- return true;
+ return false;
}
- /* The first directory block must not be a hole,
- * so treat it as DIRENT_HTREE
- */
- bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+ bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh))
- return true;
+ return false;
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
0) ||
- le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
+ le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 ||
+ de->name[0] != '.') {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
- return true;
+ return false;
}
offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
de = ext4_next_entry(de, sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
offset) ||
- le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
- return true;
+ return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
@@ -2895,16 +3110,13 @@ bool ext4_empty_dir(struct inode *inode)
continue;
}
if (IS_ERR(bh))
- return true;
+ return false;
}
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
- bh->b_data, bh->b_size, offset)) {
- offset = (offset | (sb->s_blocksize - 1)) + 1;
- continue;
- }
- if (le32_to_cpu(de->inode)) {
+ bh->b_data, bh->b_size, offset) ||
+ le32_to_cpu(de->inode)) {
brelse(bh);
return false;
}
@@ -2914,190 +3126,17 @@ bool ext4_empty_dir(struct inode *inode)
return true;
}
-/*
- * ext4_orphan_add() links an unlinked or truncated inode into a list of
- * such inodes, starting at the superblock, in case we crash before the
- * file is closed/deleted, or in case the inode truncate spans multiple
- * transactions and the last transaction is not recovered after a crash.
- *
- * At filesystem recovery time, we walk this list deleting unlinked
- * inodes and truncating linked inodes in ext4_orphan_cleanup().
- *
- * Orphan list manipulation functions must be called under i_mutex unless
- * we are just creating the inode or deleting it.
- */
-int ext4_orphan_add(handle_t *handle, struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_iloc iloc;
- int err = 0, rc;
- bool dirty = false;
-
- if (!sbi->s_journal || is_bad_inode(inode))
- return 0;
-
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !inode_is_locked(inode));
- /*
- * Exit early if inode already is on orphan list. This is a big speedup
- * since we don't have to contend on the global s_orphan_lock.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan))
- return 0;
-
- /*
- * Orphan handling is only valid for files with data blocks
- * being truncated, or files being unlinked. Note that we either
- * hold i_mutex, or the inode can not be referenced from outside,
- * so i_nlink should not be bumped due to race
- */
- J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
-
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
- goto out;
-
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out;
-
- mutex_lock(&sbi->s_orphan_lock);
- /*
- * Due to previous errors inode may be already a part of on-disk
- * orphan list. If so skip on-disk list modification.
- */
- if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
- (le32_to_cpu(sbi->s_es->s_inodes_count))) {
- /* Insert this inode at the head of the on-disk orphan list */
- NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
- sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- dirty = true;
- }
- list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
- mutex_unlock(&sbi->s_orphan_lock);
-
- if (dirty) {
- err = ext4_handle_dirty_super(handle, sb);
- rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
- if (err) {
- /*
- * We have to remove inode from in-memory list if
- * addition to on disk orphan list failed. Stray orphan
- * list entries can cause panics at unmount time.
- */
- mutex_lock(&sbi->s_orphan_lock);
- list_del_init(&EXT4_I(inode)->i_orphan);
- mutex_unlock(&sbi->s_orphan_lock);
- }
- } else
- brelse(iloc.bh);
-
- jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
- jbd_debug(4, "orphan inode %lu will point to %d\n",
- inode->i_ino, NEXT_ORPHAN(inode));
-out:
- ext4_std_error(sb, err);
- return err;
-}
-
-/*
- * ext4_orphan_del() removes an unlinked or truncated inode from the list
- * of such inodes stored on disk, because it is finally being cleaned up.
- */
-int ext4_orphan_del(handle_t *handle, struct inode *inode)
-{
- struct list_head *prev;
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- __u32 ino_next;
- struct ext4_iloc iloc;
- int err = 0;
-
- if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
- return 0;
-
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !inode_is_locked(inode));
- /* Do this quick check before taking global s_orphan_lock. */
- if (list_empty(&ei->i_orphan))
- return 0;
-
- if (handle) {
- /* Grab inode buffer early before taking global s_orphan_lock */
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- }
-
- mutex_lock(&sbi->s_orphan_lock);
- jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
-
- prev = ei->i_orphan.prev;
- list_del_init(&ei->i_orphan);
-
- /* If we're on an error path, we may not have a valid
- * transaction handle with which to update the orphan list on
- * disk, but we still need to remove the inode from the linked
- * list in memory. */
- if (!handle || err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_err;
- }
-
- ino_next = NEXT_ORPHAN(inode);
- if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %u\n", ino_next);
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_brelse;
- }
- sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- mutex_unlock(&sbi->s_orphan_lock);
- err = ext4_handle_dirty_super(handle, inode->i_sb);
- } else {
- struct ext4_iloc iloc2;
- struct inode *i_prev =
- &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
-
- jbd_debug(4, "orphan inode %lu will point to %u\n",
- i_prev->i_ino, ino_next);
- err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_brelse;
- }
- NEXT_ORPHAN(i_prev) = ino_next;
- err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
- mutex_unlock(&sbi->s_orphan_lock);
- }
- if (err)
- goto out_brelse;
- NEXT_ORPHAN(inode) = 0;
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-out_err:
- ext4_std_error(inode->i_sb, err);
- return err;
-
-out_brelse:
- brelse(iloc.bh);
- goto out_err;
-}
-
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
int retval;
struct inode *inode;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_2 *de = NULL;
handle_t *handle = NULL;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
- return -EIO;
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
@@ -3151,24 +3190,24 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
goto end_rmdir;
- ext4_dec_count(handle, dir);
+ ext4_dec_count(dir);
ext4_update_dx_flag(dir);
+ ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
-#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
end_rmdir:
brelse(bh);
@@ -3177,89 +3216,142 @@ end_rmdir:
return retval;
}
-static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode,
+ struct dentry *dentry /* NULL during fast_commit recovery */)
{
- int retval;
- struct inode *inode;
+ int retval = -ENOENT;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de;
- handle_t *handle = NULL;
-
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
- return -EIO;
-
- trace_ext4_unlink_enter(dir, dentry);
- /* Initialize quotas before so that eventual writes go
- * in separate transaction */
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
- retval = dquot_initialize(d_inode(dentry));
- if (retval)
- return retval;
+ struct ext4_dir_entry_2 *de = NULL;
+ handle_t *handle;
+ int skip_remove_dentry = 0;
- retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ /*
+ * Keep this outside the transaction; it may have to set up the
+ * directory's encryption key, which isn't GFP_NOFS-safe.
+ */
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
- if (!bh)
- goto end_unlink;
- inode = d_inode(dentry);
+ if (!bh)
+ return -ENOENT;
- retval = -EFSCORRUPTED;
- if (le32_to_cpu(de->inode) != inode->i_ino)
- goto end_unlink;
+ if (le32_to_cpu(de->inode) != inode->i_ino) {
+ /*
+ * It's okay if we find dont find dentry which matches
+ * the inode. That's because it might have gotten
+ * renamed to a different inode number
+ */
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ skip_remove_dentry = 1;
+ else
+ goto out_bh;
+ }
handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- handle = NULL;
- goto end_unlink;
+ goto out_bh;
}
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- retval = ext4_delete_entry(handle, dir, de, bh);
- if (retval)
- goto end_unlink;
- dir->i_ctime = dir->i_mtime = current_time(dir);
- ext4_update_dx_flag(dir);
- retval = ext4_mark_inode_dirty(handle, dir);
- if (retval)
- goto end_unlink;
+ if (!skip_remove_dentry) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto out_handle;
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+ ext4_update_dx_flag(dir);
+ retval = ext4_mark_inode_dirty(handle, dir);
+ if (retval)
+ goto out_handle;
+ } else {
+ retval = 0;
+ }
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
+ d_name->len, d_name->name);
else
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
+ if (dentry && !retval)
+ ext4_fc_track_unlink(handle, dentry);
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
+ brelse(bh);
+ return retval;
+}
+
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int retval;
+
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
+
+ trace_ext4_unlink_enter(dir, dentry);
+ /*
+ * Initialize quotas before so that eventual writes go
+ * in separate transaction
+ */
+ retval = dquot_initialize(dir);
+ if (retval)
+ goto out_trace;
+ retval = dquot_initialize(d_inode(dentry));
+ if (retval)
+ goto out_trace;
+
+ retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
-#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
-end_unlink:
- brelse(bh);
- if (handle)
- ext4_journal_stop(handle);
+out_trace:
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
-static int ext4_symlink(struct inode *dir,
+static int ext4_init_symlink_block(handle_t *handle, struct inode *inode,
+ struct fscrypt_str *disk_link)
+{
+ struct buffer_head *bh;
+ char *kaddr;
+ int err = 0;
+
+ bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ kaddr = (char *)bh->b_data;
+ memcpy(kaddr, disk_link->name, disk_link->len);
+ inode->i_size = disk_link->len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+out:
+ brelse(bh);
+ return err;
+}
+
+static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
handle_t *handle;
@@ -3267,9 +3359,11 @@ static int ext4_symlink(struct inode *dir,
int err, len = strlen(symname);
int credits;
struct fscrypt_str disk_link;
+ int retries = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
- return -EIO;
+ err = ext4_emergency_state(dir->i_sb);
+ if (unlikely(err))
+ return err;
err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
&disk_link);
@@ -3280,34 +3374,24 @@ static int ext4_symlink(struct inode *dir,
if (err)
return err;
- if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- /*
- * For non-fast symlinks, we just allocate inode and put it on
- * orphan list in the first transaction => we need bitmap,
- * group descriptor, sb, inode block, quota blocks, and
- * possibly selinux xattr blocks.
- */
- credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
- EXT4_XATTR_TRANS_BLOCKS;
- } else {
- /*
- * Fast symlink. We have to add entry to directory
- * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
- * allocate new inode (bitmap, group descriptor, inode block,
- * quota blocks, sb is already counted in previous macros).
- */
- credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
- }
-
- inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+ /*
+ * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the
+ * directory. +3 for inode, inode bitmap, group descriptor allocation.
+ * EXT4_DATA_TRANS_BLOCKS for the data block allocation and
+ * modification.
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
+retry:
+ inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO,
&dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
if (IS_ERR(inode)) {
if (handle)
ext4_journal_stop(handle);
- return PTR_ERR(inode);
+ err = PTR_ERR(inode);
+ goto out_retry;
}
if (IS_ENCRYPTED(inode)) {
@@ -3315,102 +3399,56 @@ static int ext4_symlink(struct inode *dir,
if (err)
goto err_drop_inode;
inode->i_op = &ext4_encrypted_symlink_inode_operations;
+ } else {
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
+ inode->i_op = &ext4_symlink_inode_operations;
+ } else {
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ }
}
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- if (!IS_ENCRYPTED(inode))
- inode->i_op = &ext4_symlink_inode_operations;
- inode_nohighmem(inode);
- ext4_set_aops(inode);
- /*
- * We cannot call page_symlink() with transaction started
- * because it calls into ext4_write_begin() which can wait
- * for transaction commit if we are running out of space
- * and thus we deadlock. So we have to stop transaction now
- * and restart it when symlink contents is written.
- *
- * To keep fs consistent in case of crash, we have to put inode
- * to orphan list in the mean time.
- */
- drop_nlink(inode);
- err = ext4_orphan_add(handle, inode);
- ext4_journal_stop(handle);
- handle = NULL;
- if (err)
- goto err_drop_inode;
- err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
- if (err)
- goto err_drop_inode;
- /*
- * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
- * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
- */
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- handle = NULL;
- goto err_drop_inode;
- }
- set_nlink(inode, 1);
- err = ext4_orphan_del(handle, inode);
+ /* alloc symlink block and fill it */
+ err = ext4_init_symlink_block(handle, inode, &disk_link);
if (err)
goto err_drop_inode;
} else {
/* clear the extent format for fast symlink */
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
- if (!IS_ENCRYPTED(inode)) {
- inode->i_op = &ext4_fast_symlink_inode_operations;
- inode->i_link = (char *)&EXT4_I(inode)->i_data;
- }
memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
disk_link.len);
inode->i_size = disk_link.len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ if (!IS_ENCRYPTED(inode))
+ inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data,
+ inode->i_size);
}
- EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
- if (inode)
- iput(inode);
- goto out_free_encrypted_link;
+ iput(inode);
+ goto out_retry;
err_drop_inode:
- if (handle)
- ext4_journal_stop(handle);
clear_nlink(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
+ if (handle)
+ ext4_journal_stop(handle);
iput(inode);
-out_free_encrypted_link:
+out_retry:
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
if (disk_link.name != (unsigned char *)symname)
kfree(disk_link.name);
return err;
}
-static int ext4_link(struct dentry *old_dentry,
- struct inode *dir, struct dentry *dentry)
+int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
{
handle_t *handle;
- struct inode *inode = d_inode(old_dentry);
int err, retries = 0;
-
- if (inode->i_nlink >= EXT4_LINK_MAX)
- return -EMLINK;
-
- err = fscrypt_prepare_link(old_dentry, dir, dentry);
- if (err)
- return err;
-
- if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
- (!projid_eq(EXT4_I(dir)->i_projid,
- EXT4_I(old_dentry->d_inode)->i_projid)))
- return -EXDEV;
-
- err = dquot_initialize(dir);
- if (err)
- return err;
-
retry:
handle = ext4_journal_start(dir, EXT4_HT_DIR,
(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
@@ -3421,8 +3459,8 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = current_time(inode);
- ext4_inc_count(handle, inode);
+ inode_set_ctime_current(inode);
+ ext4_inc_count(inode);
ihold(inode);
err = ext4_add_entry(handle, dentry, inode);
@@ -3434,6 +3472,7 @@ retry:
if (inode->i_nlink == 1)
ext4_orphan_del(handle, inode);
d_instantiate(dentry, inode);
+ ext4_fc_track_link(handle, dentry);
} else {
drop_nlink(inode);
iput(inode);
@@ -3444,6 +3483,29 @@ retry:
return err;
}
+static int ext4_link(struct dentry *old_dentry,
+ struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(old_dentry);
+ int err;
+
+ if (inode->i_nlink >= EXT4_LINK_MAX)
+ return -EMLINK;
+
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
+
+ if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
+ (!projid_eq(EXT4_I(dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+ return __ext4_link(dir, inode, dentry);
+}
/*
* Try to find buffer head where contains the parent block.
@@ -3459,17 +3521,39 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
- /* The first directory block must not be a hole, so
- * treat it as DIRENT_HTREE
- */
- bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+ struct ext4_dir_entry_2 *de;
+ unsigned int offset;
+
+ bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh)) {
*retval = PTR_ERR(bh);
return NULL;
}
- *parent_de = ext4_next_entry(
- (struct ext4_dir_entry_2 *)bh->b_data,
- inode->i_sb->s_blocksize);
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino ||
+ de->name_len != 1 || de->name[0] != '.') {
+ EXT4_ERROR_INODE(inode, "directory missing '.'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len,
+ inode->i_sb->s_blocksize);
+ de = ext4_next_entry(de, inode->i_sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, offset) ||
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
+ EXT4_ERROR_INODE(inode, "directory missing '..'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ *parent_de = de;
+
return bh;
}
@@ -3495,10 +3579,14 @@ struct ext4_renament {
int dir_inlined;
};
-static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross)
{
int retval;
+ ent->is_dir = true;
+ if (!is_cross)
+ return 0;
+
ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
&retval, &ent->parent_de,
&ent->dir_inlined);
@@ -3507,7 +3595,8 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
return -EFSCORRUPTED;
BUFFER_TRACE(ent->dir_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, ent->dir_bh);
+ return ext4_journal_get_write_access(handle, ent->dir->i_sb,
+ ent->dir_bh, EXT4_JTR_NONE);
}
static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
@@ -3515,6 +3604,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
{
int retval;
+ if (!ent->dir_bh)
+ return 0;
+
ent->parent_de->inode = cpu_to_le32(dir_ino);
BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
if (!ent->dir_inlined) {
@@ -3542,15 +3634,15 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
int retval, retval2;
BUFFER_TRACE(ent->bh, "get write access");
- retval = ext4_journal_get_write_access(handle, ent->bh);
+ retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh,
+ EXT4_JTR_NONE);
if (retval)
return retval;
ent->de->inode = cpu_to_le32(ino);
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_ctime = ent->dir->i_mtime =
- current_time(ent->dir);
+ inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir));
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3560,18 +3652,41 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
return retval2;
}
}
- brelse(ent->bh);
- ent->bh = NULL;
-
return retval;
}
+static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
+ unsigned ino, unsigned file_type)
+{
+ struct ext4_renament old = *ent;
+ int retval = 0;
+
+ /*
+ * old->de could have moved from under us during make indexed dir,
+ * so the old->de may no longer valid and need to find it again
+ * before reset old inode info.
+ */
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
+ &old.inlined);
+ if (IS_ERR(old.bh))
+ retval = PTR_ERR(old.bh);
+ if (!old.bh)
+ retval = -ENOENT;
+ if (retval) {
+ ext4_std_error(old.dir->i_sb, retval);
+ return;
+ }
+
+ ext4_setent(handle, &old, ino, file_type);
+ brelse(old.bh);
+}
+
static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
const struct qstr *d_name)
{
int retval = -ENOENT;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_2 *de = NULL;
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
@@ -3619,14 +3734,15 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
{
if (ent->dir_nlink_delta) {
if (ent->dir_nlink_delta == -1)
- ext4_dec_count(handle, ent->dir);
+ ext4_dec_count(ent->dir);
else
- ext4_inc_count(handle, ent->dir);
+ ext4_inc_count(ent->dir);
ext4_mark_inode_dirty(handle, ent->dir);
}
}
-static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
+static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap,
+ struct ext4_renament *ent,
int credits, handle_t **h)
{
struct inode *wh;
@@ -3640,7 +3756,8 @@ static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
EXT4_XATTR_TRANS_BLOCKS + 4);
retry:
- wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
+ wh = ext4_new_inode_start_handle(idmap, ent->dir,
+ S_IFCHR | WHITEOUT_MODE,
&ent->dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
@@ -3667,9 +3784,9 @@ retry:
* while new_{dentry,inode) refers to the destination dentry/inode
* This comes from rename(const char *oldpath, const char *newpath)
*/
-static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
handle_t *handle = NULL;
struct ext4_renament old = {
@@ -3702,6 +3819,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = dquot_initialize(old.dir);
if (retval)
return retval;
+ retval = dquot_initialize(old.inode);
+ if (retval)
+ return retval;
retval = dquot_initialize(new.dir);
if (retval)
return retval;
@@ -3714,9 +3834,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
return retval;
}
- old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
+ &old.inlined);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
+
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -3725,14 +3847,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
retval = -ENOENT;
if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
- goto end_rename;
+ goto release_bh;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
- goto end_rename;
+ goto release_bh;
}
if (new.bh) {
if (!new.inode) {
@@ -3749,18 +3871,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- handle = NULL;
- goto end_rename;
+ goto release_bh;
}
} else {
- whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
+ whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle);
if (IS_ERR(whiteout)) {
retval = PTR_ERR(whiteout);
- whiteout = NULL;
- goto end_rename;
+ goto release_bh;
}
}
+ old_file_type = old.de->file_type;
if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
ext4_handle_sync(handle);
@@ -3774,7 +3895,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
@@ -3788,7 +3909,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
force_reread = (new.dir->i_ino == old.dir->i_ino &&
ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
- old_file_type = old.de->file_type;
if (whiteout) {
/*
* Do this before adding a new entry, so the old entry is sure
@@ -3801,6 +3921,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = ext4_mark_inode_dirty(handle, whiteout);
if (unlikely(retval))
goto end_rename;
+
}
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
@@ -3820,7 +3941,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = current_time(old.inode);
+ inode_set_ctime_current(old.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
@@ -3833,24 +3954,24 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (new.inode) {
- ext4_dec_count(handle, new.inode);
- new.inode->i_ctime = current_time(new.inode);
+ ext4_dec_count(new.inode);
+ inode_set_ctime_current(new.inode);
}
- old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
+ inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
ext4_update_dx_flag(old.dir);
- if (old.dir_bh) {
+ if (old.is_dir) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
goto end_rename;
- ext4_dec_count(handle, old.dir);
+ ext4_dec_count(old.dir);
if (new.inode) {
/* checked ext4_empty_dir above, can't have another
* parent, ext4_dec_count() won't work for many-linked
* dirs */
clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new.dir);
+ ext4_inc_count(new.dir);
ext4_update_dx_flag(new.dir);
retval = ext4_mark_inode_dirty(handle, new.dir);
if (unlikely(retval))
@@ -3860,6 +3981,31 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = ext4_mark_inode_dirty(handle, old.dir);
if (unlikely(retval))
goto end_rename;
+
+ if (old.is_dir) {
+ /*
+ * We disable fast commits here that's because the
+ * replay code is not yet capable of changing dot dot
+ * dirents in directories.
+ */
+ ext4_fc_mark_ineligible(old.inode->i_sb,
+ EXT4_FC_REASON_RENAME_DIR, handle);
+ } else {
+ struct super_block *sb = old.inode->i_sb;
+
+ if (new.inode)
+ ext4_fc_track_unlink(handle, new.dentry);
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) {
+ __ext4_fc_track_link(handle, old.inode, new.dentry);
+ __ext4_fc_track_unlink(handle, old.inode, old.dentry);
+ if (whiteout)
+ __ext4_fc_track_create(handle, whiteout,
+ old.dentry);
+ }
+ }
+
if (new.inode) {
retval = ext4_mark_inode_dirty(handle, new.inode);
if (unlikely(retval))
@@ -3870,17 +4016,25 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = 0;
end_rename:
- brelse(old.dir_bh);
- brelse(old.bh);
- brelse(new.bh);
if (whiteout) {
- if (retval)
+ if (retval) {
+ ext4_resetent(handle, &old,
+ old.inode->i_ino, old_file_type);
drop_nlink(whiteout);
+ ext4_mark_inode_dirty(handle, whiteout);
+ ext4_orphan_add(handle, whiteout);
+ }
unlock_new_inode(whiteout);
+ ext4_journal_stop(handle);
iput(whiteout);
- }
- if (handle)
+ } else {
ext4_journal_stop(handle);
+ }
+release_bh:
+ brelse(old.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
+
return retval;
}
@@ -3900,7 +4054,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
};
u8 new_file_type;
int retval;
- struct timespec64 ctime;
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
!projid_eq(EXT4_I(new_dir)->i_projid,
@@ -3956,14 +4109,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_handle_sync(handle);
if (S_ISDIR(old.inode->i_mode)) {
- old.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
if (S_ISDIR(new.inode->i_mode)) {
- new.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &new);
+ retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir);
if (retval)
goto end_rename;
}
@@ -3994,16 +4145,16 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- ctime = current_time(old.inode);
- old.inode->i_ctime = ctime;
- new.inode->i_ctime = ctime;
+ inode_set_ctime_current(old.inode);
+ inode_set_ctime_current(new.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
retval = ext4_mark_inode_dirty(handle, new.inode);
if (unlikely(retval))
goto end_rename;
-
+ ext4_fc_mark_ineligible(new.inode->i_sb,
+ EXT4_FC_REASON_CROSS_RENAME, handle);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
@@ -4028,14 +4179,16 @@ end_rename:
return retval;
}
-static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int ext4_rename2(struct mnt_idmap *idmap,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
int err;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb))))
- return -EIO;
+ err = ext4_emergency_state(old_dir->i_sb);
+ if (unlikely(err))
+ return err;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
@@ -4050,7 +4203,7 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
new_dir, new_dentry);
}
- return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags);
}
/*
@@ -4070,15 +4223,17 @@ const struct inode_operations ext4_dir_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
+ .fileattr_get = ext4_fileattr_get,
+ .fileattr_set = ext4_fileattr_set,
};
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
new file mode 100644
index 000000000000..c9b93b670b0f
--- /dev/null
+++ b/fs/ext4/orphan.c
@@ -0,0 +1,661 @@
+/*
+ * Ext4 orphan inode handling
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+#define EXT4_MAX_ORPHAN_FILE_BLOCKS 512
+
+static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
+{
+ int i, j, start;
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ int ret = 0;
+ bool found = false;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int looped = 0;
+
+ /*
+ * Find block with free orphan entry. Use CPU number for a naive hash
+ * for a search start in the orphan file
+ */
+ start = raw_smp_processor_id()*13 % oi->of_blocks;
+ i = start;
+ do {
+ if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
+ >= 0) {
+ found = true;
+ break;
+ }
+ if (++i >= oi->of_blocks)
+ i = 0;
+ } while (i != start);
+
+ if (!found) {
+ /*
+ * For now we don't grow or shrink orphan file. We just use
+ * whatever was allocated at mke2fs time. The additional
+ * credits we would have to reserve for each orphan inode
+ * operation just don't seem worth it.
+ */
+ return -ENOSPC;
+ }
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return ret;
+ }
+
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ /* Find empty slot in a block */
+ j = 0;
+ do {
+ if (looped) {
+ /*
+ * Did we walk through the block several times without
+ * finding free entry? It is theoretically possible
+ * if entries get constantly allocated and freed or
+ * if the block is corrupted. Avoid indefinite looping
+ * and bail. We'll use orphan list instead.
+ */
+ if (looped > 3) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return -ENOSPC;
+ }
+ cond_resched();
+ }
+ while (bdata[j]) {
+ if (++j >= inodes_per_ob) {
+ j = 0;
+ looped++;
+ }
+ }
+ } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) !=
+ (__le32)0);
+
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+
+ return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
+}
+
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_rwsem unless
+ * we are just creating the inode or deleting it.
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_iloc iloc;
+ int err = 0, rc;
+ bool dirty = false;
+
+ if (!sbi->s_journal || is_bad_inode(inode))
+ return 0;
+
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ if (ext4_inode_orphan_tracked(inode))
+ return 0;
+
+ /*
+ * Orphan handling is only valid for files with data blocks
+ * being truncated, or files being unlinked. Note that we either
+ * hold i_rwsem, or the inode can not be referenced from outside,
+ * so i_nlink should not be bumped due to race
+ */
+ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+ if (sbi->s_orphan_info.of_blocks) {
+ err = ext4_orphan_file_add(handle, inode);
+ /*
+ * Fallback to normal orphan list of orphan file is
+ * out of space
+ */
+ if (err != -ENOSPC)
+ return err;
+ }
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out;
+
+ mutex_lock(&sbi->s_orphan_lock);
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del_init(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ } else
+ brelse(iloc.bh);
+
+ ext4_debug("superblock will point to %lu\n", inode->i_ino);
+ ext4_debug("orphan inode %lu will point to %d\n",
+ inode->i_ino, NEXT_ORPHAN(inode));
+out:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ __le32 *bdata;
+ int blk, off;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int ret = 0;
+
+ if (!handle)
+ goto out;
+ blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob;
+ off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob;
+ if (WARN_ON_ONCE(blk >= oi->of_blocks))
+ goto out;
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret)
+ goto out;
+
+ bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
+ bdata[off] = 0;
+ atomic_inc(&oi->of_binfo[blk].ob_free_entries);
+ ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
+out:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan);
+
+ return ret;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+ struct list_head *prev;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 ino_next;
+ struct ext4_iloc iloc;
+ int err = 0;
+
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
+ return 0;
+
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
+ return ext4_orphan_file_del(handle, inode);
+
+ /* Do this quick check before taking global s_orphan_lock. */
+ if (list_empty(&ei->i_orphan))
+ return 0;
+
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+
+ mutex_lock(&sbi->s_orphan_lock);
+ ext4_debug("remove inode %lu from orphan list\n", inode->i_ino);
+
+ prev = ei->i_orphan.prev;
+ list_del_init(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+ * disk, but we still need to remove the inode from the linked
+ * list in memory. */
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_err;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+ if (prev == &sbi->s_orphan) {
+ ext4_debug("superblock will point to %u\n", ino_next);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ sbi->s_sbh, EXT4_JTR_NONE);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ ext4_superblock_csum_set(inode->i_sb);
+ unlock_buffer(sbi->s_sbh);
+ mutex_unlock(&sbi->s_orphan_lock);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ } else {
+ struct ext4_iloc iloc2;
+ struct inode *i_prev =
+ &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+ ext4_debug("orphan inode %lu will point to %u\n",
+ i_prev->i_ino, ino_next);
+ err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ NEXT_ORPHAN(i_prev) = ino_next;
+ err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(inode) = 0;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out_err:
+ ext4_std_error(inode->i_sb, err);
+ return err;
+
+out_brelse:
+ brelse(iloc.bh);
+ goto out_err;
+}
+
+#ifdef CONFIG_QUOTA
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+ return dquot_quota_on_mount(sb,
+ rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type],
+ lockdep_is_held(&sb->s_umount)),
+ EXT4_SB(sb)->s_jquota_fmt, type);
+}
+#endif
+
+static void ext4_process_orphan(struct inode *inode,
+ int *nr_truncates, int *nr_orphans)
+{
+ struct super_block *sb = inode->i_sb;
+ int ret;
+
+ dquot_initialize(inode);
+ if (inode->i_nlink) {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
+ ext4_debug("truncating inode %lu to %lld bytes\n",
+ inode->i_ino, inode->i_size);
+ inode_lock(inode);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ret = ext4_truncate(inode);
+ if (ret) {
+ /*
+ * We need to clean up the in-core orphan list
+ * manually if ext4_truncate() failed to get a
+ * transaction handle.
+ */
+ ext4_orphan_del(NULL, inode);
+ ext4_std_error(inode->i_sb, ret);
+ }
+ inode_unlock(inode);
+ (*nr_truncates)++;
+ } else {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
+ ext4_debug("deleting unreferenced inode %lu\n",
+ inode->i_ino);
+ (*nr_orphans)++;
+ }
+ iput(inode); /* The delete magic happens here! */
+}
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash. We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
+{
+ unsigned int s_flags = sb->s_flags;
+ int nr_orphans = 0, nr_truncates = 0;
+ struct inode *inode;
+ int i, j;
+#ifdef CONFIG_QUOTA
+ int quota_update = 0;
+#endif
+ __le32 *bdata;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!es->s_last_orphan && !oi->of_blocks) {
+ ext4_debug("no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (bdev_read_only(sb->s_bdev)) {
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, skipping orphan cleanup");
+ return;
+ }
+
+ /* Check if feature set would not allow a r/w mount */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
+ ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
+ "clearing orphan list.");
+ es->s_last_orphan = 0;
+ }
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
+ return;
+ }
+
+ if (s_flags & SB_RDONLY) {
+ ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
+ sb->s_flags &= ~SB_RDONLY;
+ }
+#ifdef CONFIG_QUOTA
+ /*
+ * Turn on quotas which were not enabled for read-only mounts if
+ * filesystem has quota feature, so that they are updated correctly.
+ */
+ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
+ int ret = ext4_enable_quotas(sb);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on quotas: error %d", ret);
+ }
+
+ /* Turn on journaled quotas used for old sytle */
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (EXT4_SB(sb)->s_qf_names[i]) {
+ int ret = ext4_quota_on_mount(sb, i);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: type %d: error %d", i, ret);
+ }
+ }
+#endif
+
+ while (es->s_last_orphan) {
+ /*
+ * We may have encountered an error during cleanup; if
+ * so, skip the rest.
+ */
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+ if (IS_ERR(inode)) {
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+
+ for (i = 0; i < oi->of_blocks; i++) {
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ for (j = 0; j < inodes_per_ob; j++) {
+ if (!bdata[j])
+ continue;
+ inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
+ if (IS_ERR(inode))
+ continue;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+ }
+
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
+
+ if (nr_orphans)
+ ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
+ if (nr_truncates)
+ ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+ /* Turn off quotas if they were enabled for orphan cleanup */
+ if (quota_update) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (sb_dqopt(sb)->files[i])
+ dquot_quota_off(sb, i);
+ }
+ }
+#endif
+ sb->s_flags = s_flags; /* Restore SB_RDONLY status */
+}
+
+void ext4_release_orphan_info(struct super_block *sb)
+{
+ int i;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+
+ if (!oi->of_blocks)
+ return;
+ for (i = 0; i < oi->of_blocks; i++)
+ brelse(oi->of_binfo[i].ob_bh);
+ kvfree(oi->of_binfo);
+}
+
+static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
+ struct super_block *sb,
+ struct buffer_head *bh)
+{
+ return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize -
+ sizeof(struct ext4_orphan_block_tail));
+}
+
+static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ __u32 calculated;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ if (!ext4_has_feature_metadata_csum(sb))
+ return 1;
+
+ ot = ext4_orphan_block_tail(sb, bh);
+ calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ calculated = ext4_chksum(calculated, (__u8 *)bh->b_data,
+ inodes_per_ob * sizeof(__u32));
+ return le32_to_cpu(ot->ob_checksum) == calculated;
+}
+
+/* This gets called only when checksumming is enabled */
+void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
+ __u32 csum;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32));
+ ot = ext4_orphan_block_tail(sb, bh);
+ ot->ob_checksum = cpu_to_le32(csum);
+}
+
+int ext4_init_orphan_info(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct inode *inode;
+ int i, j;
+ int ret;
+ int free;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_block_tail *ot;
+ ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 0;
+
+ inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL);
+ if (IS_ERR(inode)) {
+ ext4_msg(sb, KERN_ERR, "get orphan inode failed");
+ return PTR_ERR(inode);
+ }
+ /*
+ * This is just an artificial limit to prevent corrupted fs from
+ * consuming absurd amounts of memory when pinning blocks of orphan
+ * file in memory.
+ */
+ if (inode->i_size > (EXT4_MAX_ORPHAN_FILE_BLOCKS << inode->i_blkbits)) {
+ ext4_msg(sb, KERN_ERR, "orphan file too big: %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto out_put;
+ }
+ oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
+ oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
+ oi->of_binfo = kvmalloc_array(oi->of_blocks,
+ sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
+ if (!oi->of_binfo) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ for (i = 0; i < oi->of_blocks; i++) {
+ oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
+ if (IS_ERR(oi->of_binfo[i].ob_bh)) {
+ ret = PTR_ERR(oi->of_binfo[i].ob_bh);
+ goto out_free;
+ }
+ if (!oi->of_binfo[i].ob_bh) {
+ ret = -EIO;
+ goto out_free;
+ }
+ ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
+ if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
+ ext4_error(sb, "orphan file block %d: bad magic", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ if (!ext4_orphan_file_block_csum_verify(sb,
+ oi->of_binfo[i].ob_bh)) {
+ ext4_error(sb, "orphan file block %d: bad checksum", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ free = 0;
+ for (j = 0; j < inodes_per_ob; j++)
+ if (bdata[j] == 0)
+ free++;
+ atomic_set(&oi->of_binfo[i].ob_free_entries, free);
+ }
+ iput(inode);
+ return 0;
+out_free:
+ for (i--; i >= 0; i--)
+ brelse(oi->of_binfo[i].ob_bh);
+ kvfree(oi->of_binfo);
+out_put:
+ iput(inode);
+ return ret;
+}
+
+int ext4_orphan_file_empty(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int i;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 1;
+ for (i = 0; i < oi->of_blocks; i++)
+ if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=
+ inodes_per_ob)
+ return 0;
+ return 1;
+}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index de6fe969f773..39abfeec5f36 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -24,7 +24,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/backing-dev.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -99,33 +99,29 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_finish_bio(struct bio *bio)
{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct page *bounce_page = NULL;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ struct folio *io_folio = NULL;
struct buffer_head *bh, *head;
- unsigned bio_start = bvec->bv_offset;
- unsigned bio_end = bio_start + bvec->bv_len;
+ size_t bio_start = fi.offset;
+ size_t bio_end = bio_start + fi.length;
unsigned under_io = 0;
unsigned long flags;
- if (!page)
- continue;
-
- if (fscrypt_is_bounce_page(page)) {
- bounce_page = page;
- page = fscrypt_pagecache_page(bounce_page);
+ if (fscrypt_is_bounce_folio(folio)) {
+ io_folio = folio;
+ folio = fscrypt_pagecache_folio(folio);
}
if (bio->bi_status) {
- SetPageError(page);
- mapping_set_error(page->mapping, -EIO);
+ int err = blk_status_to_errno(bio->bi_status);
+ mapping_set_error(folio->mapping, err);
}
- bh = head = page_buffers(page);
+ bh = head = folio_buffers(folio);
/*
- * We check all buffers in the page under b_uptodate_lock
+ * We check all buffers in the folio under b_uptodate_lock
* to avoid races with other end io clearing async_write flags
*/
spin_lock_irqsave(&head->b_uptodate_lock, flags);
@@ -137,13 +133,15 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
- fscrypt_free_bounce_page(bounce_page);
- end_page_writeback(page);
+ fscrypt_free_bounce_page(&io_folio->page);
+ folio_end_writeback(folio);
}
}
}
@@ -166,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
}
/*
- * Check a range of space and convert unwritten extents to written. Note that
+ * On successful IO, check a range of space and convert unwritten extents to
+ * written. On IO failure, check if journal abort is needed. Note that
* we are protected from truncate touching same part of extent tree by the
* fact that truncate code waits for all DIO to finish (thus exclusion from
* direct IO is achieved) and also waits for PageWriteback bits. Thus we
@@ -177,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
handle_t *handle = io_end->handle;
+ struct super_block *sb = inode->i_sb;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io_end->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
- if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
- ext4_msg(inode->i_sb, KERN_EMERG,
+ /*
+ * Do not convert the unwritten extents if data writeback fails,
+ * or stale data may be exposed.
+ */
+ io_end->handle = NULL; /* Following call will use up the handle */
+ if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
+ ret = -EIO;
+ if (handle)
+ jbd2_journal_free_reserved(handle);
+
+ if (test_opt(sb, DATA_ERR_ABORT))
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
+ } else {
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+ }
+ if (ret < 0 && !ext4_emergency_state(sb) &&
+ io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ ext4_msg(sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
"(inode %lu, error %d)", inode->i_ino, ret);
}
+
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
return ret;
@@ -219,6 +234,18 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
#endif
}
+static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
+{
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !list_empty(&io_end->list_vec))
+ return true;
+ if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
+ io_end->flag & EXT4_IO_END_FAILED &&
+ !ext4_emergency_state(io_end->inode->i_sb))
+ return true;
+ return false;
+}
+
/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
@@ -227,9 +254,12 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- /* Only reserved conversions from writeback should enter here */
- WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- WARN_ON(!io_end->handle && sbi->s_journal);
+ /* Only reserved conversions or pending IO errors will enter here. */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !io_end->handle && sbi->s_journal);
+ WARN_ON(!io_end->bio);
+
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
@@ -254,7 +284,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
while (!list_empty(&unwritten)) {
io_end = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
list_del_init(&io_end->list);
err = ext4_end_io_end(io_end);
@@ -265,7 +295,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
}
/*
- * work on completed IO, to convert unwritten extents to extents
+ * Used to convert unwritten extents to written extents upon IO completion,
+ * or used to abort the journal upon IO errors.
*/
void ext4_end_io_rsv_work(struct work_struct *work)
{
@@ -282,42 +313,35 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
io_end->inode = inode;
INIT_LIST_HEAD(&io_end->list);
INIT_LIST_HEAD(&io_end->list_vec);
- atomic_set(&io_end->count, 1);
+ refcount_set(&io_end->count, 1);
}
return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
- list_empty(&io_end->list_vec)) {
- ext4_release_io_end(io_end);
- return;
- }
- ext4_add_complete_io(io_end);
+ if (refcount_dec_and_test(&io_end->count)) {
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_add_complete_io(io_end);
+
+ ext4_release_io_end(io_end);
}
}
int ext4_put_io_end(ext4_io_end_t *io_end)
{
- int err = 0;
-
- if (atomic_dec_and_test(&io_end->count)) {
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_io_end_vec(io_end->handle,
- io_end);
- io_end->handle = NULL;
- ext4_clear_io_unwritten_flag(io_end);
- }
+ if (refcount_dec_and_test(&io_end->count)) {
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_end_io_end(io_end);
+
ext4_release_io_end(io_end);
}
- return err;
+ return 0;
}
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
- atomic_inc(&io_end->count);
+ refcount_inc(&io_end->count);
return io_end;
}
@@ -326,10 +350,9 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
- char b[BDEVNAME_SIZE];
- if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
- bio_devname(bio, b),
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
bio->bi_status)) {
@@ -347,11 +370,12 @@ static void ext4_end_bio(struct bio *bio)
bio->bi_status, inode->i_ino,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ io_end->flag |= EXT4_IO_END_FAILED;
mapping_set_error(inode->i_mapping,
blk_status_to_errno(bio->bi_status));
}
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (ext4_io_end_defer_completion(io_end)) {
/*
* Link bio into list hanging from io_end. We have to do it
* atomically as bio completions can be racing against each
@@ -375,10 +399,8 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
- int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0;
- io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
- bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
+ if (io->io_wbc->sync_mode == WB_SYNC_ALL)
+ io->io_bio->bi_opf |= REQ_SYNC;
submit_bio(io->io_bio);
}
io->io_bio = NULL;
@@ -401,9 +423,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
@@ -413,12 +435,12 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
static void io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
- struct page *page,
+ struct folio *folio,
+ struct folio *io_folio,
struct buffer_head *bh)
{
- int ret;
-
- if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+ if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
+ !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
ext4_io_submit(io);
}
@@ -426,55 +448,46 @@ submit_and_retry:
io_submit_init_bio(io, bh);
io->io_bio->bi_write_hint = inode->i_write_hint;
}
- ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
- if (ret != bh->b_size)
+ if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
goto submit_and_retry;
- wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
io->io_next_block++;
}
-int ext4_bio_write_page(struct ext4_io_submit *io,
- struct page *page,
- int len,
- struct writeback_control *wbc,
- bool keep_towrite)
+int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
+ size_t len)
{
- struct page *bounce_page = NULL;
- struct inode *inode = page->mapping->host;
+ struct folio *io_folio = folio;
+ struct inode *inode = folio->mapping->host;
unsigned block_start;
struct buffer_head *bh, *head;
int ret = 0;
- int nr_submitted = 0;
int nr_to_submit = 0;
+ struct writeback_control *wbc = io->io_wbc;
+ bool keep_towrite = false;
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- if (keep_towrite)
- set_page_writeback_keepwrite(page);
- else
- set_page_writeback(page);
- ClearPageError(page);
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
/*
- * Comments copied from block_write_full_page:
+ * Comments copied from block_write_full_folio:
*
- * The page straddles i_size. It must be zeroed out on each and every
+ * The folio straddles i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- if (len < PAGE_SIZE)
- zero_user_segment(page, len, PAGE_SIZE);
+ if (len < folio_size(folio))
+ folio_zero_segment(folio, len, folio_size(folio));
/*
* In the first loop we prepare and mark buffers to submit. We have to
- * mark all buffers in the page before submitting so that
- * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * mark all buffers in the folio before submitting so that
+ * folio_end_writeback() cannot be called from ext4_end_bio() when IO
* on the first buffer finishes and we are still working on submitting
* the second buffer.
*/
- bh = head = page_buffers(page);
+ bh = head = folio_buffers(folio);
do {
block_start = bh_offset(bh);
if (block_start >= len) {
@@ -487,17 +500,34 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/* A hole? We can safely clear the dirty bit */
if (!buffer_mapped(bh))
clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
+ /*
+ * Keeping dirty some buffer we cannot write? Make sure
+ * to redirty the folio and keep TOWRITE tag so that
+ * racing WB_SYNC_ALL writeback does not skip the folio.
+ * This happens e.g. when doing writeout for
+ * transaction commit or when journalled data is not
+ * yet committed.
+ */
+ if (buffer_dirty(bh) ||
+ (buffer_jbd(bh) && buffer_jbddirty(bh))) {
+ if (!folio_test_dirty(folio))
+ folio_redirty_for_writepage(wbc, folio);
+ keep_towrite = true;
+ }
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
set_buffer_async_write(bh);
+ clear_buffer_dirty(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
- bh = head = page_buffers(page);
+ /* Nothing to submit? Just unlock the folio... */
+ if (!nr_to_submit)
+ return 0;
+
+ bh = head = folio_buffers(folio);
/*
* If any blocks are being written to an encrypted file, encrypt them
@@ -506,9 +536,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* (e.g. holes) to be unnecessarily encrypted, but this is rare and
* can't happen in the common case of blocksize == PAGE_SIZE.
*/
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
+ struct page *bounce_page;
/*
* Since bounce page allocation uses a mempool, we can only use
@@ -516,47 +547,47 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* first page of the bio. Otherwise it can deadlock.
*/
if (io->io_bio)
- gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_flags = GFP_NOWAIT;
retry_encrypt:
- bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
- 0, gfp_flags);
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
+ enc_bytes, 0, gfp_flags);
if (IS_ERR(bounce_page)) {
ret = PTR_ERR(bounce_page);
if (ret == -ENOMEM &&
(io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
- gfp_flags = GFP_NOFS;
+ gfp_t new_gfp_flags = GFP_NOFS;
if (io->io_bio)
ext4_io_submit(io);
else
- gfp_flags |= __GFP_NOFAIL;
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ new_gfp_flags |= __GFP_NOFAIL;
+ memalloc_retry_wait(gfp_flags);
+ gfp_flags = new_gfp_flags;
goto retry_encrypt;
}
printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
do {
- clear_buffer_async_write(bh);
+ if (buffer_async_write(bh)) {
+ clear_buffer_async_write(bh);
+ set_buffer_dirty(bh);
+ }
bh = bh->b_this_page;
} while (bh != head);
- goto unlock;
+
+ return ret;
}
+ io_folio = page_folio(bounce_page);
}
+ __folio_start_writeback(folio, keep_towrite);
+
/* Now submit buffers to write */
do {
if (!buffer_async_write(bh))
continue;
- io_submit_add_bh(io, inode,
- bounce_page ? bounce_page : page, bh);
- nr_submitted++;
- clear_buffer_dirty(bh);
+ io_submit_add_bh(io, inode, folio, io_folio, bh);
} while ((bh = bh->b_this_page) != head);
-unlock:
- unlock_page(page);
- /* Nothing submitted - we have to end page writeback */
- if (!nr_submitted)
- end_page_writeback(page);
- return ret;
+ return 0;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5761e9961682..e7f2350c725b 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -43,7 +43,6 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/cleancache.h>
#include "ext4.h"
@@ -69,23 +68,10 @@ struct bio_post_read_ctx {
static void __read_end_io(struct bio *bio)
{
- struct page *page;
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bv, bio, iter_all) {
- page = bv->bv_page;
-
- /* PG_error was set if any post_read step failed */
- if (bio->bi_status || PageError(page)) {
- ClearPageUptodate(page);
- /* will re-read again later */
- ClearPageError(page);
- } else {
- SetPageUptodate(page);
- }
- unlock_page(page);
- }
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == 0);
if (bio->bi_private)
mempool_free(bio->bi_private, bio_post_read_ctx_pool);
bio_put(bio);
@@ -97,10 +83,12 @@ static void decrypt_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
- fscrypt_decrypt_bio(ctx->bio);
-
- bio_post_read_processing(ctx);
+ if (fscrypt_decrypt_bio(bio))
+ bio_post_read_processing(ctx);
+ else
+ __read_end_io(bio);
}
static void verity_work(struct work_struct *work)
@@ -110,7 +98,7 @@ static void verity_work(struct work_struct *work)
struct bio *bio = ctx->bio;
/*
- * fsverity_verify_bio() may call readpages() again, and although verity
+ * fsverity_verify_bio() may call readahead() again, and although verity
* will be disabled for that, decryption may still be needed, causing
* another bio_post_read_ctx to be allocated. So to guarantee that
* mempool_alloc() never deadlocks we must free the current ctx first.
@@ -140,7 +128,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
case STEP_VERITY:
if (ctx->enabled_steps & (1 << STEP_VERITY)) {
INIT_WORK(&ctx->work, verity_work);
@@ -148,7 +136,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
default:
__read_end_io(ctx->bio);
}
@@ -164,7 +152,7 @@ static bool bio_post_read_required(struct bio *bio)
*
* The mpage code never puts partial pages into a BIO (except for end-of-file).
* If a page does not map to a contiguous run of blocks then it simply falls
- * back to block_read_full_page().
+ * back to block_read_full_folio().
*
* Why is this? If a page's completion depends on a number of different BIOs
* which can complete in any order (or at the same time) then determining the
@@ -195,7 +183,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
{
unsigned int post_read_steps = 0;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (ext4_need_verity(inode, first_idx))
@@ -214,52 +202,55 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
static inline loff_t ext4_readpage_limit(struct inode *inode)
{
- if (IS_ENABLED(CONFIG_FS_VERITY) &&
- (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+ if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
return inode->i_sb->s_maxbytes;
return i_size_read(inode);
}
int ext4_mpage_readpages(struct inode *inode,
- struct readahead_control *rac, struct page *page)
+ struct readahead_control *rac, struct folio *folio)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
-
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ sector_t next_block;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
+ sector_t first_block;
unsigned page_block;
struct block_device *bdev = inode->i_sb->s_bdev;
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
- unsigned int nr_pages = rac ? readahead_count(rac) : 1;
+ unsigned int nr_pages, folio_pages;
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
- for (; nr_pages; nr_pages--) {
+ nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio);
+ for (; nr_pages; nr_pages -= folio_pages) {
int fully_mapped = 1;
- unsigned first_hole = blocks_per_page;
+ unsigned int first_hole;
+ unsigned int blocks_per_folio;
- if (rac) {
- page = readahead_page(rac);
- prefetchw(&page->flags);
- }
+ if (rac)
+ folio = readahead_folio(rac);
+
+ folio_pages = folio_nr_pages(folio);
+ prefetchw(&folio->flags);
- if (page_has_buffers(page))
+ if (folio_buffers(folio))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
- last_block = block_in_file + nr_pages * blocks_per_page;
+ blocks_per_folio = folio_size(folio) >> blkbits;
+ first_hole = blocks_per_folio;
+ block_in_file = next_block = EXT4_PG_TO_LBLK(inode, folio->index);
+ last_block = EXT4_PG_TO_LBLK(inode, folio->index + nr_pages);
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
@@ -275,16 +266,15 @@ int ext4_mpage_readpages(struct inode *inode,
unsigned map_offset = block_in_file - map.m_lblk;
unsigned last = map.m_len - map_offset;
+ first_block = map.m_pblk + map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
- blocks[page_block] = map.m_pblk + map_offset +
- relative_block;
page_block++;
block_in_file++;
}
@@ -292,73 +282,68 @@ int ext4_mpage_readpages(struct inode *inode,
/*
* Then do more ext4_map_blocks() calls until we are
- * done with this page.
+ * done with this folio.
*/
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
set_error_page:
- SetPageError(page);
- zero_user_segment(page, 0,
- PAGE_SIZE);
- unlock_page(page);
+ folio_zero_segment(folio, 0,
+ folio_size(folio));
+ folio_unlock(folio);
goto next_page;
}
}
if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
- if (page_block && blocks[page_block-1] != map.m_pblk-1)
+ if (!page_block)
+ first_block = map.m_pblk;
+ else if (first_block + page_block != map.m_pblk)
goto confused;
for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
- blocks[page_block] = map.m_pblk+relative_block;
page_block++;
block_in_file++;
}
}
- if (first_hole != blocks_per_page) {
- zero_user_segment(page, first_hole << blkbits,
- PAGE_SIZE);
+ if (first_hole != blocks_per_folio) {
+ folio_zero_segment(folio, first_hole << blkbits,
+ folio_size(folio));
if (first_hole == 0) {
- if (ext4_need_verity(inode, page->index) &&
- !fsverity_verify_page(page))
+ if (ext4_need_verity(inode, folio->index) &&
+ !fsverity_verify_folio(folio))
goto set_error_page;
- SetPageUptodate(page);
- unlock_page(page);
- goto next_page;
+ folio_end_read(folio, true);
+ continue;
}
} else if (fully_mapped) {
- SetPageMappedToDisk(page);
- }
- if (fully_mapped && blocks_per_page == 1 &&
- !PageUptodate(page) && cleancache_get_page(page) == 0) {
- SetPageUptodate(page);
- goto confused;
+ folio_set_mappedtodisk(folio);
}
/*
- * This page will go to BIO. Do we need to send this
+ * This folio will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ if (bio && (last_block_in_bio != first_block - 1 ||
+ !fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
bio = NULL;
@@ -368,40 +353,40 @@ int ext4_mpage_readpages(struct inode *inode,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES));
- ext4_set_bio_post_read_ctx(bio, inode, page->index);
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ bio = bio_alloc(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
+ GFP_KERNEL);
+ ext4_set_bio_post_read_ctx(bio, inode, folio->index);
+ bio->bi_iter.bi_sector = first_block << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ,
- rac ? REQ_RAHEAD : 0);
+ if (rac)
+ bio->bi_opf |= REQ_RAHEAD;
}
length = first_hole << blkbits;
- if (bio_add_page(bio, page, length, 0) < length)
+ if (!bio_add_folio(bio, folio, length, 0))
goto submit_and_realloc;
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
- (first_hole != blocks_per_page)) {
+ (first_hole != blocks_per_folio)) {
submit_bio(bio);
bio = NULL;
} else
- last_block_in_bio = blocks[blocks_per_page - 1];
- goto next_page;
+ last_block_in_bio = first_block + blocks_per_folio - 1;
+ continue;
confused:
if (bio) {
submit_bio(bio);
bio = NULL;
}
- if (!PageUptodate(page))
- block_read_full_page(page, ext4_get_block);
+ if (!folio_test_uptodate(folio))
+ block_read_full_folio(folio, ext4_get_block);
else
- unlock_page(page);
- next_page:
- if (rac)
- put_page(page);
+ folio_unlock(folio);
+next_page:
+ ; /* A label shall be followed by a statement until C23 */
}
if (bio)
submit_bio(bio);
@@ -410,9 +395,8 @@ int ext4_mpage_readpages(struct inode *inode,
int __init ext4_init_post_read_processing(void)
{
- bio_post_read_ctx_cache =
- kmem_cache_create("ext4_bio_post_read_ctx",
- sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
+
if (!bio_post_read_ctx_cache)
goto fail;
bio_post_read_ctx_pool =
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a50b51270ea9..050f26168d97 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -10,10 +10,9 @@
*/
-#define EXT4FS_DEBUG
-
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include "ext4_jbd2.h"
@@ -53,14 +52,24 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
/*
+ * If the reserved GDT blocks is non-zero, the resize_inode feature
+ * should always be set.
+ */
+ if (sbi->s_es->s_reserved_gdt_blocks &&
+ !ext4_has_feature_resize_inode(sb)) {
+ ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
+ return -EFSCORRUPTED;
+ }
+
+ /*
* If we are not using the primary superblock/GDT copy don't resize,
* because the user tools have no way of handling this. Probably a
* bad time to do it anyways.
*/
if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+ le32_to_cpu(sbi->s_es->s_first_data_block)) {
ext4_warning(sb, "won't resize using backup superblock at %llu",
- (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+ (unsigned long long)sbi->s_sbh->b_blocknr);
return -EPERM;
}
@@ -68,35 +77,31 @@ int ext4_resize_begin(struct super_block *sb)
* We are not allowed to do online-resizing on a filesystem mounted
* with error, because it can destroy the filesystem easily.
*/
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ if (sbi->s_mount_state & EXT4_ERROR_FS) {
ext4_warning(sb, "There are errors in the filesystem, "
"so online resizing is not allowed");
return -EPERM;
}
+ if (ext4_has_feature_sparse_super2(sb)) {
+ ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2");
+ return -EOPNOTSUPP;
+ }
+
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
- &EXT4_SB(sb)->s_ext4_flags))
+ &sbi->s_ext4_flags))
ret = -EBUSY;
return ret;
}
-void ext4_resize_end(struct super_block *sb)
+int ext4_resize_end(struct super_block *sb, bool update_backups)
{
clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags);
smp_mb__after_atomic();
-}
-
-static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
- ext4_group_t group) {
- return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
- EXT4_DESC_PER_BLOCK_BITS(sb);
-}
-
-static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
- ext4_group_t group) {
- group = ext4_meta_bg_first_group(sb, group);
- return ext4_group_first_block_no(sb, group);
+ if (update_backups)
+ return ext4_update_overhead(sb, true);
+ return 0;
}
static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
@@ -135,8 +140,9 @@ static int verify_group_input(struct super_block *sb,
overhead = ext4_group_overhead_blocks(sb, group);
metaend = start + overhead;
- input->free_clusters_count = free_blocks_count =
- input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+ free_blocks_count = input->blocks_count - 2 - overhead -
+ sbi->s_itb_per_group;
+ input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
@@ -212,35 +218,55 @@ struct ext4_new_flex_group_data {
in the flex group */
__u16 *bg_flags; /* block group flags of groups
in @groups */
+ ext4_group_t resize_bg; /* number of allocated
+ new_group_data */
ext4_group_t count; /* number of groups in @groups
*/
};
/*
- * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
- * @flexbg_size.
+ * Avoiding memory allocation failures due to too many groups added each time.
+ */
+#define MAX_RESIZE_BG 16384
+
+/*
+ * alloc_flex_gd() allocates an ext4_new_flex_group_data that satisfies the
+ * resizing from @o_group to @n_group, its size is typically @flexbg_size.
*
* Returns NULL on failure otherwise address of the allocated structure.
*/
-static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size,
+ ext4_group_t o_group, ext4_group_t n_group)
{
+ ext4_group_t last_group;
+ unsigned int max_resize_bg;
struct ext4_new_flex_group_data *flex_gd;
flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
if (flex_gd == NULL)
goto out3;
- if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
- goto out2;
- flex_gd->count = flexbg_size;
+ max_resize_bg = umin(flexbg_size, MAX_RESIZE_BG);
+ flex_gd->resize_bg = max_resize_bg;
+
+ /* Avoid allocating large 'groups' array if not needed */
+ last_group = o_group | (flex_gd->resize_bg - 1);
+ if (n_group <= last_group)
+ flex_gd->resize_bg = 1 << fls(n_group - o_group);
+ else if (n_group - last_group < flex_gd->resize_bg)
+ flex_gd->resize_bg = 1 << max(fls(last_group - o_group),
+ fls(n_group - last_group));
+
+ if (WARN_ON_ONCE(flex_gd->resize_bg > max_resize_bg))
+ flex_gd->resize_bg = max_resize_bg;
- flex_gd->groups = kmalloc_array(flexbg_size,
+ flex_gd->groups = kmalloc_array(flex_gd->resize_bg,
sizeof(struct ext4_new_group_data),
GFP_NOFS);
if (flex_gd->groups == NULL)
goto out2;
- flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16),
+ flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16),
GFP_NOFS);
if (flex_gd->bg_flags == NULL)
goto out1;
@@ -277,7 +303,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
*/
static int ext4_alloc_group_tables(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
- int flexbg_size)
+ unsigned int flexbg_size)
{
struct ext4_new_group_data *group_data = flex_gd->groups;
ext4_fsblk_t start_blk;
@@ -378,12 +404,12 @@ next_group:
group = group_data[0].group;
printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
- "%d groups, flexbg size is %d:\n", flex_gd->count,
+ "%u groups, flexbg size is %u:\n", flex_gd->count,
flexbg_size);
for (i = 0; i < flex_gd->count; i++) {
ext4_debug(
- "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n",
+ "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n",
ext4_bg_has_super(sb, group + i) ? "normal" :
"no-super", group + i,
group_data[i].blocks_count,
@@ -404,7 +430,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
BUFFER_TRACE(bh, "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, bh))) {
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
+ if (err) {
brelse(bh);
bh = ERR_PTR(err);
} else {
@@ -440,8 +467,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
last_cluster);
- for (count2 = count; count > 0;
- count -= count2, first_cluster += count2) {
+ for (; count > 0; count -= count2, first_cluster += count2) {
ext4_fsblk_t start;
struct buffer_head *bh;
ext4_group_t group;
@@ -469,14 +495,15 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
return -ENOMEM;
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh,
+ EXT4_JTR_NONE);
if (err) {
brelse(bh);
return err;
}
ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
first_cluster, first_cluster - start, count2);
- ext4_set_bits(bh->b_data, first_cluster - start, count2);
+ mb_set_bits(bh->b_data, first_cluster - start, count2);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -539,13 +566,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
goto handle_itb;
- if (meta_bg == 1) {
- ext4_group_t first_group;
- first_group = ext4_meta_bg_first_group(sb, group);
- if (first_group != group + 1 &&
- first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
- goto handle_itb;
- }
+ if (meta_bg == 1)
+ goto handle_itb;
block = start + ext4_bg_has_super(sb, group);
/* Copy all of the GDT blocks into the backup in this group */
@@ -564,7 +586,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
}
BUFFER_TRACE(gdb, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb);
+ err = ext4_journal_get_write_access(handle, sb, gdb,
+ EXT4_JTR_NONE);
if (err) {
brelse(gdb);
goto out;
@@ -592,7 +615,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
}
handle_itb:
- /* Initialize group tables of the grop @group */
+ /* Initialize group tables of the group @group */
if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
goto handle_bb;
@@ -624,7 +647,7 @@ handle_bb:
if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0,
+ mb_set_bits(bh->b_data, 0,
EXT4_NUM_B2C(sbi, overhead));
}
ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
@@ -682,16 +705,14 @@ handle_ib:
block = start;
}
- if (count) {
- err = set_flexbg_block_bitmap(sb, handle,
- flex_gd,
- EXT4_B2C(sbi, start),
- EXT4_B2C(sbi,
- start + count
- - 1));
- if (err)
- goto out;
- }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
+ if (err)
+ goto out;
}
out:
@@ -709,12 +730,23 @@ out:
* sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
* For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
*/
-static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
- unsigned *five, unsigned *seven)
+unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three,
+ unsigned int *five, unsigned int *seven)
{
- unsigned *min = three;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ unsigned int *min = three;
int mult = 3;
- unsigned ret;
+ unsigned int ret;
+
+ if (ext4_has_feature_sparse_super2(sb)) {
+ do {
+ if (*min > 2)
+ return UINT_MAX;
+ ret = le32_to_cpu(es->s_backup_bgs[*min - 1]);
+ *min += 1;
+ } while (!ret);
+ return ret;
+ }
if (!ext4_has_feature_sparse_super(sb)) {
ret = *min;
@@ -832,19 +864,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (unlikely(err))
goto errout;
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE);
if (unlikely(err))
goto errout;
BUFFER_TRACE(dind, "get_write_access");
- err = ext4_journal_get_write_access(handle, dind);
- if (unlikely(err))
+ err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE);
+ if (unlikely(err)) {
ext4_std_error(sb, err);
+ goto errout;
+ }
/* ext4_reserve_inode_write() gets a reference on the iloc */
err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -897,8 +932,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
EXT4_SB(sb)->s_gdb_count++;
ext4_kvfree_array_rcu(o_group_desc);
+ lock_buffer(EXT4_SB(sb)->s_sbh);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- err = ext4_handle_dirty_super(handle, sb);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
if (err)
ext4_std_error(sb, err);
return err;
@@ -913,7 +951,13 @@ errout:
}
/*
- * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ * If there is no available space in the existing block group descriptors for
+ * the new block group and there are no reserved block group descriptors, then
+ * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set
+ * to the first block group that is managed using meta_bg and s_first_meta_bg
+ * must be a multiple of EXT4_DESC_PER_BLOCK(sb).
+ * This function will be called when first group of meta_bg is added to bring
+ * new group descriptors block of new added meta_bg.
*/
static int add_new_gdb_meta_bg(struct super_block *sb,
handle_t *handle, ext4_group_t group) {
@@ -923,8 +967,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int err;
- gdblock = ext4_meta_bg_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group);
+ gdblock = ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group);
gdb_bh = ext4_sb_bread(sb, gdblock, 0);
if (IS_ERR(gdb_bh))
return PTR_ERR(gdb_bh);
@@ -946,7 +990,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
n_group_desc[gdb_num] = gdb_bh;
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE);
if (err) {
kvfree(n_group_desc);
brelse(gdb_bh);
@@ -1032,7 +1076,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
for (i = 0; i < reserved_gdb; i++) {
BUFFER_TRACE(primary[i], "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, primary[i])))
+ if ((err = ext4_journal_get_write_access(handle, sb, primary[i],
+ EXT4_JTR_NONE)))
goto exit_bh;
}
@@ -1047,9 +1092,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
for (i = 0; i < reserved_gdb; i++) {
int err2;
data = (__le32 *)primary[i]->b_data;
- /* printk("reserving backup %lu[%u] = %lu\n",
- primary[i]->b_blocknr, gdbackups,
- blk + primary[i]->b_blocknr); */
data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
if (!err)
@@ -1070,6 +1112,16 @@ exit_free:
return err;
}
+static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
+ ext4_group_t group)
+{
+ struct ext4_super_block *es = (struct ext4_super_block *) data;
+
+ es->s_block_group_nr = cpu_to_le16(group);
+ if (ext4_has_feature_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(es);
+}
+
/*
* Update the backup copies of the ext4 metadata. These don't need to be part
* of the main resize transaction, because e2fsck will re-write them if there
@@ -1118,6 +1170,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
while (group < sbi->s_groups_count) {
struct buffer_head *bh;
ext4_fsblk_t backup_block;
+ int has_super = ext4_bg_has_super(sb, group);
+ ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group);
/* Out of journal space, and can't get more - abort - so sad */
err = ext4_resize_ensure_credits_batch(handle, 1);
@@ -1127,8 +1181,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
if (meta_bg == 0)
backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
else
- backup_block = (ext4_group_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group));
+ backup_block = first_block + has_super;
bh = sb_getblk(sb, backup_block);
if (unlikely(!bh)) {
@@ -1139,7 +1192,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
backup_block, backup_block -
ext4_group_first_block_no(sb, group));
BUFFER_TRACE(bh, "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, bh))) {
+ if ((err = ext4_journal_get_write_access(handle, sb, bh,
+ EXT4_JTR_NONE))) {
brelse(bh);
break;
}
@@ -1147,6 +1201,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
+ if (has_super && (backup_block == first_block))
+ ext4_set_block_group_nr(sb, bh->b_data, group);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1222,7 +1278,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
gdb_num);
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh,
+ EXT4_JTR_NONE);
if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
err = reserve_backup_gdb(handle, resize_inode, group);
@@ -1243,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
brelse(bh);
return NULL;
}
@@ -1253,26 +1310,24 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
}
static int ext4_set_bitmap_checksums(struct super_block *sb,
- ext4_group_t group,
struct ext4_group_desc *gdp,
struct ext4_new_group_data *group_data)
{
struct buffer_head *bh;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 0;
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
if (!bh)
return -EIO;
- ext4_inode_bitmap_csum_set(sb, group, gdp, bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, bh);
brelse(bh);
bh = ext4_get_bitmap(sb, group_data->block_bitmap);
if (!bh)
return -EIO;
- ext4_block_bitmap_csum_set(sb, group, gdp, bh);
+ ext4_block_bitmap_csum_set(sb, gdp, bh);
brelse(bh);
return 0;
@@ -1310,7 +1365,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
memset(gdp, 0, EXT4_DESC_SIZE(sb));
ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
- err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+ err = ext4_set_bitmap_checksums(sb, gdp, group_data);
if (err) {
ext4_std_error(sb, err);
break;
@@ -1343,6 +1398,17 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
return err;
}
+static void ext4_add_overhead(struct super_block *sb,
+ const ext4_fsblk_t overhead)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sbi->s_overhead += overhead;
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ smp_wmb();
+}
+
/*
* ext4_update_super() updates the super block so that the newly added
* groups can be seen by the filesystem.
@@ -1382,6 +1448,7 @@ static void ext4_update_super(struct super_block *sb,
reserved_blocks *= blocks_count;
do_div(reserved_blocks, 100);
+ lock_buffer(sbi->s_sbh);
ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
@@ -1441,10 +1508,21 @@ static void ext4_update_super(struct super_block *sb,
}
/*
- * Update the fs overhead information
+ * Update the fs overhead information.
+ *
+ * For bigalloc, if the superblock already has a properly calculated
+ * overhead, update it with a value based on numbers already computed
+ * above for the newly allocated capacity.
*/
- ext4_calculate_overhead(sb);
+ if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0))
+ ext4_add_overhead(sb,
+ EXT4_NUM_B2C(sbi, blocks_count - free_blocks));
+ else
+ ext4_calculate_overhead(sb);
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
"%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
@@ -1496,7 +1574,8 @@ static int ext4_flex_group_add(struct super_block *sb,
}
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto exit_journal;
@@ -1513,7 +1592,7 @@ static int ext4_flex_group_add(struct super_block *sb,
ext4_update_super(sb, flex_gd);
- err = ext4_handle_dirty_super(handle, sb);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
exit_journal:
err2 = ext4_journal_stop(handle);
@@ -1524,21 +1603,20 @@ exit_journal:
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = ext4_has_feature_meta_bg(sb);
- sector_t old_gdb = 0;
+ int meta_bg = ext4_has_feature_meta_bg(sb) &&
+ gdb_num >= le32_to_cpu(es->s_first_meta_bg);
+ sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
+ ext4_group_first_block_no(sb, 0);
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block), 0);
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
+ (char *)es, sizeof(struct ext4_super_block), 0);
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
gdb_num);
- if (old_gdb == gdb_bh->b_blocknr)
- continue;
- update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
- gdb_bh->b_size, meta_bg);
- old_gdb = gdb_bh->b_blocknr;
+ update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
+ gdb_bh->b_data, gdb_bh->b_size, meta_bg);
}
}
exit:
@@ -1547,8 +1625,7 @@ exit:
static int ext4_setup_next_flex_gd(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
- ext4_fsblk_t n_blocks_count,
- unsigned long flexbg_size)
+ ext4_fsblk_t n_blocks_count)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
@@ -1572,7 +1649,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
BUG_ON(last);
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
- last_group = group | (flexbg_size - 1);
+ last_group = group | (flex_gd->resize_bg - 1);
if (last_group > n_group)
last_group = n_group;
@@ -1709,21 +1786,25 @@ static int ext4_group_extend_no_check(struct super_block *sb,
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (err) {
ext4_warning(sb, "error %d on journal write access", err);
goto errout;
}
+ lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_blocks_count_set(es, o_blocks_count + add);
ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
/* We add the blocks to the bitmap and set the group need init bit */
err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
if (err)
goto errout;
- ext4_handle_dirty_super(handle, sb);
+ ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
errout:
@@ -1735,7 +1816,7 @@ errout:
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
@@ -1758,7 +1839,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_grpblk_t last;
ext4_grpblk_t add;
struct buffer_head *bh;
- int err;
ext4_group_t group;
o_blocks_count = ext4_blocks_count(es);
@@ -1806,15 +1886,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, o_blocks_count + add - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
brelse(bh);
- err = ext4_group_extend_no_check(sb, o_blocks_count, add);
- return err;
+ return ext4_group_extend_no_check(sb, o_blocks_count, add);
} /* ext4_group_extend */
@@ -1868,16 +1947,20 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
return PTR_ERR(handle);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto errout;
+ lock_buffer(sbi->s_sbh);
ext4_clear_feature_resize_inode(sb);
ext4_set_feature_meta_bg(sb);
sbi->s_es->s_first_meta_bg =
cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
- err = ext4_handle_dirty_super(handle, sb);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
if (err) {
ext4_std_error(sb, err);
goto errout;
@@ -1898,9 +1981,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
errout:
ret = ext4_journal_stop(handle);
- if (!err)
- err = ret;
- return ret;
+ return err ? err : ret;
invalid_resize_inode:
ext4_error(sb, "corrupted/inconsistent resize inode");
@@ -1928,17 +2009,28 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
ext4_fsblk_t o_blocks_count;
ext4_fsblk_t n_blocks_count_retry = 0;
unsigned long last_update_time = 0;
- int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+ int err = 0;
int meta_bg;
+ unsigned int flexbg_size = ext4_flex_bg_size(sbi);
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, n_blocks_count - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
brelse(bh);
+ /*
+ * For bigalloc, trim the requested size to the nearest cluster
+ * boundary to avoid creating an unusable filesystem. We do this
+ * silently, instead of returning an error, to avoid breaking
+ * callers that blindly resize the filesystem to the full size of
+ * the underlying block device.
+ */
+ if (ext4_has_feature_bigalloc(sb))
+ n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1);
+
retry:
o_blocks_count = ext4_blocks_count(es);
@@ -1994,7 +2086,7 @@ retry:
}
}
- if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) {
err = ext4_convert_meta_bg(sb, resize_inode);
if (err)
goto out;
@@ -2040,7 +2132,7 @@ retry:
goto out;
}
- if (ext4_blocks_count(es) == n_blocks_count)
+ if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
goto out;
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
@@ -2051,7 +2143,7 @@ retry:
if (err)
goto out;
- flex_gd = alloc_flex_gd(flexbg_size);
+ flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group);
if (flex_gd == NULL) {
err = -ENOMEM;
goto out;
@@ -2060,9 +2152,8 @@ retry:
/* Add flex groups. Note that a regular group is a
* flex group with 1 group.
*/
- while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
- flexbg_size)) {
- if (jiffies - last_update_time > HZ * 10) {
+ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) {
+ if (time_is_before_jiffies(last_update_time + HZ * 10)) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
"resized to %llu blocks",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c668f6b42374..87205660c5d0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,13 +39,15 @@
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
-#include <linux/cleancache.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "ext4.h"
#include "ext4_extents.h" /* Needed for trace points definition */
@@ -59,42 +61,46 @@
#include <trace/events/ext4.h>
static struct ext4_lazy_init *ext4_li_info;
-static struct mutex ext4_li_mtx;
+static DEFINE_MUTEX(ext4_li_mtx);
static struct ratelimit_state ext4_mount_msg_ratelimit;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
-static int ext4_commit_super(struct super_block *sb, int sync);
-static void ext4_mark_recovery_complete(struct super_block *sb,
+static void ext4_update_super(struct super_block *sb);
+static int ext4_commit_super(struct super_block *sb);
+static int ext4_mark_recovery_complete(struct super_block *sb,
struct ext4_super_block *es);
-static void ext4_clear_journal_err(struct super_block *sb,
- struct ext4_super_block *es);
+static int ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
-static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
-static int ext4_feature_set_ok(struct super_block *sb, int readonly);
-static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
unsigned int journal_inum);
+static int ext4_validate_options(struct fs_context *fc);
+static int ext4_check_opt_consistency(struct fs_context *fc,
+ struct super_block *sb);
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
+static int ext4_get_tree(struct fs_context *fc);
+static int ext4_reconfigure(struct fs_context *fc);
+static void ext4_fc_free(struct fs_context *fc);
+static int ext4_init_fs_context(struct fs_context *fc);
+static void ext4_kill_sb(struct super_block *sb);
+static const struct fs_parameter_spec ext4_param_specs[];
/*
* Lock ordering
*
- * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
- * i_mmap_rwsem (inode->i_mmap_rwsem)!
- *
* page fault path:
- * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
- * page lock -> i_data_sem (rw)
+ * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
+ * -> page lock -> i_data_sem (rw)
*
* buffered write path:
* sb_start_write -> i_mutex -> mmap_lock
@@ -102,8 +108,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* i_data_sem (rw)
*
* truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
+ * page lock
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
* i_data_sem (rw)
*
* direct IO:
@@ -114,54 +121,169 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* transaction start -> page lock(s) -> i_data_sem (rw)
*/
+static const struct fs_context_operations ext4_context_ops = {
+ .parse_param = ext4_parse_param,
+ .get_tree = ext4_get_tree,
+ .reconfigure = ext4_reconfigure,
+ .free = ext4_fc_free,
+};
+
+
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext2",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = ext4_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
-#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
static struct file_system_type ext3_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext3",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext3",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = ext4_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
-#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)
+
+
+static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail)
+{
+ if (simu_fail) {
+ clear_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ return;
+ }
+
+ /*
+ * buffer's verified bit is no longer valid after reading from
+ * disk again due to write out error, clear it to make sure we
+ * recheck the buffer contents.
+ */
+ clear_buffer_verified(bh);
+
+ bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ | op_flags, bh);
+}
+
+void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return;
+ }
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
+}
+
+int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return 0;
+ }
+
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
+
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return 0;
+ return -EIO;
+}
+
+int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
+{
+ lock_buffer(bh);
+ if (!wait) {
+ ext4_read_bh_nowait(bh, op_flags, NULL, false);
+ return 0;
+ }
+ return ext4_read_bh(bh, op_flags, NULL, false);
+}
/*
- * This works like sb_bread() except it uses ERR_PTR for error
+ * This works like __bread_gfp() except it uses ERR_PTR for error
* returns. Currently with sb_bread it's impossible to distinguish
* between ENOMEM and EIO situations (since both result in a NULL
* return.
*/
-struct buffer_head *
-ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
+ sector_t block,
+ blk_opf_t op_flags, gfp_t gfp)
{
- struct buffer_head *bh = sb_getblk(sb, block);
+ struct buffer_head *bh;
+ int ret;
+ bh = sb_getblk_gfp(sb, block, gfp);
if (bh == NULL)
return ERR_PTR(-ENOMEM);
if (ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
+}
+
+struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
+ blk_opf_t op_flags)
+{
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE;
+
+ return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
+}
+
+struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block)
+{
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
+ ~__GFP_FS);
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
+}
+
+struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
+ sector_t block)
+{
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL;
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
+}
+
+void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
+{
+ struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
+ sb->s_blocksize, GFP_NOWAIT);
+
+ if (likely(bh)) {
+ if (trylock_buffer(bh))
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false);
+ brelse(bh);
+ }
}
static int ext4_verify_csum_type(struct super_block *sb,
@@ -173,14 +295,12 @@ static int ext4_verify_csum_type(struct super_block *sb,
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}
-static __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es)
+__le32 ext4_superblock_csum(struct ext4_super_block *es)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct ext4_super_block, s_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+ csum = ext4_chksum(~0, (char *)es, offset);
return cpu_to_le32(csum);
}
@@ -188,20 +308,20 @@ static __le32 ext4_superblock_csum(struct super_block *sb,
static int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
- return es->s_checksum == ext4_superblock_csum(sb, es);
+ return es->s_checksum == ext4_superblock_csum(es);
}
void ext4_superblock_csum_set(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -239,9 +359,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb,
__u32 ext4_free_inodes_count(struct super_block *sb,
struct ext4_group_desc *bg)
{
- return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+ (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0);
}
__u32 ext4_used_dirs_count(struct super_block *sb,
@@ -295,9 +415,9 @@ void ext4_free_group_clusters_set(struct super_block *sb,
void ext4_free_inodes_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count)
{
- bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count));
if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
- bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+ WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16));
}
void ext4_used_dirs_set(struct super_block *sb,
@@ -316,10 +436,8 @@ void ext4_itable_unused_set(struct super_block *sb,
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
-static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
+static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
{
- time64_t now = ktime_get_real_seconds();
-
now = clamp_val(now, 0, (1ull << 40) - 1);
*lo = cpu_to_le32(lower_32_bits(now));
@@ -331,145 +449,150 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
}
#define ext4_update_tstamp(es, tstamp) \
- __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
+ ktime_get_real_seconds())
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
-static void __save_error_info(struct super_block *sb, int error,
- __u32 ino, __u64 block,
- const char *func, unsigned int line)
+/*
+ * The ext4_maybe_update_superblock() function checks and updates the
+ * superblock if needed.
+ *
+ * This function is designed to update the on-disk superblock only under
+ * certain conditions to prevent excessive disk writes and unnecessary
+ * waking of the disk from sleep. The superblock will be updated if:
+ * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last
+ * superblock update
+ * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the
+ * last superblock update.
+ *
+ * @sb: The superblock
+ */
+static void ext4_maybe_update_superblock(struct super_block *sb)
{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- int err;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ journal_t *journal = sbi->s_journal;
+ time64_t now;
+ __u64 last_update;
+ __u64 lifetime_write_kbytes;
+ __u64 diff_size;
+
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !(sb->s_flags & SB_ACTIVE) || !journal ||
+ journal->j_flags & JBD2_UNMOUNT)
+ return;
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- if (bdev_read_only(sb->s_bdev))
+ now = ktime_get_real_seconds();
+ last_update = ext4_get_tstamp(es, s_wtime);
+
+ if (likely(now - last_update < sbi->s_sb_update_sec))
return;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_update_tstamp(es, s_last_error_time);
- strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
- es->s_last_error_line = cpu_to_le32(line);
- es->s_last_error_ino = cpu_to_le32(ino);
- es->s_last_error_block = cpu_to_le64(block);
- switch (error) {
- case EIO:
- err = EXT4_ERR_EIO;
- break;
- case ENOMEM:
- err = EXT4_ERR_ENOMEM;
- break;
- case EFSBADCRC:
- err = EXT4_ERR_EFSBADCRC;
- break;
- case 0:
- case EFSCORRUPTED:
- err = EXT4_ERR_EFSCORRUPTED;
- break;
- case ENOSPC:
- err = EXT4_ERR_ENOSPC;
- break;
- case ENOKEY:
- err = EXT4_ERR_ENOKEY;
- break;
- case EROFS:
- err = EXT4_ERR_EROFS;
- break;
- case EFBIG:
- err = EXT4_ERR_EFBIG;
- break;
- case EEXIST:
- err = EXT4_ERR_EEXIST;
- break;
- case ERANGE:
- err = EXT4_ERR_ERANGE;
- break;
- case EOVERFLOW:
- err = EXT4_ERR_EOVERFLOW;
- break;
- case EBUSY:
- err = EXT4_ERR_EBUSY;
- break;
- case ENOTDIR:
- err = EXT4_ERR_ENOTDIR;
- break;
- case ENOTEMPTY:
- err = EXT4_ERR_ENOTEMPTY;
- break;
- case ESHUTDOWN:
- err = EXT4_ERR_ESHUTDOWN;
- break;
- case EFAULT:
- err = EXT4_ERR_EFAULT;
- break;
- default:
- err = EXT4_ERR_UNKNOWN;
- }
- es->s_last_error_errcode = err;
- if (!es->s_first_error_time) {
- es->s_first_error_time = es->s_last_error_time;
- es->s_first_error_time_hi = es->s_last_error_time_hi;
- strncpy(es->s_first_error_func, func,
- sizeof(es->s_first_error_func));
- es->s_first_error_line = cpu_to_le32(line);
- es->s_first_error_ino = es->s_last_error_ino;
- es->s_first_error_block = es->s_last_error_block;
- es->s_first_error_errcode = es->s_last_error_errcode;
- }
- /*
- * Start the daily error reporting function if it hasn't been
- * started already
- */
- if (!es->s_error_count)
- mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
- le32_add_cpu(&es->s_error_count, 1);
-}
-static void save_error_info(struct super_block *sb, int error,
- __u32 ino, __u64 block,
- const char *func, unsigned int line)
-{
- __save_error_info(sb, error, ino, block, func, line);
- if (!bdev_read_only(sb->s_bdev))
- ext4_commit_super(sb, 1);
-}
+ lifetime_write_kbytes = sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+ sbi->s_sectors_written_start) >> 1);
-/*
- * The del_gendisk() function uninitializes the disk-specific data
- * structures, including the bdi structure, without telling anyone
- * else. Once this happens, any attempt to call mark_buffer_dirty()
- * (for example, by ext4_commit_super), will cause a kernel OOPS.
- * This is a kludge to prevent these oops until we can put in a proper
- * hook in del_gendisk() to inform the VFS and file system layers.
- */
-static int block_device_ejected(struct super_block *sb)
-{
- struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
+ /* Get the number of kilobytes not written to disk to account
+ * for statistics and compare with a multiple of 16 MB. This
+ * is used to determine when the next superblock commit should
+ * occur (i.e. not more often than once per 16MB if there was
+ * less written in an hour).
+ */
+ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
- return bdi->dev == NULL;
+ if (diff_size > sbi->s_sb_update_kb)
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int error = is_journal_aborted(journal);
- struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
ext4_process_freed_data(sb, txn->t_tid);
+ ext4_maybe_update_superblock(sb);
+}
+
+static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode,
+ struct folio *folio)
+{
+ struct buffer_head *bh, *head;
+ struct journal_head *jh;
+
+ bh = head = folio_buffers(folio);
+ do {
+ /*
+ * We have to redirty a page in these cases:
+ * 1) If buffer is dirty, it means the page was dirty because it
+ * contains a buffer that needs checkpointing. So the dirty bit
+ * needs to be preserved so that checkpointing writes the buffer
+ * properly.
+ * 2) If buffer is not part of the committing transaction
+ * (we may have just accidentally come across this buffer because
+ * inode range tracking is not exact) or if the currently running
+ * transaction already contains this buffer as well, dirty bit
+ * needs to be preserved so that the buffer gets writeprotected
+ * properly on running transaction's commit.
+ */
+ jh = bh2jh(bh);
+ if (buffer_dirty(bh) ||
+ (jh && (jh->b_transaction != jinode->i_transaction ||
+ jh->b_next_transaction)))
+ return true;
+ } while ((bh = bh->b_this_page) != head);
+
+ return false;
+}
- spin_lock(&sbi->s_md_lock);
- while (!list_empty(&txn->t_private_list)) {
- jce = list_entry(txn->t_private_list.next,
- struct ext4_journal_cb_entry, jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- jce->jce_func(sb, jce, error);
- spin_lock(&sbi->s_md_lock);
+static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+ struct folio *folio = NULL;
+ int error;
+
+ /*
+ * writeback_iter() already checks for dirty pages and calls
+ * folio_clear_dirty_for_io(), which we want to write protect the
+ * folios.
+ *
+ * However, we may have to redirty a folio sometimes.
+ */
+ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
+ if (ext4_journalled_writepage_needs_redirty(jinode, folio))
+ folio_redirty_for_writepage(&wbc, folio);
+ folio_unlock(folio);
}
- spin_unlock(&sbi->s_md_lock);
+
+ return error;
+}
+
+static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret;
+
+ if (ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = ext4_journalled_submit_inode_data_buffers(jinode);
+ else
+ ret = ext4_normal_submit_inode_data_buffers(jinode);
+ return ret;
+}
+
+static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret = 0;
+
+ if (!ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = jbd2_journal_finish_inode_data_buffers(jinode);
+
+ return ret;
}
static bool system_going_down(void)
@@ -478,6 +601,71 @@ static bool system_going_down(void)
|| system_state == SYSTEM_RESTART;
}
+struct ext4_err_translation {
+ int code;
+ int errno;
+};
+
+#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
+
+static struct ext4_err_translation err_translation[] = {
+ EXT4_ERR_TRANSLATE(EIO),
+ EXT4_ERR_TRANSLATE(ENOMEM),
+ EXT4_ERR_TRANSLATE(EFSBADCRC),
+ EXT4_ERR_TRANSLATE(EFSCORRUPTED),
+ EXT4_ERR_TRANSLATE(ENOSPC),
+ EXT4_ERR_TRANSLATE(ENOKEY),
+ EXT4_ERR_TRANSLATE(EROFS),
+ EXT4_ERR_TRANSLATE(EFBIG),
+ EXT4_ERR_TRANSLATE(EEXIST),
+ EXT4_ERR_TRANSLATE(ERANGE),
+ EXT4_ERR_TRANSLATE(EOVERFLOW),
+ EXT4_ERR_TRANSLATE(EBUSY),
+ EXT4_ERR_TRANSLATE(ENOTDIR),
+ EXT4_ERR_TRANSLATE(ENOTEMPTY),
+ EXT4_ERR_TRANSLATE(ESHUTDOWN),
+ EXT4_ERR_TRANSLATE(EFAULT),
+};
+
+static int ext4_errno_to_code(int errno)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(err_translation); i++)
+ if (err_translation[i].errno == errno)
+ return err_translation[i].code;
+ return EXT4_ERR_UNKNOWN;
+}
+
+static void save_error_info(struct super_block *sb, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* We default to EFSCORRUPTED error... */
+ if (error == 0)
+ error = EFSCORRUPTED;
+
+ spin_lock(&sbi->s_error_lock);
+ sbi->s_add_error_count++;
+ sbi->s_last_error_code = error;
+ sbi->s_last_error_line = line;
+ sbi->s_last_error_ino = ino;
+ sbi->s_last_error_block = block;
+ sbi->s_last_error_func = func;
+ sbi->s_last_error_time = ktime_get_real_seconds();
+ if (!sbi->s_first_error_time) {
+ sbi->s_first_error_code = error;
+ sbi->s_first_error_line = line;
+ sbi->s_first_error_ino = ino;
+ sbi->s_first_error_block = block;
+ sbi->s_first_error_func = func;
+ sbi->s_first_error_time = sbi->s_last_error_time;
+ }
+ spin_unlock(&sbi->s_error_lock);
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
*
@@ -491,43 +679,125 @@ static bool system_going_down(void)
* We'll just use the jbd2_journal_abort() error code to record an error in
* the journal instead. On recovery, the journal will complain about
* that error until we've noted it down and cleared it.
+ *
+ * If force_ro is set, we unconditionally force the filesystem into an
+ * ABORT|READONLY state, unless the error response on the fs has been set to
+ * panic in which case we take the easy way out and panic immediately. This is
+ * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
+ * at a critical moment in log management.
*/
-
-static void ext4_handle_error(struct super_block *sb)
+static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
{
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+ bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);
+
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
if (test_opt(sb, WARN_ON_ERROR))
WARN_ON_ONCE(1);
- if (sb_rdonly(sb))
- return;
-
- if (!test_opt(sb, ERRORS_CONT)) {
- journal_t *journal = EXT4_SB(sb)->s_journal;
+ if (!continue_fs && !ext4_emergency_ro(sb) && journal)
+ jbd2_journal_abort(journal, -error);
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
- if (journal)
- jbd2_journal_abort(journal, -EIO);
+ if (!bdev_read_only(sb->s_bdev)) {
+ save_error_info(sb, error, ino, block, func, line);
+ /*
+ * In case the fs should keep running, we need to writeout
+ * superblock through the journal. Due to lock ordering
+ * constraints, it may not be safe to do it right here so we
+ * defer superblock flushing to a workqueue. We just need to be
+ * careful when the journal is already shutting down. If we get
+ * here in that case, just update the sb directly as the last
+ * transaction won't commit anyway.
+ */
+ if (continue_fs && journal &&
+ !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY))
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
+ else
+ ext4_commit_super(sb);
}
+
/*
* We force ERRORS_RO behavior when system is rebooting. Otherwise we
* could panic during 'reboot -f' as the underlying device got already
* disabled.
*/
- if (test_opt(sb, ERRORS_RO) || system_going_down()) {
- ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- /*
- * Make sure updated value of ->s_mount_flags will be visible
- * before ->s_flags update
- */
- smp_wmb();
- sb->s_flags |= SB_RDONLY;
- } else if (test_opt(sb, ERRORS_PANIC)) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
+ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
+
+ if (ext4_emergency_ro(sb) || continue_fs)
+ return;
+
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * We don't set SB_RDONLY because that requires sb->s_umount
+ * semaphore and setting it without proper remount procedure is
+ * confusing code such as freeze_super() leading to deadlocks
+ * and other problems.
+ */
+ set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
+}
+
+static void update_super_work(struct work_struct *work)
+{
+ struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
+ s_sb_upd_work);
+ journal_t *journal = sbi->s_journal;
+ handle_t *handle;
+
+ /*
+ * If the journal is still running, we have to write out superblock
+ * through the journal to avoid collisions of other journalled sb
+ * updates.
+ *
+ * We use directly jbd2 functions here to avoid recursing back into
+ * ext4 error handling code during handling of previous errors.
+ */
+ if (!ext4_emergency_state(sbi->s_sb) &&
+ !sb_rdonly(sbi->s_sb) && journal) {
+ struct buffer_head *sbh = sbi->s_sbh;
+ bool call_notify_err = false;
+
+ handle = jbd2_journal_start(journal, 1);
+ if (IS_ERR(handle))
+ goto write_directly;
+ if (jbd2_journal_get_write_access(handle, sbh)) {
+ jbd2_journal_stop(handle);
+ goto write_directly;
+ }
+
+ if (sbi->s_add_error_count > 0)
+ call_notify_err = true;
+
+ ext4_update_super(sbi->s_sb);
+ if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
+ ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+
+ if (jbd2_journal_dirty_metadata(handle, sbh)) {
+ jbd2_journal_stop(handle);
+ goto write_directly;
+ }
+ jbd2_journal_stop(handle);
+
+ if (call_notify_err)
+ ext4_notify_error_sysfs(sbi);
+
+ return;
+ }
+write_directly:
+ /*
+ * Write through journal failed. Write sb directly to get error info
+ * out and hope for the best.
+ */
+ ext4_commit_super(sbi->s_sb);
+ ext4_notify_error_sysfs(sbi);
}
#define ext4_error_ratelimit(sb) \
@@ -535,13 +805,13 @@ static void ext4_handle_error(struct super_block *sb)
"EXT4-fs error")
void __ext4_error(struct super_block *sb, const char *function,
- unsigned int line, int error, __u64 block,
+ unsigned int line, bool force_ro, int error, __u64 block,
const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -554,8 +824,9 @@ void __ext4_error(struct super_block *sb, const char *function,
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
}
- save_error_info(sb, error, 0, block, function, line);
- ext4_handle_error(sb);
+ fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
+
+ ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}
void __ext4_error_inode(struct inode *inode, const char *function,
@@ -565,7 +836,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
va_list args;
struct va_format vaf;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -585,9 +856,10 @@ void __ext4_error_inode(struct inode *inode, const char *function,
current->comm, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, error, inode->i_ino, block,
- function, line);
- ext4_handle_error(inode->i_sb);
+ fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
+
+ ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
+ function, line);
}
void __ext4_error_file(struct file *file, const char *function,
@@ -599,7 +871,7 @@ void __ext4_error_file(struct file *file, const char *function,
struct inode *inode = file_inode(file);
char pathname[80], *path;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -624,9 +896,10 @@ void __ext4_error_file(struct file *file, const char *function,
current->comm, path, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block,
- function, line);
- ext4_handle_error(inode->i_sb);
+ fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
+
+ ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
+ function, line);
}
const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -678,7 +951,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
char nbuf[16];
const char *errstr;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_emergency_state(sb)))
return;
/* Special case: if the error is EROFS, and we're not already
@@ -692,56 +965,9 @@ void __ext4_std_error(struct super_block *sb, const char *function,
printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
}
+ fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
- save_error_info(sb, -errno, 0, 0, function, line);
- ext4_handle_error(sb);
-}
-
-/*
- * ext4_abort is a much stronger failure handler than ext4_error. The
- * abort function may be used to deal with unrecoverable failures such
- * as journal IO errors or ENOMEM at a critical moment in log management.
- *
- * We unconditionally force the filesystem into an ABORT|READONLY state,
- * unless the error response on the fs has been set to panic in which
- * case we take the easy way out and panic immediately.
- */
-
-void __ext4_abort(struct super_block *sb, const char *function,
- unsigned int line, int error, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
- return;
-
- save_error_info(sb, error, 0, 0, function, line);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
- sb->s_id, function, line, &vaf);
- va_end(args);
-
- if (sb_rdonly(sb) == 0) {
- ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
- /*
- * Make sure updated value of ->s_mount_flags will be visible
- * before ->s_flags update
- */
- smp_wmb();
- sb->s_flags |= SB_RDONLY;
- if (EXT4_SB(sb)->s_journal)
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
- }
- if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
- panic("EXT4-fs panic from previous error\n");
- }
+ ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}
void __ext4_msg(struct super_block *sb,
@@ -750,19 +976,29 @@ void __ext4_msg(struct super_block *sb,
struct va_format vaf;
va_list args;
- if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
- return;
+ if (sb) {
+ atomic_inc(&EXT4_SB(sb)->s_msg_count);
+ if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
+ "EXT4-fs"))
+ return;
+ }
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ if (sb)
+ printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ else
+ printk("%sEXT4-fs: %pV\n", prefix, &vaf);
va_end(args);
}
-#define ext4_warning_ratelimit(sb) \
- ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
- "EXT4-fs warning")
+static int ext4_warning_ratelimit(struct super_block *sb)
+{
+ atomic_inc(&EXT4_SB(sb)->s_warning_count);
+ return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning");
+}
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
@@ -809,12 +1045,10 @@ __acquires(bitlock)
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
- __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
-
if (ext4_error_ratelimit(sb)) {
va_start(args, fmt);
vaf.fmt = fmt;
@@ -830,17 +1064,19 @@ __acquires(bitlock)
va_end(args);
}
- if (test_opt(sb, WARN_ON_ERROR))
- WARN_ON_ONCE(1);
-
if (test_opt(sb, ERRORS_CONT)) {
- ext4_commit_super(sb, 0);
+ if (test_opt(sb, WARN_ON_ERROR))
+ WARN_ON_ONCE(1);
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ if (!bdev_read_only(sb->s_bdev)) {
+ save_error_info(sb, EFSCORRUPTED, ino, block, function,
+ line);
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
+ }
return;
}
-
ext4_unlock_group(sb, grp);
- ext4_commit_super(sb, 1);
- ext4_handle_error(sb);
+ ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
/*
* We only get here in the ERRORS_RO case; relocking the group
* may be dangerous, but nothing bad will happen since the
@@ -865,6 +1101,8 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
int ret;
+ if (!grp || !gdp)
+ return;
if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
&grp->bb_state);
@@ -911,43 +1149,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
-/*
- * Open the external journal device
- */
-static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
-{
- struct block_device *bdev;
-
- bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
- if (IS_ERR(bdev))
- goto fail;
- return bdev;
-
-fail:
- ext4_msg(sb, KERN_ERR,
- "failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
- return NULL;
-}
-
-/*
- * Release the journal device
- */
-static void ext4_blkdev_put(struct block_device *bdev)
-{
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-}
-
-static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
-{
- struct block_device *bdev;
- bdev = sbi->journal_bdev;
- if (bdev) {
- ext4_blkdev_put(bdev);
- sbi->journal_bdev = NULL;
- }
-}
-
static inline struct inode *orphan_list_entry(struct list_head *l)
{
return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
@@ -974,12 +1175,12 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);
-static inline void ext4_quota_off_umount(struct super_block *sb)
+static inline void ext4_quotas_off(struct super_block *sb, int type)
{
- int type;
+ BUG_ON(type > EXT4_MAXQUOTAS);
/* Use our quota_off function to clear inode flags etc. */
- for (type = 0; type < EXT4_MAXQUOTAS; type++)
+ for (type--; type >= 0; type--)
ext4_quota_off(sb, type);
}
@@ -995,73 +1196,143 @@ static inline char *get_qf_name(struct super_block *sb,
lockdep_is_held(&sb->s_umount));
}
#else
-static inline void ext4_quota_off_umount(struct super_block *sb)
+static inline void ext4_quotas_off(struct super_block *sb, int type)
{
}
#endif
-static void ext4_put_super(struct super_block *sb)
+static int ext4_percpu_param_init(struct ext4_sb_info *sbi)
+{
+ ext4_fsblk_t block;
+ int err;
+
+ block = ext4_count_free_clusters(sbi->s_sb);
+ ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block));
+ err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
+ GFP_KERNEL);
+ if (!err) {
+ unsigned long freei = ext4_count_free_inodes(sbi->s_sb);
+ sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
+ err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
+ GFP_KERNEL);
+ }
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sbi->s_sb), GFP_KERNEL);
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
+ if (!err)
+ err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
+ GFP_KERNEL);
+ if (!err)
+ err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
+
+ if (err)
+ ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory");
+
+ return err;
+}
+
+static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi)
+{
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
+ percpu_free_rwsem(&sbi->s_writepages_rwsem);
+}
+
+static void ext4_group_desc_free(struct ext4_sb_info *sbi)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
struct buffer_head **group_desc;
+ int i;
+
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
+ for (i = 0; i < sbi->s_gdb_count; i++)
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
+}
+
+static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
+{
struct flex_groups **flex_groups;
- int aborted = 0;
- int i, err;
+ int i;
- ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
+ rcu_read_lock();
+ flex_groups = rcu_dereference(sbi->s_flex_groups);
+ if (flex_groups) {
+ for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+ kvfree(flex_groups[i]);
+ kvfree(flex_groups);
+ }
+ rcu_read_unlock();
+}
- destroy_workqueue(sbi->rsv_conversion_wq);
+static void ext4_put_super(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int aborted = 0;
+ int err;
/*
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
+ * Unregister sysfs before flush sbi->s_sb_upd_work.
+ * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
+ * read metadata verify failed then will queue error work.
+ * update_super_work will call start_this_handle may trigger
+ * BUG_ON.
*/
ext4_unregister_sysfs(sb);
+ if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
+ ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
+ &sb->s_uuid);
+
+ ext4_unregister_li_request(sb);
+ ext4_quotas_off(sb, EXT4_MAXQUOTAS);
+
+ destroy_workqueue(sbi->rsv_conversion_wq);
+ ext4_release_orphan_info(sb);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
- err = jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ err = ext4_journal_destroy(sbi, sbi->s_journal);
if ((err < 0) && !aborted) {
ext4_abort(sb, -err, "Couldn't clean up the journal");
}
- }
+ } else
+ flush_work(&sbi->s_sb_upd_work);
ext4_es_unregister_shrinker(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_shutdown_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
- if (!sb_rdonly(sb) && !aborted) {
- ext4_clear_feature_journal_needs_recovery(sb);
- es->s_state = cpu_to_le16(sbi->s_mount_state);
+ if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) {
+ if (!aborted) {
+ ext4_clear_feature_journal_needs_recovery(sb);
+ ext4_clear_feature_orphan_present(sb);
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+ }
+ ext4_commit_super(sb);
}
- if (!sb_rdonly(sb))
- ext4_commit_super(sb, 1);
- rcu_read_lock();
- group_desc = rcu_dereference(sbi->s_group_desc);
- for (i = 0; i < sbi->s_gdb_count; i++)
- brelse(group_desc[i]);
- kvfree(group_desc);
- flex_groups = rcu_dereference(sbi->s_flex_groups);
- if (flex_groups) {
- for (i = 0; i < sbi->s_flex_groups_allocated; i++)
- kvfree(flex_groups[i]);
- kvfree(flex_groups);
- }
- rcu_read_unlock();
- percpu_counter_destroy(&sbi->s_freeclusters_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_writepages_rwsem);
+ ext4_group_desc_free(sbi);
+ ext4_flex_groups_free(sbi);
+
+ WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter));
+ ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
- for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ for (int i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
@@ -1071,19 +1342,18 @@ static void ext4_put_super(struct super_block *sb)
* in-memory list had better be clean by this point. */
if (!list_empty(&sbi->s_orphan))
dump_orphan_list(sb, sbi);
- J_ASSERT(list_empty(&sbi->s_orphan));
+ ASSERT(list_empty(&sbi->s_orphan));
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+ if (sbi->s_journal_bdev_file) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->journal_bdev);
- invalidate_bdev(sbi->journal_bdev);
- ext4_blkdev_remove(sbi);
+ sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1092,8 +1362,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
sbi->s_ea_block_cache = NULL;
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ ext4_stop_mmpd(sbi);
+
brelse(sbi->s_sbh);
sb->s_fs_info = NULL;
/*
@@ -1102,13 +1372,11 @@ static void ext4_put_super(struct super_block *sb)
*/
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
- if (sbi->s_chksum_driver)
- crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
-#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ fs_put_dax(sbi->s_daxdev, NULL);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
+#if IS_ENABLED(CONFIG_UNICODE)
+ utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -1122,20 +1390,24 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
{
struct ext4_inode_info *ei;
- ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
inode_set_iversion(&ei->vfs_inode, 1);
+ ei->i_flags = 0;
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
spin_lock_init(&ei->i_raw_lock);
- INIT_LIST_HEAD(&ei->i_prealloc_list);
- spin_lock_init(&ei->i_prealloc_lock);
+ ei->i_prealloc_node = RB_ROOT;
+ atomic_set(&ei->i_prealloc_active, 0);
+ rwlock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_list);
ei->i_es_all_nr = 0;
ei->i_es_shk_nr = 0;
ei->i_es_shrink_lblk = 0;
+ ei->i_es_seq = 0;
ei->i_reserved_data_blocks = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
ext4_init_pending_tree(&ei->i_pending_tree);
@@ -1148,14 +1420,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
- atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ ext4_fc_init_inode(&ei->vfs_inode);
+ spin_lock_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
static int ext4_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
@@ -1167,39 +1440,60 @@ static int ext4_drop_inode(struct inode *inode)
static void ext4_free_in_core_inode(struct inode *inode)
{
fscrypt_free_inode(inode);
+ if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
+ pr_warn("%s: inode %ld still in fc list",
+ __func__, inode->i_ino);
+ }
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}
static void ext4_destroy_inode(struct inode *inode)
{
- if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
+ if (ext4_inode_orphan_tracked(inode)) {
ext4_msg(inode->i_sb, KERN_ERR,
- "Inode %lu (%p): orphan list check failed!",
+ "Inode %lu (%p): inode tracked as orphan!",
inode->i_ino, EXT4_I(inode));
print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
EXT4_I(inode), sizeof(struct ext4_inode_info),
true);
dump_stack();
}
+
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
+ WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
+ inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_reserved_data_blocks);
+}
+
+static void ext4_shutdown(struct super_block *sb)
+{
+ ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH);
}
static void init_once(void *foo)
{
- struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
+ struct ext4_inode_info *ei = foo;
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
- init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
+#ifdef CONFIG_FS_ENCRYPTION
+ ei->i_crypt_info = NULL;
+#endif
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
sizeof(struct ext4_inode_info), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
offsetof(struct ext4_inode_info, i_data),
sizeof_field(struct ext4_inode_info, i_data),
init_once);
@@ -1220,6 +1514,7 @@ static void destroy_inodecache(void)
void ext4_clear_inode(struct inode *inode)
{
+ ext4_fc_del(inode);
invalidate_inode_buffers(inode);
clear_inode(inode);
ext4_discard_preallocations(inode);
@@ -1279,147 +1574,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode)
return ext4_write_inode(inode, &wbc);
}
-/*
- * Try to release metadata pages (indirect blocks, directories) which are
- * mapped via the block device. Since these pages could have journal heads
- * which would prevent try_to_free_buffers() from freeing them, we must use
- * jbd2 layer's try_to_free_buffers() function to release them.
- */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
- gfp_t wait)
-{
- journal_t *journal = EXT4_SB(sb)->s_journal;
-
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
- return 0;
- if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_DIRECT_RECLAIM);
- return try_to_free_buffers(page);
-}
-
-#ifdef CONFIG_FS_ENCRYPTION
-static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
-{
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
-}
-
-static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
- void *fs_data)
-{
- handle_t *handle = fs_data;
- int res, res2, credits, retries = 0;
-
- /*
- * Encrypting the root directory is not allowed because e2fsck expects
- * lost+found to exist and be unencrypted, and encrypting the root
- * directory would imply encrypting the lost+found directory as well as
- * the filename "lost+found" itself.
- */
- if (inode->i_ino == EXT4_ROOT_INO)
- return -EPERM;
-
- if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
- return -EINVAL;
-
- res = ext4_convert_inline_data(inode);
- if (res)
- return res;
-
- /*
- * If a journal handle was specified, then the encryption context is
- * being set on a new inode via inheritance and is part of a larger
- * transaction to create the inode. Otherwise the encryption context is
- * being set on an existing inode in its own transaction. Only in the
- * latter case should the "retry on ENOSPC" logic be used.
- */
-
- if (handle) {
- res = ext4_xattr_set_handle(handle, inode,
- EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- ext4_clear_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode);
- }
- return res;
- }
-
- res = dquot_initialize(inode);
- if (res)
- return res;
-retry:
- res = ext4_xattr_set_credits(inode, len, false /* is_create */,
- &credits);
- if (res)
- return res;
-
- handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode);
- res = ext4_mark_inode_dirty(handle, inode);
- if (res)
- EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
- }
- res2 = ext4_journal_stop(handle);
-
- if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- if (!res)
- res = res2;
- return res;
-}
-
-static const union fscrypt_context *
-ext4_get_dummy_context(struct super_block *sb)
-{
- return EXT4_SB(sb)->s_dummy_enc_ctx.ctx;
-}
-
-static bool ext4_has_stable_inodes(struct super_block *sb)
-{
- return ext4_has_feature_stable_inodes(sb);
-}
-
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
-static const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
- .get_context = ext4_get_context,
- .set_context = ext4_set_context,
- .get_dummy_context = ext4_get_dummy_context,
- .empty_dir = ext4_empty_dir,
- .max_namelen = EXT4_NAME_LEN,
- .has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
-};
-#endif
-
#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
@@ -1431,16 +1585,14 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path);
-static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
-static int ext4_enable_quotas(struct super_block *sb);
-static struct dquot **ext4_get_dquots(struct inode *inode)
+static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
{
return EXT4_I(inode)->i_dquot;
}
@@ -1484,17 +1636,17 @@ static const struct super_operations ext4_sops = {
.freeze_fs = ext4_freeze,
.unfreeze_fs = ext4_unfreeze,
.statfs = ext4_statfs,
- .remount_fs = ext4_remount,
.show_options = ext4_show_options,
+ .shutdown = ext4_shutdown,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
.get_dquots = ext4_get_dquots,
#endif
- .bdev_try_to_free_page = bdev_try_to_free_page,
};
static const struct export_operations ext4_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
.get_parent = ext4_get_parent,
@@ -1503,232 +1655,190 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
- Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+ Opt_resgid, Opt_resuid, Opt_sb,
Opt_nouid32, Opt_debug, Opt_removed,
- Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_user_xattr, Opt_acl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
- Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+ Opt_inlinecrypt,
+ Opt_usrjquota, Opt_grpjquota, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota,
+ Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
- Opt_nowarn_on_error, Opt_mblk_io_submit,
- Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
+ Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+ Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+ Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
+#ifdef CONFIG_EXT4_DEBUG
+ Opt_fc_debug_max_replay, Opt_fc_debug_force
+#endif
};
-static const match_table_t tokens = {
- {Opt_bsd_df, "bsddf"},
- {Opt_minix_df, "minixdf"},
- {Opt_grpid, "grpid"},
- {Opt_grpid, "bsdgroups"},
- {Opt_nogrpid, "nogrpid"},
- {Opt_nogrpid, "sysvgroups"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_sb, "sb=%u"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_nouid32, "nouid32"},
- {Opt_debug, "debug"},
- {Opt_removed, "oldalloc"},
- {Opt_removed, "orlov"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_noload, "norecovery"},
- {Opt_noload, "noload"},
- {Opt_removed, "nobh"},
- {Opt_removed, "bh"},
- {Opt_commit, "commit=%u"},
- {Opt_min_batch_time, "min_batch_time=%u"},
- {Opt_max_batch_time, "max_batch_time=%u"},
- {Opt_journal_dev, "journal_dev=%u"},
- {Opt_journal_path, "journal_path=%s"},
- {Opt_journal_checksum, "journal_checksum"},
- {Opt_nojournal_checksum, "nojournal_checksum"},
- {Opt_journal_async_commit, "journal_async_commit"},
- {Opt_abort, "abort"},
- {Opt_data_journal, "data=journal"},
- {Opt_data_ordered, "data=ordered"},
- {Opt_data_writeback, "data=writeback"},
- {Opt_data_err_abort, "data_err=abort"},
- {Opt_data_err_ignore, "data_err=ignore"},
- {Opt_offusrjquota, "usrjquota="},
- {Opt_usrjquota, "usrjquota=%s"},
- {Opt_offgrpjquota, "grpjquota="},
- {Opt_grpjquota, "grpjquota=%s"},
- {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
- {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
- {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_grpquota, "grpquota"},
- {Opt_noquota, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_prjquota, "prjquota"},
- {Opt_barrier, "barrier=%u"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_i_version, "i_version"},
- {Opt_dax, "dax"},
- {Opt_stripe, "stripe=%u"},
- {Opt_delalloc, "delalloc"},
- {Opt_warn_on_error, "warn_on_error"},
- {Opt_nowarn_on_error, "nowarn_on_error"},
- {Opt_lazytime, "lazytime"},
- {Opt_nolazytime, "nolazytime"},
- {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
- {Opt_nodelalloc, "nodelalloc"},
- {Opt_removed, "mblk_io_submit"},
- {Opt_removed, "nomblk_io_submit"},
- {Opt_block_validity, "block_validity"},
- {Opt_noblock_validity, "noblock_validity"},
- {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
- {Opt_journal_ioprio, "journal_ioprio=%u"},
- {Opt_auto_da_alloc, "auto_da_alloc=%u"},
- {Opt_auto_da_alloc, "auto_da_alloc"},
- {Opt_noauto_da_alloc, "noauto_da_alloc"},
- {Opt_dioread_nolock, "dioread_nolock"},
- {Opt_dioread_lock, "nodioread_nolock"},
- {Opt_dioread_lock, "dioread_lock"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_init_itable, "init_itable=%u"},
- {Opt_init_itable, "init_itable"},
- {Opt_noinit_itable, "noinit_itable"},
- {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
- {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
- {Opt_test_dummy_encryption, "test_dummy_encryption"},
- {Opt_nombcache, "nombcache"},
- {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
- {Opt_removed, "check=none"}, /* mount option from ext2/3 */
- {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
- {Opt_removed, "reservation"}, /* mount option from ext2/3 */
- {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
- {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
- {Opt_err, NULL},
+static const struct constant_table ext4_param_errors[] = {
+ {"continue", EXT4_MOUNT_ERRORS_CONT},
+ {"panic", EXT4_MOUNT_ERRORS_PANIC},
+ {"remount-ro", EXT4_MOUNT_ERRORS_RO},
+ {}
};
-static ext4_fsblk_t get_sb_block(void **data)
-{
- ext4_fsblk_t sb_block;
- char *options = (char *) *data;
-
- if (!options || strncmp(options, "sb=", 3) != 0)
- return 1; /* Default location */
-
- options += 3;
- /* TODO: use simple_strtoll with >32bit ext4 */
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
- printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
- (char *) *data);
- return 1;
- }
- if (*options == ',')
- options++;
- *data = (void *) options;
-
- return sb_block;
-}
-
-#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-static const char deprecated_msg[] =
- "Mount option \"%s\" will be removed by %s\n"
- "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
-
-#ifdef CONFIG_QUOTA
-static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
- int ret = -1;
+static const struct constant_table ext4_param_data[] = {
+ {"journal", EXT4_MOUNT_JOURNAL_DATA},
+ {"ordered", EXT4_MOUNT_ORDERED_DATA},
+ {"writeback", EXT4_MOUNT_WRITEBACK_DATA},
+ {}
+};
- if (sb_any_quota_loaded(sb) && !old_qname) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change journaled "
- "quota options when quota turned on");
- return -1;
- }
- if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_INFO, "Journaled quota options "
- "ignored when QUOTA feature is enabled");
- return 1;
- }
- qname = match_strdup(args);
- if (!qname) {
- ext4_msg(sb, KERN_ERR,
- "Not enough memory for storing quotafile name");
- return -1;
- }
- if (old_qname) {
- if (strcmp(old_qname, qname) == 0)
- ret = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "%s quota file already specified",
- QTYPE2NAME(qtype));
- goto errout;
- }
- if (strchr(qname, '/')) {
- ext4_msg(sb, KERN_ERR,
- "quotafile must be on filesystem root");
- goto errout;
- }
- rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
- set_opt(sb, QUOTA);
- return 1;
-errout:
- kfree(qname);
- return ret;
-}
+static const struct constant_table ext4_param_data_err[] = {
+ {"abort", Opt_data_err_abort},
+ {"ignore", Opt_data_err_ignore},
+ {}
+};
-static int clear_qf_name(struct super_block *sb, int qtype)
-{
+static const struct constant_table ext4_param_jqfmt[] = {
+ {"vfsold", QFMT_VFS_OLD},
+ {"vfsv0", QFMT_VFS_V0},
+ {"vfsv1", QFMT_VFS_V1},
+ {}
+};
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *old_qname = get_qf_name(sb, sbi, qtype);
+static const struct constant_table ext4_param_dax[] = {
+ {"always", Opt_dax_always},
+ {"inode", Opt_dax_inode},
+ {"never", Opt_dax_never},
+ {}
+};
- if (sb_any_quota_loaded(sb) && old_qname) {
- ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
- " when quota turned on");
- return -1;
- }
- rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
- synchronize_rcu();
- kfree(old_qname);
- return 1;
-}
+/*
+ * Mount option specification
+ * We don't use fsparam_flag_no because of the way we set the
+ * options and the way we show them in _ext4_show_options(). To
+ * keep the changes to a minimum, let's keep the negative options
+ * separate for now.
+ */
+static const struct fs_parameter_spec ext4_param_specs[] = {
+ fsparam_flag ("bsddf", Opt_bsd_df),
+ fsparam_flag ("minixdf", Opt_minix_df),
+ fsparam_flag ("grpid", Opt_grpid),
+ fsparam_flag ("bsdgroups", Opt_grpid),
+ fsparam_flag ("nogrpid", Opt_nogrpid),
+ fsparam_flag ("sysvgroups", Opt_nogrpid),
+ fsparam_gid ("resgid", Opt_resgid),
+ fsparam_uid ("resuid", Opt_resuid),
+ fsparam_u32 ("sb", Opt_sb),
+ fsparam_enum ("errors", Opt_errors, ext4_param_errors),
+ fsparam_flag ("nouid32", Opt_nouid32),
+ fsparam_flag ("debug", Opt_debug),
+ fsparam_flag ("oldalloc", Opt_removed),
+ fsparam_flag ("orlov", Opt_removed),
+ fsparam_flag ("user_xattr", Opt_user_xattr),
+ fsparam_flag ("acl", Opt_acl),
+ fsparam_flag ("norecovery", Opt_noload),
+ fsparam_flag ("noload", Opt_noload),
+ fsparam_flag ("bh", Opt_removed),
+ fsparam_flag ("nobh", Opt_removed),
+ fsparam_u32 ("commit", Opt_commit),
+ fsparam_u32 ("min_batch_time", Opt_min_batch_time),
+ fsparam_u32 ("max_batch_time", Opt_max_batch_time),
+ fsparam_u32 ("journal_dev", Opt_journal_dev),
+ fsparam_bdev ("journal_path", Opt_journal_path),
+ fsparam_flag ("journal_checksum", Opt_journal_checksum),
+ fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum),
+ fsparam_flag ("journal_async_commit",Opt_journal_async_commit),
+ fsparam_flag ("abort", Opt_abort),
+ fsparam_enum ("data", Opt_data, ext4_param_data),
+ fsparam_enum ("data_err", Opt_data_err,
+ ext4_param_data_err),
+ fsparam_string_empty
+ ("usrjquota", Opt_usrjquota),
+ fsparam_string_empty
+ ("grpjquota", Opt_grpjquota),
+ fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("noquota", Opt_noquota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("prjquota", Opt_prjquota),
+ fsparam_flag ("barrier", Opt_barrier),
+ fsparam_u32 ("barrier", Opt_barrier),
+ fsparam_flag ("nobarrier", Opt_nobarrier),
+ fsparam_flag ("i_version", Opt_removed),
+ fsparam_flag ("dax", Opt_dax),
+ fsparam_enum ("dax", Opt_dax_type, ext4_param_dax),
+ fsparam_u32 ("stripe", Opt_stripe),
+ fsparam_flag ("delalloc", Opt_delalloc),
+ fsparam_flag ("nodelalloc", Opt_nodelalloc),
+ fsparam_flag ("warn_on_error", Opt_warn_on_error),
+ fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error),
+ fsparam_u32 ("debug_want_extra_isize",
+ Opt_debug_want_extra_isize),
+ fsparam_flag ("mblk_io_submit", Opt_removed),
+ fsparam_flag ("nomblk_io_submit", Opt_removed),
+ fsparam_flag ("block_validity", Opt_block_validity),
+ fsparam_flag ("noblock_validity", Opt_noblock_validity),
+ fsparam_u32 ("inode_readahead_blks",
+ Opt_inode_readahead_blks),
+ fsparam_u32 ("journal_ioprio", Opt_journal_ioprio),
+ fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc),
+ fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc),
+ fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc),
+ fsparam_flag ("dioread_nolock", Opt_dioread_nolock),
+ fsparam_flag ("nodioread_nolock", Opt_dioread_lock),
+ fsparam_flag ("dioread_lock", Opt_dioread_lock),
+ fsparam_flag ("discard", Opt_discard),
+ fsparam_flag ("nodiscard", Opt_nodiscard),
+ fsparam_u32 ("init_itable", Opt_init_itable),
+ fsparam_flag ("init_itable", Opt_init_itable),
+ fsparam_flag ("noinit_itable", Opt_noinit_itable),
+#ifdef CONFIG_EXT4_DEBUG
+ fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
+ fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
#endif
+ fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb),
+ fsparam_flag ("test_dummy_encryption",
+ Opt_test_dummy_encryption),
+ fsparam_string ("test_dummy_encryption",
+ Opt_test_dummy_encryption),
+ fsparam_flag ("inlinecrypt", Opt_inlinecrypt),
+ fsparam_flag ("nombcache", Opt_nombcache),
+ fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */
+ fsparam_flag ("prefetch_block_bitmaps",
+ Opt_removed),
+ fsparam_flag ("no_prefetch_block_bitmaps",
+ Opt_no_prefetch_block_bitmaps),
+ fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan),
+ fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */
+ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */
+ {}
+};
+
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
#define MOPT_NOSUPPORT 0x0004
#define MOPT_EXPLICIT 0x0008
-#define MOPT_CLEAR_ERR 0x0010
-#define MOPT_GTE0 0x0020
#ifdef CONFIG_QUOTA
#define MOPT_Q 0
-#define MOPT_QFMT 0x0040
+#define MOPT_QFMT 0x0010
#else
#define MOPT_Q MOPT_NOSUPPORT
#define MOPT_QFMT MOPT_NOSUPPORT
#endif
-#define MOPT_DATAJ 0x0080
-#define MOPT_NO_EXT2 0x0100
-#define MOPT_NO_EXT3 0x0200
+#define MOPT_NO_EXT2 0x0020
+#define MOPT_NO_EXT3 0x0040
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
-#define MOPT_STRING 0x0400
+#define MOPT_SKIP 0x0080
+#define MOPT_2 0x0100
static const struct mount_opts {
int token;
@@ -1753,6 +1863,7 @@ static const struct mount_opts {
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_NO_EXT2},
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
@@ -1761,46 +1872,25 @@ static const struct mount_opts {
EXT4_MOUNT_JOURNAL_CHECKSUM),
MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
- {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2},
- {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2},
+ {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
- {Opt_commit, 0, MOPT_GTE0},
- {Opt_max_batch_time, 0, MOPT_GTE0},
- {Opt_min_batch_time, 0, MOPT_GTE0},
- {Opt_inode_readahead_blks, 0, MOPT_GTE0},
- {Opt_init_itable, 0, MOPT_GTE0},
- {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
- {Opt_stripe, 0, MOPT_GTE0},
- {Opt_resuid, 0, MOPT_GTE0},
- {Opt_resgid, 0, MOPT_GTE0},
- {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
- {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
- {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
- {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
- {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
- {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
- MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_dax_type, 0, MOPT_EXT4_ONLY},
+ {Opt_journal_dev, 0, MOPT_NO_EXT2},
+ {Opt_journal_path, 0, MOPT_NO_EXT2},
+ {Opt_journal_ioprio, 0, MOPT_NO_EXT2},
+ {Opt_data, 0, MOPT_NO_EXT2},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
- {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
- {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
{Opt_acl, 0, MOPT_NOSUPPORT},
- {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
- {Opt_debug_want_extra_isize, 0, MOPT_GTE0},
{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
MOPT_SET | MOPT_Q},
@@ -1813,415 +1903,976 @@ static const struct mount_opts {
MOPT_CLEAR | MOPT_Q},
{Opt_usrjquota, 0, MOPT_Q},
{Opt_grpjquota, 0, MOPT_Q},
- {Opt_offusrjquota, 0, MOPT_Q},
- {Opt_offgrpjquota, 0, MOPT_Q},
- {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
- {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
- {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
- {Opt_max_dir_size_kb, 0, MOPT_GTE0},
- {Opt_test_dummy_encryption, 0, MOPT_STRING},
+ {Opt_jqfmt, 0, MOPT_QFMT},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
+ MOPT_SET},
+#ifdef CONFIG_EXT4_DEBUG
+ {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+ MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
+#endif
+ {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
{Opt_err, 0, 0}
};
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
static const struct ext4_sb_encodings {
__u16 magic;
char *name;
- char *version;
+ unsigned int version;
} ext4_sb_encoding_map[] = {
- {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
+ {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
};
-static int ext4_sb_read_encoding(const struct ext4_super_block *es,
- const struct ext4_sb_encodings **encoding,
- __u16 *flags)
+static const struct ext4_sb_encodings *
+ext4_sb_read_encoding(const struct ext4_super_block *es)
{
__u16 magic = le16_to_cpu(es->s_encoding);
int i;
for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
if (magic == ext4_sb_encoding_map[i].magic)
- break;
+ return &ext4_sb_encoding_map[i];
- if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
+ return NULL;
+}
+#endif
+
+#define EXT4_SPEC_JQUOTA (1 << 0)
+#define EXT4_SPEC_JQFMT (1 << 1)
+#define EXT4_SPEC_DATAJ (1 << 2)
+#define EXT4_SPEC_SB_BLOCK (1 << 3)
+#define EXT4_SPEC_JOURNAL_DEV (1 << 4)
+#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5)
+#define EXT4_SPEC_s_want_extra_isize (1 << 7)
+#define EXT4_SPEC_s_max_batch_time (1 << 8)
+#define EXT4_SPEC_s_min_batch_time (1 << 9)
+#define EXT4_SPEC_s_inode_readahead_blks (1 << 10)
+#define EXT4_SPEC_s_li_wait_mult (1 << 11)
+#define EXT4_SPEC_s_max_dir_size_kb (1 << 12)
+#define EXT4_SPEC_s_stripe (1 << 13)
+#define EXT4_SPEC_s_resuid (1 << 14)
+#define EXT4_SPEC_s_resgid (1 << 15)
+#define EXT4_SPEC_s_commit_interval (1 << 16)
+#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17)
+#define EXT4_SPEC_s_sb_block (1 << 18)
+#define EXT4_SPEC_mb_optimize_scan (1 << 19)
+
+struct ext4_fs_context {
+ char *s_qf_names[EXT4_MAXQUOTAS];
+ struct fscrypt_dummy_policy dummy_enc_policy;
+ int s_jquota_fmt; /* Format of quota to use */
+#ifdef CONFIG_EXT4_DEBUG
+ int s_fc_debug_max_replay;
+#endif
+ unsigned short qname_spec;
+ unsigned long vals_s_flags; /* Bits to set in s_flags */
+ unsigned long mask_s_flags; /* Bits changed in s_flags */
+ unsigned long journal_devnum;
+ unsigned long s_commit_interval;
+ unsigned long s_stripe;
+ unsigned int s_inode_readahead_blks;
+ unsigned int s_want_extra_isize;
+ unsigned int s_li_wait_mult;
+ unsigned int s_max_dir_size_kb;
+ unsigned int journal_ioprio;
+ unsigned int vals_s_mount_opt;
+ unsigned int mask_s_mount_opt;
+ unsigned int vals_s_mount_opt2;
+ unsigned int mask_s_mount_opt2;
+ unsigned int opt_flags; /* MOPT flags */
+ unsigned int spec;
+ u32 s_max_batch_time;
+ u32 s_min_batch_time;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ ext4_fsblk_t s_sb_block;
+};
+
+static void ext4_fc_free(struct fs_context *fc)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int i;
+
+ if (!ctx)
+ return;
+
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(ctx->s_qf_names[i]);
+
+ fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
+ kfree(ctx);
+}
+
+int ext4_init_fs_context(struct fs_context *fc)
+{
+ struct ext4_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->fs_private = ctx;
+ fc->ops = &ext4_context_ops;
+
+ /* i_version is always enabled now */
+ fc->sb_flags |= SB_I_VERSION;
+
+ return 0;
+}
+
+#ifdef CONFIG_QUOTA
+/*
+ * Note the name of the specified quota file.
+ */
+static int note_qf_name(struct fs_context *fc, int qtype,
+ struct fs_parameter *param)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ char *qname;
+
+ if (param->size < 1) {
+ ext4_msg(NULL, KERN_ERR, "Missing quota name");
return -EINVAL;
+ }
+ if (strchr(param->string, '/')) {
+ ext4_msg(NULL, KERN_ERR,
+ "quotafile must be on filesystem root");
+ return -EINVAL;
+ }
+ if (ctx->s_qf_names[qtype]) {
+ if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
+ ext4_msg(NULL, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ return -EINVAL;
+ }
+ return 0;
+ }
- *encoding = &ext4_sb_encoding_map[i];
- *flags = le16_to_cpu(es->s_encoding_flags);
+ qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
+ if (!qname) {
+ ext4_msg(NULL, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return -ENOMEM;
+ }
+ ctx->s_qf_names[qtype] = qname;
+ ctx->qname_spec |= 1 << qtype;
+ ctx->spec |= EXT4_SPEC_JQUOTA;
+ return 0;
+}
+
+/*
+ * Clear the name of the specified quota file.
+ */
+static int unnote_qf_name(struct fs_context *fc, int qtype)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+
+ kfree(ctx->s_qf_names[qtype]);
+ ctx->s_qf_names[qtype] = NULL;
+ ctx->qname_spec |= 1 << qtype;
+ ctx->spec |= EXT4_SPEC_JQUOTA;
return 0;
}
#endif
-static int ext4_set_test_dummy_encryption(struct super_block *sb,
- const char *opt,
- const substring_t *arg,
- bool is_remount)
+static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct ext4_fs_context *ctx)
{
-#ifdef CONFIG_FS_ENCRYPTION
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
- /*
- * This mount option is just for testing, and it's not worthwhile to
- * implement the extra complexity (e.g. RCU protection) that would be
- * needed to allow it to be set or changed during remount. We do allow
- * it to be specified during remount, but only if there is no change.
- */
- if (is_remount && !sbi->s_dummy_enc_ctx.ctx) {
- ext4_msg(sb, KERN_WARNING,
- "Can't set test_dummy_encryption on remount");
- return -1;
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption option not supported");
+ return -EINVAL;
}
- err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx);
- if (err) {
- if (err == -EEXIST)
- ext4_msg(sb, KERN_WARNING,
- "Can't change test_dummy_encryption on remount");
- else if (err == -EINVAL)
- ext4_msg(sb, KERN_WARNING,
- "Value of option \"%s\" is unrecognized", opt);
- else
- ext4_msg(sb, KERN_WARNING,
- "Error processing option \"%s\" [%d]",
- opt, err);
- return -1;
+ err = fscrypt_parse_test_dummy_encryption(param,
+ &ctx->dummy_enc_policy);
+ if (err == -EINVAL) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", param->key);
+ } else if (err == -EEXIST) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
}
- ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
-#else
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mount option ignored");
-#endif
- return 1;
+ return err;
}
-static int handle_mount_opt(struct super_block *sb, char *opt, int token,
- substring_t *args, unsigned long *journal_devnum,
- unsigned int *journal_ioprio, int is_remount)
+#define EXT4_SET_CTX(name) \
+static inline __maybe_unused \
+void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \
+{ \
+ ctx->mask_s_##name |= flag; \
+ ctx->vals_s_##name |= flag; \
+}
+
+#define EXT4_CLEAR_CTX(name) \
+static inline __maybe_unused \
+void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \
+{ \
+ ctx->mask_s_##name |= flag; \
+ ctx->vals_s_##name &= ~flag; \
+}
+
+#define EXT4_TEST_CTX(name) \
+static inline unsigned long \
+ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \
+{ \
+ return (ctx->vals_s_##name & flag); \
+}
+
+EXT4_SET_CTX(flags); /* set only */
+EXT4_SET_CTX(mount_opt);
+EXT4_CLEAR_CTX(mount_opt);
+EXT4_TEST_CTX(mount_opt);
+EXT4_SET_CTX(mount_opt2);
+EXT4_CLEAR_CTX(mount_opt2);
+EXT4_TEST_CTX(mount_opt2);
+
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
const struct mount_opts *m;
- kuid_t uid;
- kgid_t gid;
- int arg = 0;
+ int is_remount;
+ int token;
-#ifdef CONFIG_QUOTA
- if (token == Opt_usrjquota)
- return set_qf_name(sb, USRQUOTA, &args[0]);
- else if (token == Opt_grpjquota)
- return set_qf_name(sb, GRPQUOTA, &args[0]);
- else if (token == Opt_offusrjquota)
- return clear_qf_name(sb, USRQUOTA);
- else if (token == Opt_offgrpjquota)
- return clear_qf_name(sb, GRPQUOTA);
-#endif
- switch (token) {
- case Opt_noacl:
- case Opt_nouser_xattr:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
- break;
- case Opt_sb:
- return 1; /* handled by get_sb_block() */
- case Opt_removed:
- ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
- return 1;
- case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
- return 1;
- case Opt_i_version:
- sb->s_flags |= SB_I_VERSION;
- return 1;
- case Opt_lazytime:
- sb->s_flags |= SB_LAZYTIME;
- return 1;
- case Opt_nolazytime:
- sb->s_flags &= ~SB_LAZYTIME;
- return 1;
- }
+ token = fs_parse(fc, ext4_param_specs, param, &result);
+ if (token < 0)
+ return token;
+ is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
for (m = ext4_mount_opts; m->token != Opt_err; m++)
if (token == m->token)
break;
- if (m->token == Opt_err) {
- ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
- "or missing value", opt);
- return -1;
- }
+ ctx->opt_flags |= m->flags;
- if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Mount option \"%s\" incompatible with ext2", opt);
- return -1;
- }
- if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Mount option \"%s\" incompatible with ext3", opt);
- return -1;
- }
-
- if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
- return -1;
- if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
- return -1;
if (m->flags & MOPT_EXPLICIT) {
if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
- set_opt2(sb, EXPLICIT_DELALLOC);
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
- set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
+ ctx_set_mount_opt2(ctx,
+ EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
} else
- return -1;
- }
- if (m->flags & MOPT_CLEAR_ERR)
- clear_opt(sb, ERRORS_MASK);
- if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot change quota "
- "options when quota turned on");
- return -1;
+ return -EINVAL;
}
if (m->flags & MOPT_NOSUPPORT) {
- ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
- } else if (token == Opt_commit) {
- if (arg == 0)
- arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
- else if (arg > INT_MAX / HZ) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR, "%s option not supported",
+ param->key);
+ return 0;
+ }
+
+ switch (token) {
+#ifdef CONFIG_QUOTA
+ case Opt_usrjquota:
+ if (!*param->string)
+ return unnote_qf_name(fc, USRQUOTA);
+ else
+ return note_qf_name(fc, USRQUOTA, param);
+ case Opt_grpjquota:
+ if (!*param->string)
+ return unnote_qf_name(fc, GRPQUOTA);
+ else
+ return note_qf_name(fc, GRPQUOTA, param);
+#endif
+ case Opt_sb:
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Ignoring %s option on remount", param->key);
+ } else {
+ ctx->s_sb_block = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_sb_block;
+ }
+ return 0;
+ case Opt_removed:
+ ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
+ param->key);
+ return 0;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ ctx_set_flags(ctx, SB_INLINECRYPT);
+#else
+ ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
+#endif
+ return 0;
+ case Opt_errors:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ return 0;
+#ifdef CONFIG_QUOTA
+ case Opt_jqfmt:
+ ctx->s_jquota_fmt = result.uint_32;
+ ctx->spec |= EXT4_SPEC_JQFMT;
+ return 0;
+#endif
+ case Opt_data:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ ctx->spec |= EXT4_SPEC_DATAJ;
+ return 0;
+ case Opt_commit:
+ if (result.uint_32 == 0)
+ result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ else if (result.uint_32 > INT_MAX / HZ) {
+ ext4_msg(NULL, KERN_ERR,
"Invalid commit interval %d, "
"must be smaller than %d",
- arg, INT_MAX / HZ);
- return -1;
+ result.uint_32, INT_MAX / HZ);
+ return -EINVAL;
}
- sbi->s_commit_interval = HZ * arg;
- } else if (token == Opt_debug_want_extra_isize) {
- if ((arg & 1) ||
- (arg < 4) ||
- (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
- ext4_msg(sb, KERN_ERR,
- "Invalid want_extra_isize %d", arg);
- return -1;
+ ctx->s_commit_interval = HZ * result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_commit_interval;
+ return 0;
+ case Opt_debug_want_extra_isize:
+ if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Invalid want_extra_isize %d", result.uint_32);
+ return -EINVAL;
}
- sbi->s_want_extra_isize = arg;
- } else if (token == Opt_max_batch_time) {
- sbi->s_max_batch_time = arg;
- } else if (token == Opt_min_batch_time) {
- sbi->s_min_batch_time = arg;
- } else if (token == Opt_inode_readahead_blks) {
- if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
- ext4_msg(sb, KERN_ERR,
+ ctx->s_want_extra_isize = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_want_extra_isize;
+ return 0;
+ case Opt_max_batch_time:
+ ctx->s_max_batch_time = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_max_batch_time;
+ return 0;
+ case Opt_min_batch_time:
+ ctx->s_min_batch_time = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_min_batch_time;
+ return 0;
+ case Opt_inode_readahead_blks:
+ if (result.uint_32 &&
+ (result.uint_32 > (1 << 30) ||
+ !is_power_of_2(result.uint_32))) {
+ ext4_msg(NULL, KERN_ERR,
"EXT4-fs: inode_readahead_blks must be "
"0 or a power of 2 smaller than 2^31");
- return -1;
- }
- sbi->s_inode_readahead_blks = arg;
- } else if (token == Opt_init_itable) {
- set_opt(sb, INIT_INODE_TABLE);
- if (!args->from)
- arg = EXT4_DEF_LI_WAIT_MULT;
- sbi->s_li_wait_mult = arg;
- } else if (token == Opt_max_dir_size_kb) {
- sbi->s_max_dir_size_kb = arg;
- } else if (token == Opt_stripe) {
- sbi->s_stripe = arg;
- } else if (token == Opt_resuid) {
- uid = make_kuid(current_user_ns(), arg);
- if (!uid_valid(uid)) {
- ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
- return -1;
- }
- sbi->s_resuid = uid;
- } else if (token == Opt_resgid) {
- gid = make_kgid(current_user_ns(), arg);
- if (!gid_valid(gid)) {
- ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
- return -1;
+ return -EINVAL;
}
- sbi->s_resgid = gid;
- } else if (token == Opt_journal_dev) {
+ ctx->s_inode_readahead_blks = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
+ return 0;
+ case Opt_init_itable:
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
+ ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (param->type == fs_value_is_string)
+ ctx->s_li_wait_mult = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_li_wait_mult;
+ return 0;
+ case Opt_max_dir_size_kb:
+ ctx->s_max_dir_size_kb = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
+ return 0;
+#ifdef CONFIG_EXT4_DEBUG
+ case Opt_fc_debug_max_replay:
+ ctx->s_fc_debug_max_replay = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
+ return 0;
+#endif
+ case Opt_stripe:
+ ctx->s_stripe = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_stripe;
+ return 0;
+ case Opt_resuid:
+ ctx->s_resuid = result.uid;
+ ctx->spec |= EXT4_SPEC_s_resuid;
+ return 0;
+ case Opt_resgid:
+ ctx->s_resgid = result.gid;
+ ctx->spec |= EXT4_SPEC_s_resgid;
+ return 0;
+ case Opt_journal_dev:
if (is_remount) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR,
"Cannot specify journal on remount");
- return -1;
+ return -EINVAL;
}
- *journal_devnum = arg;
- } else if (token == Opt_journal_path) {
- char *journal_path;
+ ctx->journal_devnum = result.uint_32;
+ ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
+ return 0;
+ case Opt_journal_path:
+ {
struct inode *journal_inode;
struct path path;
int error;
if (is_remount) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR,
"Cannot specify journal on remount");
- return -1;
- }
- journal_path = match_strdup(&args[0]);
- if (!journal_path) {
- ext4_msg(sb, KERN_ERR, "error: could not dup "
- "journal device string");
- return -1;
+ return -EINVAL;
}
- error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
if (error) {
- ext4_msg(sb, KERN_ERR, "error: could not find "
- "journal device path: error %d", error);
- kfree(journal_path);
- return -1;
+ ext4_msg(NULL, KERN_ERR, "error: could not find "
+ "journal device path");
+ return -EINVAL;
}
journal_inode = d_inode(path.dentry);
- if (!S_ISBLK(journal_inode->i_mode)) {
- ext4_msg(sb, KERN_ERR, "error: journal path %s "
- "is not a block device", journal_path);
- path_put(&path);
- kfree(journal_path);
- return -1;
- }
-
- *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
path_put(&path);
- kfree(journal_path);
- } else if (token == Opt_journal_ioprio) {
- if (arg > 7) {
- ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+ return 0;
+ }
+ case Opt_journal_ioprio:
+ if (result.uint_32 > 7) {
+ ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
" (must be 0-7)");
- return -1;
- }
- *journal_ioprio =
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
- } else if (token == Opt_test_dummy_encryption) {
- return ext4_set_test_dummy_encryption(sb, opt, &args[0],
- is_remount);
- } else if (m->flags & MOPT_DATAJ) {
- if (is_remount) {
- if (!sbi->s_journal)
- ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
- else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change data mode on remount");
- return -1;
- }
- } else {
- clear_opt(sb, DATA_FLAGS);
- sbi->s_mount_opt |= m->mount_opt;
- }
-#ifdef CONFIG_QUOTA
- } else if (m->flags & MOPT_QFMT) {
- if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != m->mount_opt) {
- ext4_msg(sb, KERN_ERR, "Cannot change journaled "
- "quota options when quota turned on");
- return -1;
- }
- if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_INFO,
- "Quota format mount options ignored "
- "when QUOTA feature is enabled");
- return 1;
+ return -EINVAL;
}
- sbi->s_jquota_fmt = m->mount_opt;
-#endif
- } else if (token == Opt_dax) {
+ ctx->journal_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
+ ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
+ return 0;
+ case Opt_test_dummy_encryption:
+ return ext4_parse_test_dummy_encryption(param, ctx);
+ case Opt_dax:
+ case Opt_dax_type:
#ifdef CONFIG_FS_DAX
- ext4_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
+ {
+ int type = (token == Opt_dax) ?
+ Opt_dax : result.uint_32;
+
+ switch (type) {
+ case Opt_dax:
+ case Opt_dax_always:
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+ break;
+ case Opt_dax_never:
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ break;
+ case Opt_dax_inode:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+ /* Strictly for printing options */
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
+ break;
+ }
+ return 0;
+ }
#else
- ext4_msg(sb, KERN_INFO, "dax option not supported");
- return -1;
+ ext4_msg(NULL, KERN_INFO, "dax option not supported");
+ return -EINVAL;
#endif
- } else if (token == Opt_data_err_abort) {
- sbi->s_mount_opt |= m->mount_opt;
- } else if (token == Opt_data_err_ignore) {
- sbi->s_mount_opt &= ~m->mount_opt;
- } else {
- if (!args->from)
- arg = 1;
+ case Opt_data_err:
+ if (result.uint_32 == Opt_data_err_abort)
+ ctx_set_mount_opt(ctx, m->mount_opt);
+ else if (result.uint_32 == Opt_data_err_ignore)
+ ctx_clear_mount_opt(ctx, m->mount_opt);
+ return 0;
+ case Opt_mb_optimize_scan:
+ if (result.int_32 == 1) {
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else if (result.int_32 == 0) {
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else {
+ ext4_msg(NULL, KERN_WARNING,
+ "mb_optimize_scan should be set to 0 or 1.");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ /*
+ * At this point we should only be getting options requiring MOPT_SET,
+ * or MOPT_CLEAR. Anything else is a bug
+ */
+ if (m->token == Opt_err) {
+ ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
+ param->key);
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ else {
+ unsigned int set = 0;
+
+ if ((param->type == fs_value_is_flag) ||
+ result.uint_32 > 0)
+ set = 1;
+
if (m->flags & MOPT_CLEAR)
- arg = !arg;
+ set = !set;
else if (unlikely(!(m->flags & MOPT_SET))) {
- ext4_msg(sb, KERN_WARNING,
- "buggy handling of option %s", opt);
+ ext4_msg(NULL, KERN_WARNING,
+ "buggy handling of option %s",
+ param->key);
WARN_ON(1);
- return -1;
+ return -EINVAL;
+ }
+ if (m->flags & MOPT_2) {
+ if (set != 0)
+ ctx_set_mount_opt2(ctx, m->mount_opt);
+ else
+ ctx_clear_mount_opt2(ctx, m->mount_opt);
+ } else {
+ if (set != 0)
+ ctx_set_mount_opt(ctx, m->mount_opt);
+ else
+ ctx_clear_mount_opt(ctx, m->mount_opt);
}
- if (arg != 0)
- sbi->s_mount_opt |= m->mount_opt;
- else
- sbi->s_mount_opt &= ~m->mount_opt;
}
- return 1;
+
+ return 0;
}
-static int parse_options(char *options, struct super_block *sb,
- unsigned long *journal_devnum,
- unsigned int *journal_ioprio,
- int is_remount)
+static int parse_options(struct fs_context *fc, char *options)
{
- struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
- char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
- substring_t args[MAX_OPT_ARGS];
- int token;
+ struct fs_parameter param;
+ int ret;
+ char *key;
if (!options)
- return 1;
+ return 0;
- while ((p = strsep(&options, ",")) != NULL) {
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- if (handle_mount_opt(sb, p, token, args, journal_devnum,
- journal_ioprio, is_remount) < 0)
- return 0;
+ while ((key = strsep(&options, ",")) != NULL) {
+ if (*key) {
+ size_t v_len = 0;
+ char *value = strchr(key, '=');
+
+ param.type = fs_value_is_flag;
+ param.string = NULL;
+
+ if (value) {
+ if (value == key)
+ continue;
+
+ *value++ = 0;
+ v_len = strlen(value);
+ param.string = kmemdup_nul(value, v_len,
+ GFP_KERNEL);
+ if (!param.string)
+ return -ENOMEM;
+ param.type = fs_value_is_string;
+ }
+
+ param.key = key;
+ param.size = v_len;
+
+ ret = ext4_parse_param(fc, &param);
+ kfree(param.string);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ ret = ext4_validate_options(fc);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static int parse_apply_sb_mount_options(struct super_block *sb,
+ struct ext4_fs_context *m_ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char s_mount_opts[64];
+ struct ext4_fs_context *s_ctx = NULL;
+ struct fs_context *fc = NULL;
+ int ret = -ENOMEM;
+
+ if (!sbi->s_es->s_mount_opts[0])
+ return 0;
+
+ if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0)
+ return -E2BIG;
+
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ return -ENOMEM;
+
+ s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+ if (!s_ctx)
+ goto out_free;
+
+ fc->fs_private = s_ctx;
+ fc->s_fs_info = sbi;
+
+ ret = parse_options(fc, s_mount_opts);
+ if (ret < 0)
+ goto parse_failed;
+
+ ret = ext4_check_opt_consistency(fc, sb);
+ if (ret < 0) {
+parse_failed:
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ s_mount_opts);
+ ret = 0;
+ goto out_free;
+ }
+
+ if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
+ m_ctx->journal_devnum = s_ctx->journal_devnum;
+ if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
+ m_ctx->journal_ioprio = s_ctx->journal_ioprio;
+
+ ext4_apply_options(fc, sb);
+ ret = 0;
+
+out_free:
+ ext4_fc_free(fc);
+ kfree(fc);
+ return ret;
+}
+
+static void ext4_apply_quota_options(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+ bool quota_feature = ext4_has_feature_quota(sb);
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *qname;
+ int i;
+
+ if (quota_feature)
+ return;
+
+ if (ctx->spec & EXT4_SPEC_JQUOTA) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (!(ctx->qname_spec & (1 << i)))
+ continue;
+
+ qname = ctx->s_qf_names[i]; /* May be NULL */
+ if (qname)
+ set_opt(sb, QUOTA);
+ ctx->s_qf_names[i] = NULL;
+ qname = rcu_replace_pointer(sbi->s_qf_names[i], qname,
+ lockdep_is_held(&sb->s_umount));
+ if (qname)
+ kfree_rcu_mightsleep(qname);
+ }
}
+
+ if (ctx->spec & EXT4_SPEC_JQFMT)
+ sbi->s_jquota_fmt = ctx->s_jquota_fmt;
+#endif
+}
+
+/*
+ * Check quota settings consistency.
+ */
+static int ext4_check_quota_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
#ifdef CONFIG_QUOTA
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool quota_feature = ext4_has_feature_quota(sb);
+ bool quota_loaded = sb_any_quota_loaded(sb);
+ bool usr_qf_name, grp_qf_name, usrquota, grpquota;
+ int quota_flags, i;
+
/*
* We do the test below only for project quotas. 'usrquota' and
* 'grpquota' mount options are allowed even without quota feature
* to support legacy quotas in quota files.
*/
- if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
- ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
+ !ext4_has_feature_project(sb)) {
+ ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
"Cannot enable project quota enforcement.");
- return 0;
+ return -EINVAL;
}
- usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
- grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
- if (usr_qf_name || grp_qf_name) {
- if (test_opt(sb, USRQUOTA) && usr_qf_name)
- clear_opt(sb, USRQUOTA);
- if (test_opt(sb, GRPQUOTA) && grp_qf_name)
- clear_opt(sb, GRPQUOTA);
+ quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+ EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
+ if (quota_loaded &&
+ ctx->mask_s_mount_opt & quota_flags &&
+ !ctx_test_mount_opt(ctx, quota_flags))
+ goto err_quota_change;
+
+ if (ctx->spec & EXT4_SPEC_JQUOTA) {
+
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (!(ctx->qname_spec & (1 << i)))
+ continue;
+
+ if (quota_loaded &&
+ !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i])
+ goto err_jquota_change;
+
+ if (sbi->s_qf_names[i] && ctx->s_qf_names[i] &&
+ strcmp(get_qf_name(sb, sbi, i),
+ ctx->s_qf_names[i]) != 0)
+ goto err_jquota_specified;
+ }
+
+ if (quota_feature) {
+ ext4_msg(NULL, KERN_INFO,
+ "Journaled quota options ignored when "
+ "QUOTA feature is enabled");
+ return 0;
+ }
+ }
- if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
- ext4_msg(sb, KERN_ERR, "old and new quota "
- "format mixing");
+ if (ctx->spec & EXT4_SPEC_JQFMT) {
+ if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
+ goto err_jquota_change;
+ if (quota_feature) {
+ ext4_msg(NULL, KERN_INFO, "Quota format mount options "
+ "ignored when QUOTA feature is enabled");
return 0;
}
+ }
+
+ /* Make sure we don't mix old and new quota format */
+ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) ||
+ ctx->s_qf_names[USRQUOTA]);
+ grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
+ ctx->s_qf_names[GRPQUOTA]);
+
+ usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+ test_opt(sb, USRQUOTA));
+
+ grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
+ test_opt(sb, GRPQUOTA));
+
+ if (usr_qf_name) {
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+ usrquota = false;
+ }
+ if (grp_qf_name) {
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+ grpquota = false;
+ }
+
+ if (usr_qf_name || grp_qf_name) {
+ if (usrquota || grpquota) {
+ ext4_msg(NULL, KERN_ERR, "old and new quota "
+ "format mixing");
+ return -EINVAL;
+ }
+
+ if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
+ ext4_msg(NULL, KERN_ERR, "journaled quota format "
+ "not specified");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
- if (!sbi->s_jquota_fmt) {
- ext4_msg(sb, KERN_ERR, "journaled quota format "
- "not specified");
+err_quota_change:
+ ext4_msg(NULL, KERN_ERR,
+ "Cannot change quota options when quota turned on");
+ return -EINVAL;
+err_jquota_change:
+ ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
+ "options when quota turned on");
+ return -EINVAL;
+err_jquota_specified:
+ ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
+ QTYPE2NAME(i));
+ return -EINVAL;
+#else
+ return 0;
+#endif
+}
+
+static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
+ struct super_block *sb)
+{
+ const struct ext4_fs_context *ctx = fc->fs_private;
+ const struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
+ return 0;
+
+ if (!ext4_has_feature_encrypt(sb)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption requires encrypt feature");
+ return -EINVAL;
+ }
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Can't set or change test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+ /* Also make sure s_mount_opts didn't contain a conflicting value. */
+ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
+ struct super_block *sb)
+{
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
+ /* if already set, it was already verified to be the same */
+ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
+ return;
+ EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
+ memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+}
+
+static int ext4_check_opt_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = fc->s_fs_info;
+ int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+ int err;
+
+ if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Mount option(s) incompatible with ext2");
+ return -EINVAL;
+ }
+ if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Mount option(s) incompatible with ext3");
+ return -EINVAL;
+ }
+
+ if (ctx->s_want_extra_isize >
+ (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Invalid want_extra_isize %d",
+ ctx->s_want_extra_isize);
+ return -EINVAL;
+ }
+
+ err = ext4_check_test_dummy_encryption(fc, sb);
+ if (err)
+ return err;
+
+ if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
+ if (!sbi->s_journal) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting file system with no journal "
+ "so ignoring journalled data option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+ } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
+ test_opt(sb, DATA_FLAGS)) {
+ ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
+ "on remount");
+ return -EINVAL;
+ }
+ }
+
+ if (is_remount) {
+ if (!sbi->s_journal &&
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting fs w/o journal so ignoring data_err option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
+ }
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+ (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
+ ext4_msg(NULL, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -EINVAL;
+ }
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+ (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+fail_dax_change_remount:
+ ext4_msg(NULL, KERN_ERR, "can't change "
+ "dax mount option while remounting");
+ return -EINVAL;
+ } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
+ (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
+ goto fail_dax_change_remount;
+ } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
+ ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
+ goto fail_dax_change_remount;
}
}
+
+ return ext4_check_quota_consistency(fc, sb);
+}
+
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = fc->s_fs_info;
+
+ sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
+ sbi->s_mount_opt |= ctx->vals_s_mount_opt;
+ sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
+ sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
+ sb->s_flags &= ~ctx->mask_s_flags;
+ sb->s_flags |= ctx->vals_s_flags;
+
+#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
+ APPLY(s_commit_interval);
+ APPLY(s_stripe);
+ APPLY(s_max_batch_time);
+ APPLY(s_min_batch_time);
+ APPLY(s_want_extra_isize);
+ APPLY(s_inode_readahead_blks);
+ APPLY(s_max_dir_size_kb);
+ APPLY(s_li_wait_mult);
+ APPLY(s_resgid);
+ APPLY(s_resuid);
+
+#ifdef CONFIG_EXT4_DEBUG
+ APPLY(s_fc_debug_max_replay);
#endif
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- int blocksize =
- BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
- if (blocksize < PAGE_SIZE)
- ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
- "experimental mount option 'dioread_nolock' "
- "for blocksize < PAGE_SIZE");
+
+ ext4_apply_quota_options(fc, sb);
+ ext4_apply_test_dummy_encryption(ctx, sb);
+}
+
+
+static int ext4_validate_options(struct fs_context *fc)
+{
+#ifdef CONFIG_QUOTA
+ struct ext4_fs_context *ctx = fc->fs_private;
+ char *usr_qf_name, *grp_qf_name;
+
+ usr_qf_name = ctx->s_qf_names[USRQUOTA];
+ grp_qf_name = ctx->s_qf_names[GRPQUOTA];
+
+ if (usr_qf_name || grp_qf_name) {
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name)
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name)
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
+ ext4_msg(NULL, KERN_ERR, "old and new quota "
+ "format mixing");
+ return -EINVAL;
+ }
}
+#endif
return 1;
}
@@ -2262,12 +2913,12 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
static const char *token2str(int token)
{
- const struct match_token *t;
+ const struct fs_parameter_spec *spec;
- for (t = tokens; t->token != Opt_err; t++)
- if (t->token == token && !strchr(t->pattern, '='))
+ for (spec = ext4_param_specs; spec->name != NULL; spec++)
+ if (spec->opt == token && !spec->type)
break;
- return t->pattern;
+ return spec->name;
}
/*
@@ -2280,7 +2931,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- int def_errors, def_mount_opt = sbi->s_def_mount_opt;
+ int def_errors;
const struct mount_opts *m;
char sep = nodefs ? '\n' : ',';
@@ -2292,24 +2943,37 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
+ int opt_2 = m->flags & MOPT_2;
+ unsigned int mount_opt, def_mount_opt;
+
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
- (m->flags & MOPT_CLEAR_ERR))
+ m->flags & MOPT_SKIP)
+ continue;
+
+ if (opt_2) {
+ mount_opt = sbi->s_mount_opt2;
+ def_mount_opt = sbi->s_def_mount_opt2;
+ } else {
+ mount_opt = sbi->s_mount_opt;
+ def_mount_opt = sbi->s_def_mount_opt;
+ }
+ /* skip if same as the default */
+ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
continue;
- if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
- continue; /* skip if same as the default */
+ /* select Opt_noFoo vs Opt_Foo */
if ((want_set &&
- (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
- (!want_set && (sbi->s_mount_opt & m->mount_opt)))
- continue; /* select Opt_noFoo vs Opt_Foo */
+ (mount_opt & m->mount_opt) != m->mount_opt) ||
+ (!want_set && (mount_opt & m->mount_opt)))
+ continue;
SEQ_OPTS_PRINT("%s", token2str(m->token));
}
if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
- le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+ ext4_get_resuid(es) != EXT4_DEF_RESUID)
SEQ_OPTS_PRINT("resuid=%u",
from_kuid_munged(&init_user_ns, sbi->s_resuid));
if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
- le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+ ext4_get_resgid(es) != EXT4_DEF_RESGID)
SEQ_OPTS_PRINT("resgid=%u",
from_kgid_munged(&init_user_ns, sbi->s_resgid));
def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
@@ -2325,12 +2989,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
- if (sb->s_flags & SB_I_VERSION)
+ if (nodefs && sb->s_flags & SB_I_VERSION)
SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
- (sbi->s_mount_opt ^ def_mount_opt)) {
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
SEQ_OPTS_PUTS("data=journal");
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
@@ -2353,6 +3017,37 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
fscrypt_show_test_dummy_encryption(seq, sep, sb);
+ if (sb->s_flags & SB_INLINECRYPT)
+ SEQ_OPTS_PUTS("inlinecrypt");
+
+ if (test_opt(sb, DAX_ALWAYS)) {
+ if (IS_EXT2_SB(sb))
+ SEQ_OPTS_PUTS("dax");
+ else
+ SEQ_OPTS_PUTS("dax=always");
+ } else if (test_opt2(sb, DAX_NEVER)) {
+ SEQ_OPTS_PUTS("dax=never");
+ } else if (test_opt2(sb, DAX_INODE)) {
+ SEQ_OPTS_PUTS("dax=inode");
+ }
+
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=0");
+ } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=1");
+ }
+
+ if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
+ SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+
+ if (ext4_emergency_ro(sb))
+ SEQ_OPTS_PUTS("emergency_ro");
+
+ if (ext4_forced_shutdown(sb))
+ SEQ_OPTS_PUTS("shutdown");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -2369,7 +3064,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset)
seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
rc = _ext4_show_options(seq, sb, 1);
- seq_puts(seq, "\n");
+ seq_putc(seq, '\n');
return rc;
}
@@ -2383,6 +3078,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
err = -EROFS;
+ goto done;
}
if (read_only)
goto done;
@@ -2411,10 +3107,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
- if (sbi->s_journal)
+ if (sbi->s_journal) {
ext4_set_feature_journal_needs_recovery(sb);
+ if (ext4_has_feature_orphan_file(sb))
+ ext4_set_feature_orphan_present(sb);
+ }
- err = ext4_commit_super(sb, 1);
+ err = ext4_commit_super(sb);
done:
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
@@ -2424,8 +3123,6 @@ done:
EXT4_BLOCKS_PER_GROUP(sb),
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt, sbi->s_mount_opt2);
-
- cleancache_init_fs(sb);
return err;
}
@@ -2518,19 +3215,19 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
__le32 le_group = cpu_to_le32(block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sbi->s_sb)) {
+ if (ext4_has_feature_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
__u32 csum32;
__u16 dummy_csum = 0;
- csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset);
+ csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum,
sizeof(dummy_csum));
offset += sizeof(dummy_csum);
if (offset < sbi->s_desc_size)
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset,
sbi->s_desc_size - offset);
crc = csum32 & 0xFFFF;
@@ -2546,11 +3243,9 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
crc = crc16(crc, (__u8 *)gdp, offset);
offset += sizeof(gdp->bg_checksum); /* skip checksum */
/* for checksum of struct ext4_group_desc do the rest...*/
- if (ext4_has_feature_64bit(sb) &&
- offset < le16_to_cpu(sbi->s_es->s_desc_size))
+ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size)
crc = crc16(crc, (__u8 *)gdp + offset,
- le16_to_cpu(sbi->s_es->s_desc_size) -
- offset);
+ sbi->s_desc_size - offset);
out:
return cpu_to_le16(crc);
@@ -2694,165 +3389,6 @@ static int ext4_check_descriptors(struct super_block *sb,
return 1;
}
-/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
- * the superblock) which were deleted from all directories, but held open by
- * a process at the time of a crash. We walk the list and try to delete these
- * inodes at recovery time (only with a read-write filesystem).
- *
- * In order to keep the orphan inode chain consistent during traversal (in
- * case of crash during recovery), we link each inode into the superblock
- * orphan list_head and handle it the same way as an inode deletion during
- * normal operation (which journals the operations for us).
- *
- * We only do an iget() and an iput() on each inode, which is very safe if we
- * accidentally point at an in-use or already deleted inode. The worst that
- * can happen in this case is that we get a "bit already cleared" message from
- * ext4_free_inode(). The only reason we would point at a wrong inode is if
- * e2fsck was run on this filesystem, and it must have already done the orphan
- * inode cleanup for us, so we can safely abort without any further action.
- */
-static void ext4_orphan_cleanup(struct super_block *sb,
- struct ext4_super_block *es)
-{
- unsigned int s_flags = sb->s_flags;
- int ret, nr_orphans = 0, nr_truncates = 0;
-#ifdef CONFIG_QUOTA
- int quota_update = 0;
- int i;
-#endif
- if (!es->s_last_orphan) {
- jbd_debug(4, "no orphan inodes to clean up\n");
- return;
- }
-
- if (bdev_read_only(sb->s_bdev)) {
- ext4_msg(sb, KERN_ERR, "write access "
- "unavailable, skipping orphan cleanup");
- return;
- }
-
- /* Check if feature set would not allow a r/w mount */
- if (!ext4_feature_set_ok(sb, 0)) {
- ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
- "unknown ROCOMPAT features");
- return;
- }
-
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- /* don't clear list on RO mount w/ errors */
- if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
- ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
- "clearing orphan list.\n");
- es->s_last_orphan = 0;
- }
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
- return;
- }
-
- if (s_flags & SB_RDONLY) {
- ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
- sb->s_flags &= ~SB_RDONLY;
- }
-#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sb->s_flags |= SB_ACTIVE;
-
- /*
- * Turn on quotas which were not enabled for read-only mounts if
- * filesystem has quota feature, so that they are updated correctly.
- */
- if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
- int ret = ext4_enable_quotas(sb);
-
- if (!ret)
- quota_update = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "Cannot turn on quotas: error %d", ret);
- }
-
- /* Turn on journaled quotas used for old sytle */
- for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- if (EXT4_SB(sb)->s_qf_names[i]) {
- int ret = ext4_quota_on_mount(sb, i);
-
- if (!ret)
- quota_update = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "Cannot turn on journaled "
- "quota: type %d: error %d", i, ret);
- }
- }
-#endif
-
- while (es->s_last_orphan) {
- struct inode *inode;
-
- /*
- * We may have encountered an error during cleanup; if
- * so, skip the rest.
- */
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
- es->s_last_orphan = 0;
- break;
- }
-
- inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
- if (IS_ERR(inode)) {
- es->s_last_orphan = 0;
- break;
- }
-
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
- dquot_initialize(inode);
- if (inode->i_nlink) {
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %lld bytes\n",
- inode->i_ino, inode->i_size);
- inode_lock(inode);
- truncate_inode_pages(inode->i_mapping, inode->i_size);
- ret = ext4_truncate(inode);
- if (ret)
- ext4_std_error(inode->i_sb, ret);
- inode_unlock(inode);
- nr_truncates++;
- } else {
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
- jbd_debug(2, "deleting unreferenced inode %lu\n",
- inode->i_ino);
- nr_orphans++;
- }
- iput(inode); /* The delete magic happens here! */
- }
-
-#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
-
- if (nr_orphans)
- ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
- PLURAL(nr_orphans));
- if (nr_truncates)
- ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
- PLURAL(nr_truncates));
-#ifdef CONFIG_QUOTA
- /* Turn off quotas if they were enabled for orphan cleanup */
- if (quota_update) {
- for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- if (sb_dqopt(sb)->files[i])
- dquot_quota_off(sb, i);
- }
- }
-#endif
- sb->s_flags = s_flags; /* Restore SB_RDONLY status */
-}
-
/*
* Maximal extent format file size.
* Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -2905,17 +3441,18 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- loff_t res = EXT4_NDIR_BLOCKS;
+ loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
- /* This is calculated to be the largest file size for a dense, block
+ unsigned int ppb = 1 << (bits - 2);
+
+ /*
+ * This is calculated to be the largest file size for a dense, block
* mapped file such that the file's total number of 512-byte sectors,
* including data and all indirect blocks, does not exceed (2^48 - 1).
*
* __u32 i_blocks_lo and _u16 i_blocks_high represent the total
* number of 512-byte sectors of the file.
*/
-
if (!has_huge_files) {
/*
* !has_huge_files or implies that the inode i_block field
@@ -2938,23 +3475,38 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
}
+ /* Compute how many blocks we can address by block tree */
+ res += ppb;
+ res += ppb * ppb;
+ res += ((loff_t)ppb) * ppb * ppb;
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
+ /* Does block tree limit file size? */
+ if (res + meta_blocks <= upper_limit)
+ goto check_lfs;
+
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT4_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* tripple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
+ DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
@@ -3032,7 +3584,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
* Returns 1 if this filesystem can be mounted as requested,
* 0 if it cannot be.
*/
-static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
if (ext4_has_unknown_ext4_incompat_features(sb)) {
ext4_msg(sb, KERN_ERR,
@@ -3043,14 +3595,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_UNICODE
- if (ext4_has_feature_casefold(sb)) {
+ if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
"mounted without CONFIG_UNICODE");
return 0;
}
-#endif
if (readonly)
return 1;
@@ -3093,7 +3643,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
*/
static void print_daily_error_info(struct timer_list *t)
{
- struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
+ struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report);
struct super_block *sb = sbi->s_sb;
struct ext4_super_block *es = sbi->s_es;
@@ -3138,15 +3688,32 @@ static void print_daily_error_info(struct timer_list *t)
static int ext4_run_li_request(struct ext4_li_request *elr)
{
struct ext4_group_desc *gdp = NULL;
- ext4_group_t group, ngroups;
- struct super_block *sb;
- unsigned long timeout = 0;
+ struct super_block *sb = elr->lr_super;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t group = elr->lr_next_group;
+ unsigned int prefetch_ios = 0;
int ret = 0;
+ int nr = EXT4_SB(sb)->s_mb_prefetch;
+ u64 start_time;
+
+ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
+ elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr);
+ if (group >= elr->lr_next_group) {
+ ret = 1;
+ if (elr->lr_first_not_zeroed != ngroups &&
+ !ext4_emergency_state(sb) && !sb_rdonly(sb) &&
+ test_opt(sb, INIT_INODE_TABLE)) {
+ elr->lr_next_group = elr->lr_first_not_zeroed;
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ ret = 0;
+ }
+ }
+ return ret;
+ }
- sb = elr->lr_super;
- ngroups = EXT4_SB(sb)->s_groups_count;
-
- for (group = elr->lr_next_group; group < ngroups; group++) {
+ for (; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
ret = 1;
@@ -3161,13 +3728,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
+ trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- elr->lr_sbi->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
@@ -3181,15 +3748,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
- struct ext4_sb_info *sbi;
-
if (!elr)
return;
- sbi = elr->lr_sbi;
-
list_del(&elr->lr_request);
- sbi->s_li_request = NULL;
+ EXT4_SB(elr->lr_super)->s_li_request = NULL;
kfree(elr);
}
@@ -3220,17 +3783,19 @@ static struct task_struct *ext4_lazyinit_task;
*/
static int ext4_lazyinit_thread(void *arg)
{
- struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+ struct ext4_lazy_init *eli = arg;
struct list_head *pos, *n;
struct ext4_li_request *elr;
unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
+ set_freezable();
cont_thread:
while (true) {
- next_wakeup = MAX_JIFFY_OFFSET;
+ bool next_wakeup_initialized = false;
+ next_wakeup = 0;
mutex_lock(&eli->li_list_mtx);
if (list_empty(&eli->li_request_list)) {
mutex_unlock(&eli->li_list_mtx);
@@ -3243,8 +3808,11 @@ cont_thread:
lr_request);
if (time_before(jiffies, elr->lr_next_sched)) {
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
continue;
}
if (down_read_trylock(&elr->lr_super->s_umount)) {
@@ -3270,19 +3838,20 @@ cont_thread:
}
if (!progress) {
elr->lr_next_sched = jiffies +
- (prandom_u32()
- % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
}
mutex_unlock(&eli->li_list_mtx);
try_to_freeze();
cur = jiffies;
- if ((time_after_eq(cur, next_wakeup)) ||
- (MAX_JIFFY_OFFSET == next_wakeup)) {
+ if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) {
cond_resched();
continue;
}
@@ -3398,7 +3967,6 @@ static int ext4_li_info_new(void)
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
ext4_group_t start)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
@@ -3406,16 +3974,20 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
return NULL;
elr->lr_super = sb;
- elr->lr_sbi = sbi;
- elr->lr_next_group = start;
+ elr->lr_first_not_zeroed = start;
+ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ elr->lr_next_group = start;
+ } else {
+ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
+ }
/*
* Randomize first schedule time of the request to
* spread the inode table initialization requests
* better.
*/
- elr->lr_next_sched = jiffies + (prandom_u32() %
- (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
return elr;
}
@@ -3437,8 +4009,9 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -3499,7 +4072,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
int compat, incompat;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
/* journal checksum v3 */
compat = 0;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@@ -3555,9 +4128,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -3616,7 +4191,7 @@ int ext4_calculate_overhead(struct super_block *sb)
unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
ext4_fsblk_t overhead = 0;
- char *buf = (char *) get_zeroed_page(GFP_NOFS);
+ char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO);
if (!buf)
return -ENOMEM;
@@ -3641,7 +4216,7 @@ int ext4_calculate_overhead(struct super_block *sb)
blks = count_overhead(sb, i, buf);
overhead += blks;
if (blks)
- memset(buf, 0, PAGE_SIZE);
+ memset(buf, 0, sb->s_blocksize);
cond_resched();
}
@@ -3649,12 +4224,12 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->journal_bdev)
- overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
+ if (sbi->s_journal && !sbi->s_journal_bdev_file)
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
- if (j_inode) {
+ if (!IS_ERR(j_inode)) {
j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
overhead += EXT4_NUM_B2C(sbi, j_blocks);
iput(j_inode);
@@ -3664,7 +4239,7 @@ int ext4_calculate_overhead(struct super_block *sb)
}
sbi->s_overhead = overhead;
smp_wmb();
- free_page((unsigned long) buf);
+ kvfree(buf);
return 0;
}
@@ -3698,124 +4273,75 @@ static void ext4_set_resv_clusters(struct super_block *sb)
atomic64_set(&sbi->s_resv_clusters, resv_clusters);
}
-static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+static const char *ext4_quota_mode(struct super_block *sb)
{
- struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
- char *orig_data = kstrdup(data, GFP_KERNEL);
- struct buffer_head *bh, **group_desc;
- struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- struct flex_groups **flex_groups;
- ext4_fsblk_t block;
- ext4_fsblk_t sb_block = get_sb_block(&data);
- ext4_fsblk_t logical_sb_block;
- unsigned long offset = 0;
- unsigned long journal_devnum = 0;
- unsigned long def_mount_opts;
- struct inode *root;
- const char *descr;
- int ret = -ENOMEM;
- int blocksize, clustersize;
- unsigned int db_count;
- unsigned int i;
- int needs_recovery, has_huge_files;
- __u64 blocks_count;
- int err = 0;
- unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- ext4_group_t first_not_zeroed;
+#ifdef CONFIG_QUOTA
+ if (!ext4_quota_capable(sb))
+ return "none";
- if ((data && !orig_data) || !sbi)
- goto out_free_base;
+ if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb))
+ return "journalled";
+ else
+ return "writeback";
+#else
+ return "disabled";
+#endif
+}
- sbi->s_daxdev = dax_dev;
- sbi->s_blockgroup_lock =
- kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- if (!sbi->s_blockgroup_lock)
- goto out_free_base;
+static void ext4_setup_csum_trigger(struct super_block *sb,
+ enum ext4_journal_trigger_type type,
+ void (*trigger)(
+ struct jbd2_buffer_trigger_type *type,
+ struct buffer_head *bh,
+ void *mapped_data,
+ size_t size))
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
- sb->s_fs_info = sbi;
- sbi->s_sb = sb;
- sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
- sbi->s_sb_block = sb_block;
- if (sb->s_bdev->bd_part)
- sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
+ sbi->s_journal_triggers[type].sb = sb;
+ sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
+}
- /* Cleanup superblock name */
- strreplace(sb->s_id, '/', '!');
+static void ext4_free_sbi(struct ext4_sb_info *sbi)
+{
+ if (!sbi)
+ return;
- /* -EINVAL is default */
- ret = -EINVAL;
- blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
- if (!blocksize) {
- ext4_msg(sb, KERN_ERR, "unable to set blocksize");
- goto out_fail;
- }
+ kfree(sbi->s_blockgroup_lock);
+ fs_put_dax(sbi->s_daxdev, NULL);
+ kfree(sbi);
+}
- /*
- * The ext4 superblock will not be buffer aligned for other than 1kB
- * block sizes. We need to calculate the offset from buffer start.
- */
- if (blocksize != EXT4_MIN_BLOCK_SIZE) {
- logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- } else {
- logical_sb_block = sb_block;
- }
+static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi;
- if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
- ext4_msg(sb, KERN_ERR, "unable to read superblock");
- goto out_fail;
- }
- /*
- * Note: s_es must be initialized as soon as possible because
- * some ext4 macro-instructions depend on its value
- */
- es = (struct ext4_super_block *) (bh->b_data + offset);
- sbi->s_es = es;
- sb->s_magic = le16_to_cpu(es->s_magic);
- if (sb->s_magic != EXT4_SUPER_MAGIC)
- goto cantfind_ext4;
- sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ return NULL;
- /* Warn if metadata_csum and gdt_csum are both set. */
- if (ext4_has_feature_metadata_csum(sb) &&
- ext4_has_feature_gdt_csum(sb))
- ext4_warning(sb, "metadata_csum and uninit_bg are "
- "redundant flags; please run fsck.");
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);
- /* Check for a known checksum algorithm */
- if (!ext4_verify_csum_type(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "unknown checksum algorithm.");
- silent = 1;
- goto cantfind_ext4;
- }
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- /* Load the checksum driver */
- sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(sbi->s_chksum_driver)) {
- ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
- ret = PTR_ERR(sbi->s_chksum_driver);
- sbi->s_chksum_driver = NULL;
- goto failed_mount;
- }
+ if (!sbi->s_blockgroup_lock)
+ goto err_out;
- /* Check superblock checksum */
- if (!ext4_superblock_csum_verify(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "invalid superblock checksum. Run e2fsck?");
- silent = 1;
- ret = -EFSBADCRC;
- goto cantfind_ext4;
- }
+ sb->s_fs_info = sbi;
+ sbi->s_sb = sb;
+ return sbi;
+err_out:
+ fs_put_dax(sbi->s_daxdev, NULL);
+ kfree(sbi);
+ return NULL;
+}
- /* Precompute checksum seed for all metadata */
- if (ext4_has_feature_csum_seed(sb))
- sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
- sizeof(es->s_uuid));
+static void ext4_set_def_opts(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ unsigned long def_mount_opts;
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -3831,8 +4357,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
+ if (ext4_has_feature_fast_commit(sb))
+ set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
@@ -3842,9 +4370,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
set_opt(sb, WRITEBACK_DATA);
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+ if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
set_opt(sb, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+ else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
set_opt(sb, ERRORS_CONT);
else
set_opt(sb, ERRORS_RO);
@@ -3853,12 +4381,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
- sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
- sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
- sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-
if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
set_opt(sb, BARRIER);
@@ -3870,24 +4392,124 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- /*
- * set default s_li_wait_mult for lazyinit, for the case there is
- * no mount option specified.
- */
- sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
-
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ set_opt(sb, DIOREAD_NOLOCK);
+}
- if (blocksize == PAGE_SIZE)
- set_opt(sb, DIOREAD_NOLOCK);
+static int ext4_handle_clustersize(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int clustersize;
- if (blocksize < EXT4_MIN_BLOCK_SIZE ||
- blocksize > EXT4_MAX_BLOCK_SIZE) {
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ if (ext4_has_feature_bigalloc(sb)) {
+ if (clustersize < sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ } else {
+ if (clustersize != sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "fragment/cluster size (%d) != "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ return -EINVAL;
+ }
+ sbi->s_cluster_bits = 0;
+ }
+ sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
ext4_msg(sb, KERN_ERR,
- "Unsupported filesystem blocksize %d (%d log_block_size)",
- blocksize, le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
+ "blocks per group (%lu) and clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group, sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
+
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
+ return 0;
+}
+
+/*
+ * ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * With non-bigalloc filesystem awu will be based upon filesystem blocksize
+ * & bdev awu units.
+ * With bigalloc it will be based upon bigalloc cluster size & bdev awu units.
+ * @sb: super block
+ */
+static void ext4_atomic_write_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *bdev = sb->s_bdev;
+ unsigned int clustersize = EXT4_CLUSTER_SIZE(sb);
+
+ if (!bdev_can_atomic_write(bdev))
+ return;
+
+ if (!ext4_has_feature_extents(sb))
+ return;
+
+ sbi->s_awu_min = max(sb->s_blocksize,
+ bdev_atomic_write_unit_min_bytes(bdev));
+ sbi->s_awu_max = min(clustersize,
+ bdev_atomic_write_unit_max_bytes(bdev));
+ if (sbi->s_awu_min && sbi->s_awu_max &&
+ sbi->s_awu_min <= sbi->s_awu_max) {
+ ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
+ sbi->s_awu_min, sbi->s_awu_max);
+ } else {
+ sbi->s_awu_min = 0;
+ sbi->s_awu_max = 0;
}
+}
+
+static void ext4_fast_commit_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Initialize fast commit stuff */
+ atomic_set(&sbi->s_fc_subtid, 0);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ sbi->s_fc_bytes = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ sbi->s_fc_ineligible_tid = 0;
+ mutex_init(&sbi->s_fc_lock);
+ memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ sbi->s_fc_replay_state.fc_regions = NULL;
+ sbi->s_fc_replay_state.fc_regions_size = 0;
+ sbi->s_fc_replay_state.fc_regions_used = 0;
+ sbi->s_fc_replay_state.fc_regions_valid = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+ sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+}
+
+static int ext4_inode_info_init(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -3898,16 +4520,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
sbi->s_first_ino);
- goto failed_mount;
+ return -EINVAL;
}
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
+ (sbi->s_inode_size > sb->s_blocksize)) {
ext4_msg(sb, KERN_ERR,
"unsupported inode size: %d",
sbi->s_inode_size);
- ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
- goto failed_mount;
+ ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
+ return -EINVAL;
}
/*
* i_atime_extra is the last extra field available for
@@ -3925,6 +4547,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_time_min = EXT4_TIMESTAMP_MIN;
}
+
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
@@ -3936,7 +4559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_want_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
@@ -3945,96 +4568,105 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_min_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
}
}
- if (sbi->s_es->s_mount_opts[0]) {
- char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
- sizeof(sbi->s_es->s_mount_opts),
- GFP_KERNEL);
- if (!s_mount_opts)
- goto failed_mount;
- if (!parse_options(s_mount_opts, sb, &journal_devnum,
- &journal_ioprio, 0)) {
- ext4_msg(sb, KERN_WARNING,
- "failed to parse options in superblock: %s",
- s_mount_opts);
- }
- kfree(s_mount_opts);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ const struct ext4_sb_encodings *encoding_info;
+ struct unicode_map *encoding;
+ __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
+
+ if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
+ return 0;
+
+ encoding_info = ext4_sb_read_encoding(es);
+ if (!encoding_info) {
+ ext4_msg(sb, KERN_ERR,
+ "Encoding requested by superblock is unknown");
+ return -EINVAL;
}
- sbi->s_def_mount_opt = sbi->s_mount_opt;
- if (!parse_options((char *) data, sb, &journal_devnum,
- &journal_ioprio, 0))
- goto failed_mount;
-#ifdef CONFIG_UNICODE
- if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
- const struct ext4_sb_encodings *encoding_info;
- struct unicode_map *encoding;
- __u16 encoding_flags;
+ encoding = utf8_load(encoding_info->version);
+ if (IS_ERR(encoding)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with superblock charset: %s-%u.%u.%u "
+ "not supported by the kernel. flags: 0x%x.",
+ encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
+ return -EINVAL;
+ }
+ ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
+ "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Can't mount with encoding and encryption");
- goto failed_mount;
- }
+ sb->s_encoding = encoding;
+ sb->s_encoding_flags = encoding_flags;
- if (ext4_sb_read_encoding(es, &encoding_info,
- &encoding_flags)) {
- ext4_msg(sb, KERN_ERR,
- "Encoding requested by superblock is unknown");
- goto failed_mount;
- }
+ return 0;
+}
+#else
+static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ return 0;
+}
+#endif
- encoding = utf8_load(encoding_info->version);
- if (IS_ERR(encoding)) {
- ext4_msg(sb, KERN_ERR,
- "can't mount with superblock charset: %s-%s "
- "not supported by the kernel. flags: 0x%x.",
- encoding_info->name, encoding_info->version,
- encoding_flags);
- goto failed_mount;
- }
- ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
- "%s-%s with flags 0x%hx", encoding_info->name,
- encoding_info->version?:"\b", encoding_flags);
+static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
- sbi->s_encoding = encoding;
- sbi->s_encoding_flags = encoding_flags;
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
+ ext4_warning(sb, "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
+
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ return -EINVAL;
}
-#endif
+ ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
+ ext4_orphan_file_block_trigger);
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n");
- /* can't mount with both data=journal and dioread_nolock. */
- clear_opt(sb, DIOREAD_NOLOCK);
- if (test_opt2(sb, EXPLICIT_DELALLOC)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and delalloc");
- goto failed_mount;
- }
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- goto failed_mount;
- }
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_WARNING,
- "encrypted files will use data=ordered "
- "instead of data journaling mode");
- }
- if (test_opt(sb, DELALLOC))
- clear_opt(sb, DELALLOC);
- } else {
- sb->s_iflags |= SB_I_CGROUPWB;
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ return -EFSBADCRC;
}
- sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+ /* Precompute checksum seed for all metadata */
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_feature_metadata_csum(sb) ||
+ ext4_has_feature_ea_inode(sb))
+ sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid,
+ sizeof(es->s_uuid));
+ return 0;
+}
+
+static int ext4_check_feature_compatibility(struct super_block *sb,
+ struct ext4_super_block *es,
+ int silent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(ext4_has_compat_features(sb) ||
@@ -4049,7 +4681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
- goto failed_mount;
+ return -EINVAL;
}
/*
@@ -4059,7 +4691,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (ext4_has_feature_ea_inode(sb)) {
ext4_msg(sb, KERN_ERR,
"ea_inode feature is not supported for Hurd");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4073,10 +4705,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* it's actually an ext[34] filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4090,10 +4722,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* it's actually an ext4 filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4103,183 +4735,50 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
- goto failed_mount;
-
- if (le32_to_cpu(es->s_log_block_size) >
- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log block size: %u",
- le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
- if (le32_to_cpu(es->s_log_cluster_size) >
- (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log cluster size: %u",
- le32_to_cpu(es->s_log_cluster_size));
- goto failed_mount;
- }
+ return -EINVAL;
- if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
- ext4_msg(sb, KERN_ERR,
- "Number of reserved GDT blocks insanely large: %d",
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
- goto failed_mount;
+ if (sbi->s_daxdev) {
+ if (sb->s_blocksize == PAGE_SIZE)
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+ else
+ ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
}
- if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
- goto failed_mount;
+ return -EINVAL;
}
- if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device.");
- goto failed_mount;
+ return -EINVAL;
}
}
if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
es->s_encryption_level);
- goto failed_mount;
- }
-
- if (sb->s_blocksize != blocksize) {
- /* Validate the filesystem blocksize */
- if (!sb_set_blocksize(sb, blocksize)) {
- ext4_msg(sb, KERN_ERR, "bad block size %d",
- blocksize);
- goto failed_mount;
- }
-
- brelse(bh);
- logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- bh = sb_bread_unmovable(sb, logical_sb_block);
- if (!bh) {
- ext4_msg(sb, KERN_ERR,
- "Can't read superblock on 2nd try");
- goto failed_mount;
- }
- es = (struct ext4_super_block *)(bh->b_data + offset);
- sbi->s_es = es;
- if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- ext4_msg(sb, KERN_ERR,
- "Magic mismatch, very weird!");
- goto failed_mount;
- }
+ return -EINVAL;
}
- has_huge_files = ext4_has_feature_huge_file(sb);
- sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
- has_huge_files);
- sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
-
- sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
- if (ext4_has_feature_64bit(sb)) {
- if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
- sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
- !is_power_of_2(sbi->s_desc_size)) {
- ext4_msg(sb, KERN_ERR,
- "unsupported descriptor size %lu",
- sbi->s_desc_size);
- goto failed_mount;
- }
- } else
- sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
-
- sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
- sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-
- sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
- if (sbi->s_inodes_per_block == 0)
- goto cantfind_ext4;
- if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
- sbi->s_inodes_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
- sbi->s_inodes_per_group);
- goto failed_mount;
- }
- sbi->s_itb_per_group = sbi->s_inodes_per_group /
- sbi->s_inodes_per_block;
- sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
- sbi->s_sbh = bh;
- sbi->s_mount_state = le16_to_cpu(es->s_state);
- sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
- sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+ return 0;
+}
- for (i = 0; i < 4; i++)
- sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
- sbi->s_def_hash_version = es->s_def_hash_version;
- if (ext4_has_feature_dir_index(sb)) {
- i = le32_to_cpu(es->s_flags);
- if (i & EXT2_FLAGS_UNSIGNED_HASH)
- sbi->s_hash_unsigned = 3;
- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
-#ifdef __CHAR_UNSIGNED__
- if (!sb_rdonly(sb))
- es->s_flags |=
- cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
- sbi->s_hash_unsigned = 3;
-#else
- if (!sb_rdonly(sb))
- es->s_flags |=
- cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
-#endif
- }
- }
+static int ext4_check_geometry(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ __u64 blocks_count;
+ int err;
- /* Handle clustersize */
- clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- if (ext4_has_feature_bigalloc(sb)) {
- if (clustersize < blocksize) {
- ext4_msg(sb, KERN_ERR,
- "cluster size (%d) smaller than "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
- le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- } else {
- if (clustersize != blocksize) {
- ext4_msg(sb, KERN_ERR,
- "fragment/cluster size (%d) != "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
- }
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
- sbi->s_cluster_bits = 0;
+ if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
+ ext4_msg(sb, KERN_ERR,
+ "Number of reserved GDT blocks insanely large: %d",
+ le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
+ return -EINVAL;
}
- sbi->s_cluster_ratio = clustersize / blocksize;
-
- /* Do we have standard group size of clustersize * 8 blocks ? */
- if (sbi->s_blocks_per_group == clustersize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
-
/*
* Test whether we have more sectors than will fit in sector_t,
* and whether the max offset is addressable by the page cache.
@@ -4289,19 +4788,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (err) {
ext4_msg(sb, KERN_ERR, "filesystem"
" too large to mount safely on this system");
- goto failed_mount;
+ return err;
}
- if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
-
/* check blocks count against device size */
- blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+ blocks_count = sb_bdev_nr_blocks(sb);
if (blocks_count && ext4_blocks_count(es) > blocks_count) {
ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
"exceeds size of device (%llu blocks)",
ext4_blocks_count(es), blocks_count);
- goto failed_mount;
+ return -EINVAL;
}
/*
@@ -4313,13 +4809,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"block %u is beyond end of filesystem (%llu)",
le32_to_cpu(es->s_first_data_block),
ext4_blocks_count(es));
- goto failed_mount;
+ return -EINVAL;
}
if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
(sbi->s_cluster_ratio == 1)) {
ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
"block is 0 with a 1k block and cluster size");
- goto failed_mount;
+ return -EINVAL;
}
blocks_count = (ext4_blocks_count(es) -
@@ -4333,7 +4829,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_blocks_count(es),
le32_to_cpu(es->s_first_data_block),
EXT4_BLOCKS_PER_GROUP(sb));
- goto failed_mount;
+ return -EINVAL;
}
sbi->s_groups_count = blocks_count;
sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
@@ -4343,9 +4839,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
le32_to_cpu(es->s_inodes_count),
((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
- ret = -EINVAL;
- goto failed_mount;
+ return -EINVAL;
}
+
+ return 0;
+}
+
+static int ext4_group_desc_init(struct super_block *sb,
+ struct ext4_super_block *es,
+ ext4_fsblk_t logical_sb_block,
+ ext4_group_t *first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned int db_count;
+ ext4_fsblk_t block;
+ int i;
+
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
if (ext4_has_feature_meta_bg(sb)) {
@@ -4354,7 +4863,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"first meta block group too large: %u "
"(group descriptor block count %u)",
le32_to_cpu(es->s_first_meta_bg), db_count);
- goto failed_mount;
+ return -EINVAL;
}
}
rcu_assign_pointer(sbi->s_group_desc,
@@ -4363,8 +4872,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
GFP_KERNEL));
if (sbi->s_group_desc == NULL) {
ext4_msg(sb, KERN_ERR, "not enough memory");
- ret = -ENOMEM;
- goto failed_mount;
+ return -ENOMEM;
}
bgl_lock_init(sbi->s_blockgroup_lock);
@@ -4372,38 +4880,537 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Pre-read the descriptors into the buffer cache */
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sb_breadahead_unmovable(sb, block);
+ ext4_sb_breadahead_unmovable(sb, block);
}
for (i = 0; i < db_count; i++) {
struct buffer_head *bh;
block = descriptor_loc(sb, logical_sb_block, i);
- bh = sb_bread_unmovable(sb, block);
- if (!bh) {
+ bh = ext4_sb_bread_unmovable(sb, block);
+ if (IS_ERR(bh)) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
- db_count = i;
- goto failed_mount2;
+ sbi->s_gdb_count = i;
+ return PTR_ERR(bh);
}
rcu_read_lock();
rcu_dereference(sbi->s_group_desc)[i] = bh;
rcu_read_unlock();
}
sbi->s_gdb_count = db_count;
- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
+ if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
- ret = -EFSCORRUPTED;
- goto failed_mount2;
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+static int ext4_load_and_init_journal(struct super_block *sb,
+ struct ext4_super_block *es,
+ struct ext4_fs_context *ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ if (err)
+ return err;
+
+ if (ext4_has_feature_64bit(sb) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+ goto out;
}
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto out;
+ }
+
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to set fast commit journal feature");
+ goto out;
+ }
+
+ /* We have now updated the journal if required, so we can
+ * validate the data journaling mode. */
+ switch (test_opt(sb, DATA_FLAGS)) {
+ case 0:
+ /* No mode set, assume a default based on the journal
+ * capabilities: ORDERED_DATA if the journal can
+ * cope, else JOURNAL_DATA
+ */
+ if (jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ set_opt(sb, ORDERED_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ } else {
+ set_opt(sb, JOURNAL_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ }
+ break;
+
+ case EXT4_MOUNT_ORDERED_DATA:
+ case EXT4_MOUNT_WRITEBACK_DATA:
+ if (!jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ goto out;
+ }
+
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
+
+ sbi->s_journal->j_submit_inode_data_buffers =
+ ext4_journal_submit_inode_data_buffers;
+ sbi->s_journal->j_finish_inode_data_buffers =
+ ext4_journal_finish_inode_data_buffers;
+
+ return 0;
+
+out:
+ ext4_journal_destroy(sbi, sbi->s_journal);
+ return -EINVAL;
+}
+
+static int ext4_check_journal_data_mode(struct super_block *sb)
+{
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
+ "data=journal disables delayed allocation, "
+ "dioread_nolock, O_DIRECT and fast_commit support!\n");
+ /* can't mount with both data=journal and dioread_nolock. */
+ clear_opt(sb, DIOREAD_NOLOCK);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ return -EINVAL;
+ }
+ if (test_opt(sb, DAX_ALWAYS)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -EINVAL;
+ }
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "encrypted files will use data=ordered "
+ "instead of data journaling mode");
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ } else {
+ sb->s_iflags |= SB_I_CGROUPWB;
+ }
+
+ return 0;
+}
+
+static const char *ext4_has_journal_option(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ return "journal_async_commit";
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM))
+ return "journal_checksum";
+ if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+ return "commit=";
+ if (EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt))
+ return "data=";
+ if (test_opt(sb, DATA_ERR_ABORT))
+ return "data_err=abort";
+ return NULL;
+}
+
+/*
+ * Limit the maximum folio order to 2048 blocks to prevent overestimation
+ * of reserve handle credits during the folio writeback in environments
+ * where the PAGE_SIZE exceeds 4KB.
+ */
+#define EXT4_MAX_PAGECACHE_ORDER(sb) \
+ umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT))
+static void ext4_set_max_mapping_order(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ sbi->s_max_folio_order = sbi->s_min_folio_order;
+ else
+ sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb);
+}
+
+static int ext4_check_large_folio(struct super_block *sb)
+{
+ const char *err_str = NULL;
+
+ if (ext4_has_feature_encrypt(sb))
+ err_str = "encrypt";
+
+ if (!err_str) {
+ ext4_set_max_mapping_order(sb);
+ } else if (sb->s_blocksize > PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s",
+ sb->s_blocksize, PAGE_SIZE, err_str);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
+ int silent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es;
+ ext4_fsblk_t logical_sb_block;
+ unsigned long offset = 0;
+ struct buffer_head *bh;
+ int ret = -EINVAL;
+ int blocksize;
+
+ blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+ if (!blocksize) {
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+ return -EINVAL;
+ }
+
+ /*
+ * The ext4 superblock will not be buffer aligned for other than 1kB
+ * block sizes. We need to calculate the offset from buffer start.
+ */
+ if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ } else {
+ logical_sb_block = sbi->s_sb_block;
+ }
+
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ return PTR_ERR(bh);
+ }
+ /*
+ * Note: s_es must be initialized as soon as possible because
+ * some ext4 macro-instructions depend on its value
+ */
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+ if (sb->s_magic != EXT4_SUPER_MAGIC) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto out;
+ }
+
+ if (le32_to_cpu(es->s_log_block_size) >
+ (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log block size: %u",
+ le32_to_cpu(es->s_log_block_size));
+ goto out;
+ }
+ if (le32_to_cpu(es->s_log_cluster_size) >
+ (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log cluster size: %u",
+ le32_to_cpu(es->s_log_cluster_size));
+ goto out;
+ }
+
+ blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ /*
+ * If the default block size is not the same as the real block size,
+ * we need to reload it.
+ */
+ if (sb->s_blocksize == blocksize)
+ goto success;
+
+ /*
+ * bh must be released before kill_bdev(), otherwise
+ * it won't be freed and its page also. kill_bdev()
+ * is called by sb_set_blocksize().
+ */
+ brelse(bh);
+ /* Validate the filesystem blocksize */
+ if (!sb_set_blocksize(sb, blocksize)) {
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
+ blocksize);
+ bh = NULL;
+ goto out;
+ }
+
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
+ ret = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
+ es = (struct ext4_super_block *)(bh->b_data + offset);
+ sbi->s_es = es;
+ if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+ ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
+ goto out;
+ }
+
+success:
+ sbi->s_min_folio_order = get_order(blocksize);
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+out:
+ brelse(bh);
+ return ret;
+}
+
+static int ext4_hash_info_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ unsigned int i;
+
+ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_def_hash_version > DX_HASH_LAST) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid default hash set in the superblock");
+ return -EINVAL;
+ } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) {
+ ext4_msg(sb, KERN_ERR,
+ "SIPHASH is not a valid default hash value");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < 4; i++)
+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
+
+ if (ext4_has_feature_dir_index(sb)) {
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+ if (!sb_rdonly(sb))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
+#else
+ if (!sb_rdonly(sb))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+ }
+ }
+ return 0;
+}
+
+static int ext4_block_group_meta_init(struct super_block *sb, int silent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int has_huge_files;
+
+ has_huge_files = ext4_has_feature_huge_file(sb);
+ sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ has_huge_files);
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
+
+ sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
+ if (ext4_has_feature_64bit(sb)) {
+ if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
+ sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
+ !is_power_of_2(sbi->s_desc_size)) {
+ ext4_msg(sb, KERN_ERR,
+ "unsupported descriptor size %lu",
+ sbi->s_desc_size);
+ return -EINVAL;
+ }
+ } else
+ sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
+ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+
+ sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
+ if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ return -EINVAL;
+ }
+ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
+ sbi->s_inodes_per_group);
+ return -EINVAL;
+ }
+ sbi->s_itb_per_group = sbi->s_inodes_per_group /
+ sbi->s_inodes_per_block;
+ sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
+ sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
+ sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+
+ return 0;
+}
+
+/*
+ * It's hard to get stripe aligned blocks if stripe is not aligned with
+ * cluster, just disable stripe and alert user to simplify code and avoid
+ * stripe aligned allocation which will rarely succeed.
+ */
+static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ return (stripe > 0 && sbi->s_cluster_ratio > 1 &&
+ stripe % sbi->s_cluster_ratio != 0);
+}
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+{
+ struct ext4_super_block *es = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t logical_sb_block;
+ struct inode *root;
+ int needs_recovery;
+ int err;
+ ext4_group_t first_not_zeroed;
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
+
+ /* Set defaults for the variables that will be set during parsing */
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
+
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
+
+ err = ext4_load_super(sb, &logical_sb_block, silent);
+ if (err)
+ goto out_fail;
+
+ es = sbi->s_es;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+ err = ext4_init_metadata_csum(sb, es);
+ if (err)
+ goto failed_mount;
+
+ ext4_set_def_opts(sb, es);
+
+ sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es));
+ sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resuid(es));
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB;
+ sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC;
+
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+ err = ext4_inode_info_init(sb, es);
+ if (err)
+ goto failed_mount;
+
+ err = parse_apply_sb_mount_options(sb, ctx);
+ if (err < 0)
+ goto failed_mount;
+
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+ sbi->s_def_mount_opt2 = sbi->s_mount_opt2;
+
+ err = ext4_check_opt_consistency(fc, sb);
+ if (err < 0)
+ goto failed_mount;
+
+ ext4_apply_options(fc, sb);
+
+ err = ext4_check_large_folio(sb);
+ if (err < 0)
+ goto failed_mount;
+
+ err = ext4_encoding_init(sb, es);
+ if (err)
+ goto failed_mount;
+
+ err = ext4_check_journal_data_mode(sb);
+ if (err)
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+
+ /* HSM events are allowed by default. */
+ sb->s_iflags |= SB_I_ALLOW_HSM;
+
+ err = ext4_check_feature_compatibility(sb, es, silent);
+ if (err)
+ goto failed_mount;
+
+ err = ext4_block_group_meta_init(sb, silent);
+ if (err)
+ goto failed_mount;
+
+ err = ext4_hash_info_init(sb);
+ if (err)
+ goto failed_mount;
+
+ err = ext4_handle_clustersize(sb);
+ if (err)
+ goto failed_mount;
+
+ err = ext4_check_geometry(sb, es);
+ if (err)
+ goto failed_mount;
+
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
+ spin_lock_init(&sbi->s_error_lock);
+ INIT_WORK(&sbi->s_sb_upd_work, update_super_work);
- /* Register extent status tree shrinker */
- if (ext4_es_register_shrinker(sbi))
+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
+ if (err)
+ goto failed_mount3;
+
+ err = ext4_es_register_shrinker(sbi);
+ if (err)
goto failed_mount3;
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) {
+ ext4_msg(sb, KERN_WARNING,
+ "stripe (%lu) is not aligned with cluster size (%u), "
+ "stripe is disabled",
+ sbi->s_stripe, sbi->s_cluster_ratio);
+ sbi->s_stripe = 0;
+ }
sbi->s_extent_max_zeroout_kb = 32;
/*
@@ -4426,125 +5433,71 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
- memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+ super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
+ super_set_sysfs_name_bdev(sb);
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+
+ ext4_atomic_write_init(sb);
+ ext4_fast_commit_init(sb);
+
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
+ ext4_has_feature_orphan_present(sb) ||
ext4_has_feature_journal_needs_recovery(sb));
- if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
- if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+ if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
+ err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
+ if (err)
goto failed_mount3a;
+ }
+ err = -EINVAL;
/*
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
- err = ext4_load_journal(sb, es, journal_devnum);
+ err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
+ if (bdev_read_only(sb->s_bdev))
+ needs_recovery = 0;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
- goto failed_mount_wq;
+ goto failed_mount3a;
} else {
+ const char *journal_option;
+
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
- goto failed_mount_wq;
- }
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit, fs mounted w/o journal");
- goto failed_mount_wq;
- }
- if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "commit=%lu, fs mounted w/o journal",
- sbi->s_commit_interval / HZ);
- goto failed_mount_wq;
- }
- if (EXT4_MOUNT_DATA_FLAGS &
- (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "data=, fs mounted w/o journal");
- goto failed_mount_wq;
+ journal_option = ext4_has_journal_option(sb);
+ if (journal_option != NULL) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with %s, fs mounted w/o journal",
+ journal_option);
+ goto failed_mount3a;
}
+
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
sbi->s_journal = NULL;
needs_recovery = 0;
- goto no_journal;
- }
-
- if (ext4_has_feature_64bit(sb) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_64BIT)) {
- ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
- goto failed_mount_wq;
- }
-
- if (!set_journal_csum_feature_set(sb)) {
- ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
- "feature set");
- goto failed_mount_wq;
- }
-
- /* We have now updated the journal if required, so we can
- * validate the data journaling mode. */
- switch (test_opt(sb, DATA_FLAGS)) {
- case 0:
- /* No mode set, assume a default based on the journal
- * capabilities: ORDERED_DATA if the journal can
- * cope, else JOURNAL_DATA
- */
- if (jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- set_opt(sb, ORDERED_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
- } else {
- set_opt(sb, JOURNAL_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
- }
- break;
-
- case EXT4_MOUNT_ORDERED_DATA:
- case EXT4_MOUNT_WRITEBACK_DATA:
- if (!jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- ext4_msg(sb, KERN_ERR, "Journal does not support "
- "requested data journaling mode");
- goto failed_mount_wq;
- }
- default:
- break;
}
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
- test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit in data=ordered mode");
- goto failed_mount_wq;
- }
-
- set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
-
- sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
-
-no_journal:
if (!test_opt(sb, NO_MBCACHE)) {
sbi->s_ea_block_cache = ext4_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
ext4_msg(sb, KERN_ERR,
"Failed to create ea_block_cache");
+ err = -EINVAL;
goto failed_mount_wq;
}
@@ -4553,29 +5506,28 @@ no_journal:
if (!sbi->s_ea_inode_cache) {
ext4_msg(sb, KERN_ERR,
"Failed to create ea_inode_cache");
+ err = -EINVAL;
goto failed_mount_wq;
}
}
}
- if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
- goto failed_mount_wq;
- }
-
- if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
- !ext4_has_feature_encrypt(sb)) {
- ext4_set_feature_encrypt(sb);
- ext4_commit_super(sb, 1);
- }
-
/*
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -4589,7 +5541,7 @@ no_journal:
alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!EXT4_SB(sb)->rsv_conversion_wq) {
printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
- ret = -ENOMEM;
+ err = -ENOMEM;
goto failed_mount4;
}
@@ -4601,45 +5553,57 @@ no_journal:
root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(root)) {
ext4_msg(sb, KERN_ERR, "get root inode failed");
- ret = PTR_ERR(root);
+ err = PTR_ERR(root);
root = NULL;
goto failed_mount4;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
iput(root);
+ err = -EFSCORRUPTED;
goto failed_mount4;
}
-#ifdef CONFIG_UNICODE
- if (sbi->s_encoding)
- sb->s_d_op = &ext4_dentry_ops;
-#endif
-
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
- ret = -ENOMEM;
+ err = -ENOMEM;
goto failed_mount4;
}
- ret = ext4_setup_super(sb, es, sb_rdonly(sb));
- if (ret == -EROFS) {
+ err = ext4_setup_super(sb, es, sb_rdonly(sb));
+ if (err == -EROFS) {
sb->s_flags |= SB_RDONLY;
- ret = 0;
- } else if (ret)
+ } else if (err)
goto failed_mount4a;
ext4_set_resv_clusters(sb);
- err = ext4_setup_system_zone(sb);
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)", err);
- goto failed_mount4a;
+ if (test_opt(sb, BLOCK_VALIDITY)) {
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+ "zone (%d)", err);
+ goto failed_mount4a;
+ }
}
+ ext4_fc_replay_cleanup(sb);
ext4_ext_init(sb);
+
+ /*
+ * Enable optimize_scan if number of groups is > threshold. This can be
+ * turned off by passing "mb_optimize_scan=0". This can also be
+ * turned on forcefully by passing "mb_optimize_scan=1".
+ */
+ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+ else
+ clear_opt2(sb, MB_OPTIMIZE_SCAN);
+ }
+
err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
@@ -4647,38 +5611,24 @@ no_journal:
goto failed_mount5;
}
- block = ext4_count_free_clusters(sb);
- ext4_free_blocks_count_set(sbi->s_es,
- EXT4_C2B(sbi, block));
- ext4_superblock_csum_set(sb);
- err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
- GFP_KERNEL);
- if (!err) {
- unsigned long freei = ext4_count_free_inodes(sb);
- sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
- ext4_superblock_csum_set(sb);
- err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
- GFP_KERNEL);
- }
- if (!err)
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb), GFP_KERNEL);
- if (!err)
- err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
- GFP_KERNEL);
- if (!err)
- err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
+ /*
+ * We can only set up the journal commit callback once
+ * mballoc is initialized
+ */
+ if (sbi->s_journal)
+ sbi->s_journal->j_commit_callback =
+ ext4_journal_commit_callback;
- if (err) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
+ err = ext4_percpu_param_init(sbi);
+ if (err)
goto failed_mount6;
- }
if (ext4_has_feature_flex_bg(sb))
if (!ext4_fill_flex_info(sb)) {
ext4_msg(sb, KERN_ERR,
"unable to initialize "
"flex_bg meta info!");
+ err = -ENOMEM;
goto failed_mount6;
}
@@ -4686,10 +5636,9 @@ no_journal:
if (err)
goto failed_mount6;
- err = ext4_register_sysfs(sb);
+ err = ext4_init_orphan_info(sb);
if (err)
goto failed_mount7;
-
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
@@ -4699,38 +5648,36 @@ no_journal:
}
#endif /* CONFIG_QUOTA */
+ /*
+ * Save the original bdev mapping's wb_err value which could be
+ * used to detect the metadata async write error.
+ */
+ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+ /*
+ * Update the checksum after updating free space/inode counters and
+ * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
+ * checksum in the buffer cache until it is written out and
+ * e2fsprogs programs trying to open a file system immediately
+ * after it is mounted can fail.
+ */
+ ext4_superblock_csum_set(sb);
if (needs_recovery) {
ext4_msg(sb, KERN_INFO, "recovery complete");
- ext4_mark_recovery_complete(sb, es);
+ err = ext4_mark_recovery_complete(sb, es);
+ if (err)
+ goto failed_mount9;
}
- if (EXT4_SB(sb)->s_journal) {
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- descr = " journalled data mode";
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- descr = " ordered data mode";
- else
- descr = " writeback data mode";
- } else
- descr = "out journal";
- if (test_opt(sb, DISCARD)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- ext4_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) {
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
+ clear_opt(sb, DISCARD);
}
- if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Opts: %.*s%s%s", descr,
- (int) sizeof(sbi->s_es->s_mount_opts),
- sbi->s_es->s_mount_opts,
- *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
-
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -4738,36 +5685,26 @@ no_journal:
ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+ atomic_set(&sbi->s_warning_count, 0);
+ atomic_set(&sbi->s_msg_count, 0);
- kfree(orig_data);
- return 0;
+ /* Register sysfs after all initializations are complete. */
+ err = ext4_register_sysfs(sb);
+ if (err)
+ goto failed_mount9;
-cantfind_ext4:
- if (!silent)
- ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
- goto failed_mount;
+ return 0;
-#ifdef CONFIG_QUOTA
-failed_mount8:
- ext4_unregister_sysfs(sb);
-#endif
+failed_mount9:
+ ext4_quotas_off(sb, EXT4_MAXQUOTAS);
+failed_mount8: __maybe_unused
+ ext4_release_orphan_info(sb);
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
ext4_mb_release(sb);
- rcu_read_lock();
- flex_groups = rcu_dereference(sbi->s_flex_groups);
- if (flex_groups) {
- for (i = 0; i < sbi->s_flex_groups_allocated; i++)
- kvfree(flex_groups[i]);
- kvfree(flex_groups);
- }
- rcu_read_unlock();
- percpu_counter_destroy(&sbi->s_freeclusters_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_writepages_rwsem);
+ ext4_flex_groups_free(sbi);
+ ext4_percpu_param_destroy(sbi);
failed_mount5:
ext4_ext_release(sb);
ext4_release_system_zone(sb);
@@ -4786,45 +5723,90 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ ext4_journal_destroy(sbi, sbi->s_journal);
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
- del_timer_sync(&sbi->s_err_report);
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
-failed_mount2:
- rcu_read_lock();
- group_desc = rcu_dereference(sbi->s_group_desc);
- for (i = 0; i < db_count; i++)
- brelse(group_desc[i]);
- kvfree(group_desc);
- rcu_read_unlock();
+ /* flush s_sb_upd_work before sbi destroy */
+ flush_work(&sbi->s_sb_upd_work);
+ ext4_stop_mmpd(sbi);
+ timer_delete_sync(&sbi->s_err_report);
+ ext4_group_desc_free(sbi);
failed_mount:
- if (sbi->s_chksum_driver)
- crypto_free_shash(sbi->s_chksum_driver);
-
-#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+#if IS_ENABLED(CONFIG_UNICODE)
+ utf8_unload(sb->s_encoding);
#endif
#ifdef CONFIG_QUOTA
- for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
- ext4_blkdev_remove(sbi);
- brelse(bh);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
+ brelse(sbi->s_sbh);
+ if (sbi->s_journal_bdev_file) {
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
+ bdev_fput(sbi->s_journal_bdev_file);
+ }
out_fail:
+ invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
- kfree(sbi->s_blockgroup_lock);
-out_free_base:
- kfree(sbi);
- kfree(orig_data);
- fs_put_dax(dax_dev);
- return err ? err : ret;
+ return err;
+}
+
+static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi;
+ const char *descr;
+ int ret;
+
+ sbi = ext4_alloc_sbi(sb);
+ if (!sbi)
+ return -ENOMEM;
+
+ fc->s_fs_info = sbi;
+
+ /* Cleanup superblock name */
+ strreplace(sb->s_id, '/', '!');
+
+ sbi->s_sb_block = 1; /* Default super block location */
+ if (ctx->spec & EXT4_SPEC_s_sb_block)
+ sbi->s_sb_block = ctx->s_sb_block;
+
+ ret = __ext4_fill_super(fc, sb);
+ if (ret < 0)
+ goto free_sbi;
+
+ if (sbi->s_journal) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ descr = " journalled data mode";
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ descr = " ordered data mode";
+ else
+ descr = " writeback data mode";
+ } else
+ descr = "out journal";
+
+ if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
+ ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. "
+ "Quota mode: %s.", &sb->s_uuid,
+ sb_rdonly(sb) ? "ro" : "r/w", descr,
+ ext4_quota_mode(sb));
+
+ /* Update the s_overhead_clusters if necessary */
+ ext4_update_overhead(sb, false);
+ return 0;
+
+free_sbi:
+ ext4_free_sbi(sbi);
+ fc->s_fs_info = NULL;
+ return ret;
+}
+
+static int ext4_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, ext4_fill_super);
}
/*
@@ -4839,16 +5821,18 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_commit_interval = sbi->s_commit_interval;
journal->j_min_batch_time = sbi->s_min_batch_time;
journal->j_max_batch_time = sbi->s_max_batch_time;
+ ext4_fc_init(sb, journal);
write_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER))
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
- if (test_opt(sb, DATA_ERR_ABORT))
- journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
- else
- journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
+ /*
+ * Always enable journal cycle record option, letting the journal
+ * records log transactions continuously between each mount.
+ */
+ journal->j_flags |= JBD2_CYCLE_RECORD;
write_unlock(&journal->j_state_lock);
}
@@ -4865,81 +5849,110 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
if (IS_ERR(journal_inode)) {
ext4_msg(sb, KERN_ERR, "no journal found");
- return NULL;
+ return ERR_CAST(journal_inode);
}
if (!journal_inode->i_nlink) {
make_bad_inode(journal_inode);
iput(journal_inode);
ext4_msg(sb, KERN_ERR, "journal inode is deleted");
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
}
-
- jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
- journal_inode, journal_inode->i_size);
- if (!S_ISREG(journal_inode->i_mode)) {
+ if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
}
+
+ ext4_debug("Journal inode found at %p: %lld bytes\n",
+ journal_inode, journal_inode->i_size);
return journal_inode;
}
-static journal_t *ext4_get_journal(struct super_block *sb,
- unsigned int journal_inum)
+static int ext4_journal_bmap(journal_t *journal, sector_t *block)
+{
+ struct ext4_map_blocks map;
+ int ret;
+
+ if (journal->j_inode == NULL)
+ return 0;
+
+ map.m_lblk = *block;
+ map.m_len = 1;
+ ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0);
+ if (ret <= 0) {
+ ext4_msg(journal->j_inode->i_sb, KERN_CRIT,
+ "journal bmap failed: block %llu ret %d\n",
+ *block, ret);
+ jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED);
+ return ret;
+ }
+ *block = map.m_pblk;
+ return 0;
+}
+
+static journal_t *ext4_open_inode_journal(struct super_block *sb,
+ unsigned int journal_inum)
{
struct inode *journal_inode;
journal_t *journal;
- BUG_ON(!ext4_has_feature_journal(sb));
-
journal_inode = ext4_get_journal_inode(sb, journal_inum);
- if (!journal_inode)
- return NULL;
+ if (IS_ERR(journal_inode))
+ return ERR_CAST(journal_inode);
journal = jbd2_journal_init_inode(journal_inode);
- if (!journal) {
+ if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "Could not load journal inode");
iput(journal_inode);
- return NULL;
+ return ERR_CAST(journal);
}
journal->j_private = sb;
+ journal->j_bmap = ext4_journal_bmap;
ext4_init_journal_params(sb, journal);
return journal;
}
-static journal_t *ext4_get_dev_journal(struct super_block *sb,
- dev_t j_dev)
+static struct file *ext4_get_journal_blkdev(struct super_block *sb,
+ dev_t j_dev, ext4_fsblk_t *j_start,
+ ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
- journal_t *journal;
- ext4_fsblk_t start;
- ext4_fsblk_t len;
+ struct block_device *bdev;
+ struct file *bdev_file;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
struct ext4_super_block *es;
- struct block_device *bdev;
-
- BUG_ON(!ext4_has_feature_journal(sb));
+ int errno;
- bdev = ext4_blkdev_get(j_dev, sb);
- if (bdev == NULL)
- return NULL;
+ bdev_file = bdev_file_open_by_dev(j_dev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ sb, &fs_holder_ops);
+ if (IS_ERR(bdev_file)) {
+ ext4_msg(sb, KERN_ERR,
+ "failed to open journal device unknown-block(%u,%u) %ld",
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
+ return bdev_file;
+ }
+ bdev = file_bdev(bdev_file);
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
ext4_msg(sb, KERN_ERR,
"blocksize too small for journal device");
+ errno = -EINVAL;
goto out_bdev;
}
sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
- set_blocksize(bdev, blocksize);
- if (!(bh = __bread(bdev, sb_block, blocksize))) {
+ set_blocksize(bdev_file, blocksize);
+ bh = __bread(bdev, sb_block, blocksize);
+ if (!bh) {
ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
"external journal");
+ errno = -EINVAL;
goto out_bdev;
}
@@ -4947,59 +5960,74 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- ext4_msg(sb, KERN_ERR, "external journal has "
- "bad superblock");
- brelse(bh);
- goto out_bdev;
+ ext4_msg(sb, KERN_ERR, "external journal has bad superblock");
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
if ((le32_to_cpu(es->s_feature_ro_compat) &
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
- ext4_msg(sb, KERN_ERR, "external journal has "
- "corrupt superblock");
- brelse(bh);
- goto out_bdev;
+ es->s_checksum != ext4_superblock_csum(es)) {
+ ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
ext4_msg(sb, KERN_ERR, "journal UUID does not match");
- brelse(bh);
- goto out_bdev;
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
- len = ext4_blocks_count(es);
- start = sb_block + 1;
- brelse(bh); /* we're done with the superblock */
+ *j_start = sb_block + 1;
+ *j_len = ext4_blocks_count(es);
+ brelse(bh);
+ return bdev_file;
+
+out_bh:
+ brelse(bh);
+out_bdev:
+ bdev_fput(bdev_file);
+ return ERR_PTR(errno);
+}
- journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
- start, len, blocksize);
- if (!journal) {
+static journal_t *ext4_open_dev_journal(struct super_block *sb,
+ dev_t j_dev)
+{
+ journal_t *journal;
+ ext4_fsblk_t j_start;
+ ext4_fsblk_t j_len;
+ struct file *bdev_file;
+ int errno = 0;
+
+ bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
+
+ journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start,
+ j_len, sb->s_blocksize);
+ if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
+ errno = PTR_ERR(journal);
goto out_bdev;
}
- journal->j_private = sb;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
- wait_on_buffer(journal->j_sb_buffer);
- if (!buffer_uptodate(journal->j_sb_buffer)) {
- ext4_msg(sb, KERN_ERR, "I/O error on journal device");
- goto out_journal;
- }
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
ext4_msg(sb, KERN_ERR, "External journal has more than one "
"user (unsupported) - %d",
be32_to_cpu(journal->j_superblock->s_nr_users));
+ errno = -EINVAL;
goto out_journal;
}
- EXT4_SB(sb)->journal_bdev = bdev;
+ journal->j_private = sb;
+ EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
out_bdev:
- ext4_blkdev_put(bdev);
- return NULL;
+ bdev_fput(bdev_file);
+ return ERR_PTR(errno);
}
static int ext4_load_journal(struct super_block *sb,
@@ -5011,8 +6039,10 @@ static int ext4_load_journal(struct super_block *sb,
dev_t journal_dev;
int err = 0;
int really_read_only;
+ int journal_dev_ro;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return -EFSCORRUPTED;
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -5022,7 +6052,31 @@ static int ext4_load_journal(struct super_block *sb,
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
- really_read_only = bdev_read_only(sb->s_bdev);
+ if (journal_inum && journal_dev) {
+ ext4_msg(sb, KERN_ERR,
+ "filesystem has both journal inode and journal device!");
+ return -EINVAL;
+ }
+
+ if (journal_inum) {
+ journal = ext4_open_inode_journal(sb, journal_inum);
+ if (IS_ERR(journal))
+ return PTR_ERR(journal);
+ } else {
+ journal = ext4_open_dev_journal(sb, journal_dev);
+ if (IS_ERR(journal))
+ return PTR_ERR(journal);
+ }
+
+ journal_dev_ro = bdev_read_only(journal->j_dev);
+ really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
+
+ if (journal_dev_ro && !sb_rdonly(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "journal device read-only, try mounting with '-o ro'");
+ err = -EROFS;
+ goto err_out;
+ }
/*
* Are we loading a blank journal or performing recovery after a
@@ -5037,27 +6091,14 @@ static int ext4_load_journal(struct super_block *sb,
ext4_msg(sb, KERN_ERR, "write access "
"unavailable, cannot proceed "
"(try mounting with noload)");
- return -EROFS;
+ err = -EROFS;
+ goto err_out;
}
ext4_msg(sb, KERN_INFO, "write access will "
"be enabled during recovery");
}
}
- if (journal_inum && journal_dev) {
- ext4_msg(sb, KERN_ERR, "filesystem has both journal "
- "and inode journals!");
- return -EINVAL;
- }
-
- if (journal_inum) {
- if (!(journal = ext4_get_journal(sb, journal_inum)))
- return -EINVAL;
- } else {
- if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
- return -EINVAL;
- }
-
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
@@ -5065,52 +6106,71 @@ static int ext4_load_journal(struct super_block *sb,
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+ __le16 orig_state;
+ bool changed = false;
+
if (save)
memcpy(save, ((char *) es) +
EXT4_S_ERR_START, EXT4_S_ERR_LEN);
err = jbd2_journal_load(journal);
- if (save)
+ if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
+ save, EXT4_S_ERR_LEN)) {
memcpy(((char *) es) + EXT4_S_ERR_START,
save, EXT4_S_ERR_LEN);
+ changed = true;
+ }
kfree(save);
+ orig_state = es->s_state;
+ es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
+ EXT4_ERROR_FS);
+ if (orig_state != es->s_state)
+ changed = true;
+ /* Write out restored error information to the superblock */
+ if (changed && !really_read_only) {
+ int err2;
+ err2 = ext4_commit_super(sb);
+ err = err ? : err2;
+ }
}
if (err) {
ext4_msg(sb, KERN_ERR, "error loading journal");
- jbd2_journal_destroy(journal);
- return err;
+ goto err_out;
}
EXT4_SB(sb)->s_journal = journal;
- ext4_clear_journal_err(sb, es);
+ err = ext4_clear_journal_err(sb, es);
+ if (err) {
+ ext4_journal_destroy(EXT4_SB(sb), journal);
+ return err;
+ }
if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
es->s_journal_dev = cpu_to_le32(journal_devnum);
-
- /* Make sure we flush the recovery flag to disk. */
- ext4_commit_super(sb, 1);
+ ext4_commit_super(sb);
+ }
+ if (!really_read_only && journal_inum &&
+ journal_inum != le32_to_cpu(es->s_journal_inum)) {
+ es->s_journal_inum = cpu_to_le32(journal_inum);
+ ext4_commit_super(sb);
}
return 0;
+
+err_out:
+ ext4_journal_destroy(EXT4_SB(sb), journal);
+ return err;
}
-static int ext4_commit_super(struct super_block *sb, int sync)
+/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */
+static void ext4_update_super(struct super_block *sb)
{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
- int error = 0;
-
- if (!sbh || block_device_ejected(sb))
- return error;
-
- /*
- * The superblock bh should be mapped, but it might not be if the
- * device was hot-removed. Not much we can do but fail the I/O.
- */
- if (!buffer_mapped(sbh))
- return error;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *sbh = sbi->s_sbh;
+ lock_buffer(sbh);
/*
* If the file system is mounted read-only, don't update the
* superblock write time. This avoids updating the superblock
@@ -5121,29 +6181,79 @@ static int ext4_commit_super(struct super_block *sb, int sync)
* the clock is set in the future, and this will cause e2fsck
* to complain and force a full file system check.
*/
- if (!(sb->s_flags & SB_RDONLY))
+ if (!sb_rdonly(sb))
ext4_update_tstamp(es, s_wtime);
- if (sb->s_bdev->bd_part)
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1));
- else
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
+ es->s_kbytes_written =
+ cpu_to_le64(sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+ sbi->s_sectors_written_start) >> 1));
+ if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
ext4_free_blocks_count_set(es,
- EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeclusters_counter)));
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+ EXT4_C2B(sbi, percpu_counter_sum_positive(
+ &sbi->s_freeclusters_counter)));
+ if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
es->s_free_inodes_count =
cpu_to_le32(percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeinodes_counter));
- BUFFER_TRACE(sbh, "marking dirty");
+ &sbi->s_freeinodes_counter));
+ /* Copy error information to the on-disk superblock */
+ spin_lock(&sbi->s_error_lock);
+ if (sbi->s_add_error_count > 0) {
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ if (!es->s_first_error_time && !es->s_first_error_time_hi) {
+ __ext4_update_tstamp(&es->s_first_error_time,
+ &es->s_first_error_time_hi,
+ sbi->s_first_error_time);
+ strtomem_pad(es->s_first_error_func,
+ sbi->s_first_error_func, 0);
+ es->s_first_error_line =
+ cpu_to_le32(sbi->s_first_error_line);
+ es->s_first_error_ino =
+ cpu_to_le32(sbi->s_first_error_ino);
+ es->s_first_error_block =
+ cpu_to_le64(sbi->s_first_error_block);
+ es->s_first_error_errcode =
+ ext4_errno_to_code(sbi->s_first_error_code);
+ }
+ __ext4_update_tstamp(&es->s_last_error_time,
+ &es->s_last_error_time_hi,
+ sbi->s_last_error_time);
+ strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0);
+ es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
+ es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
+ es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
+ es->s_last_error_errcode =
+ ext4_errno_to_code(sbi->s_last_error_code);
+ /*
+ * Start the daily error reporting function if it hasn't been
+ * started already
+ */
+ if (!es->s_error_count)
+ mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
+ le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
+ sbi->s_add_error_count = 0;
+ }
+ spin_unlock(&sbi->s_error_lock);
+
ext4_superblock_csum_set(sb);
- if (sync)
- lock_buffer(sbh);
+ unlock_buffer(sbh);
+}
+
+static int ext4_commit_super(struct super_block *sb)
+{
+ struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+
+ if (!sbh)
+ return -EINVAL;
+
+ ext4_update_super(sb);
+
+ lock_buffer(sbh);
+ /* Buffer got discarded which means block device got invalidated */
+ if (!buffer_mapped(sbh)) {
+ unlock_buffer(sbh);
+ return -EIO;
+ }
+
if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
@@ -5158,19 +6268,21 @@ static int ext4_commit_super(struct super_block *sb, int sync)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- mark_buffer_dirty(sbh);
- if (sync) {
- unlock_buffer(sbh);
- error = __sync_dirty_buffer(sbh,
- REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
- if (buffer_write_io_error(sbh)) {
- ext4_msg(sb, KERN_ERR, "I/O error while writing "
- "superblock");
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
- }
+ get_bh(sbh);
+ /* Clear potential dirty bit if it was journalled update */
+ clear_buffer_dirty(sbh);
+ sbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE | REQ_SYNC |
+ (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+ wait_on_buffer(sbh);
+ if (buffer_write_io_error(sbh)) {
+ ext4_msg(sb, KERN_ERR, "I/O error while writing "
+ "superblock");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ return -EIO;
}
- return error;
+ return 0;
}
/*
@@ -5178,26 +6290,39 @@ static int ext4_commit_super(struct super_block *sb, int sync)
* remounting) the filesystem readonly, then we will end up with a
* consistent fs on disk. Record that fact.
*/
-static void ext4_mark_recovery_complete(struct super_block *sb,
- struct ext4_super_block *es)
+static int ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es)
{
+ int err;
journal_t *journal = EXT4_SB(sb)->s_journal;
if (!ext4_has_feature_journal(sb)) {
- BUG_ON(journal != NULL);
- return;
+ if (journal != NULL) {
+ ext4_error(sb, "Journal got removed while the fs was "
+ "mounted!");
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
jbd2_journal_lock_updates(journal);
- if (jbd2_journal_flush(journal) < 0)
+ err = jbd2_journal_flush(journal, 0);
+ if (err < 0)
goto out;
- if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
+ if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
+ ext4_has_feature_orphan_present(sb))) {
+ if (!ext4_orphan_file_empty(sb)) {
+ ext4_error(sb, "Orphan file not empty on read-only fs.");
+ err = -EFSCORRUPTED;
+ goto out;
+ }
ext4_clear_feature_journal_needs_recovery(sb);
- ext4_commit_super(sb, 1);
+ ext4_clear_feature_orphan_present(sb);
+ ext4_commit_super(sb);
}
-
out:
jbd2_journal_unlock_updates(journal);
+ return err;
}
/*
@@ -5205,14 +6330,17 @@ out:
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext4_clear_journal_err(struct super_block *sb,
+static int ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es)
{
journal_t *journal;
int j_errno;
const char *errstr;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (!ext4_has_feature_journal(sb)) {
+ ext4_error(sb, "Journal got removed while the fs was mounted!");
+ return -EFSCORRUPTED;
+ }
journal = EXT4_SB(sb)->s_journal;
@@ -5228,15 +6356,18 @@ static void ext4_clear_journal_err(struct super_block *sb,
errstr = ext4_decode_error(sb, j_errno, nbuf);
ext4_warning(sb, "Filesystem error recorded "
"from previous mount: %s", errstr);
- ext4_warning(sb, "Marking fs in need of filesystem check.");
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super(sb, 1);
+ j_errno = ext4_commit_super(sb);
+ if (j_errno)
+ return j_errno;
+ ext4_warning(sb, "Marked fs in need of filesystem check.");
jbd2_journal_clear_err(journal);
jbd2_journal_update_sb_errno(journal);
}
+ return 0;
}
/*
@@ -5245,13 +6376,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
*/
int ext4_force_commit(struct super_block *sb)
{
- journal_t *journal;
-
- if (sb_rdonly(sb))
- return 0;
-
- journal = EXT4_SB(sb)->s_journal;
- return ext4_journal_force_commit(journal);
+ return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -5261,8 +6386,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
- return 0;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@@ -5291,7 +6417,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
needs_barrier = true;
if (needs_barrier) {
int err;
- err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(sb->s_bdev);
if (!ret)
ret = err;
}
@@ -5310,12 +6436,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
static int ext4_freeze(struct super_block *sb)
{
int error = 0;
- journal_t *journal;
-
- if (sb_rdonly(sb))
- return 0;
-
- journal = EXT4_SB(sb)->s_journal;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
if (journal) {
/* Now we set up the journal barrier. */
@@ -5325,15 +6446,17 @@ static int ext4_freeze(struct super_block *sb)
* Don't clear the needs_recovery flag if we failed to
* flush the journal.
*/
- error = jbd2_journal_flush(journal);
+ error = jbd2_journal_flush(journal, 0);
if (error < 0)
goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
ext4_clear_feature_journal_needs_recovery(sb);
+ if (ext4_orphan_file_empty(sb))
+ ext4_clear_feature_orphan_present(sb);
}
- error = ext4_commit_super(sb, 1);
+ error = ext4_commit_super(sb);
out:
if (journal)
/* we rely on upper layer to stop further updates */
@@ -5347,15 +6470,17 @@ out:
*/
static int ext4_unfreeze(struct super_block *sb)
{
- if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
+ if (ext4_emergency_state(sb))
return 0;
if (EXT4_SB(sb)->s_journal) {
/* Reset the needs_recovery flag before the fs is unlocked. */
ext4_set_feature_journal_needs_recovery(sb);
+ if (ext4_has_feature_orphan_file(sb))
+ ext4_set_feature_orphan_present(sb);
}
- ext4_commit_super(sb, 1);
+ ext4_commit_super(sb);
return 0;
}
@@ -5375,24 +6500,22 @@ struct ext4_mount_options {
#endif
};
-static int ext4_remount(struct super_block *sb, int *flags, char *data)
+static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
{
+ struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
- int enable_quota = 0;
ext4_group_t g;
- unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err = 0;
+ int alloc_ctx;
#ifdef CONFIG_QUOTA
+ int enable_quota = 0;
int i, j;
char *to_free[EXT4_MAXQUOTAS];
#endif
- char *orig_data = kstrdup(data, GFP_KERNEL);
- if (data && !orig_data)
- return -ENOMEM;
/* Store the original options */
old_sb_flags = sb->s_flags;
@@ -5413,20 +6536,40 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!old_opts.s_qf_names[i]) {
for (j = 0; j < i; j++)
kfree(old_opts.s_qf_names[j]);
- kfree(orig_data);
return -ENOMEM;
}
} else
old_opts.s_qf_names[i] = NULL;
#endif
- if (sbi->s_journal && sbi->s_journal->j_task->io_context)
- journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
+ if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+ ctx->journal_ioprio =
+ sbi->s_journal->j_task->io_context->ioprio;
+ else
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
- if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
- err = -EINVAL;
- goto restore_opts;
}
+ if ((ctx->spec & EXT4_SPEC_s_stripe) &&
+ ext4_is_stripe_incompatible(sb, ctx->s_stripe)) {
+ ext4_msg(sb, KERN_WARNING,
+ "stripe (%lu) is not aligned with cluster size (%u), "
+ "stripe is disabled",
+ ctx->s_stripe, sbi->s_cluster_ratio);
+ ctx->s_stripe = 0;
+ }
+
+ /*
+ * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
+ * two calls to ext4_should_dioread_nolock() to return inconsistent
+ * values, triggering WARN_ON in ext4_add_complete_io(). we grab
+ * here s_writepages_rwsem to avoid race between writepages ops and
+ * remount.
+ */
+ alloc_ctx = ext4_writepages_down_write(sb);
+ ext4_apply_options(fc, sb);
+ ext4_writepages_up_write(sb, alloc_ctx);
+
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
@@ -5447,12 +6590,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- err = -EINVAL;
- goto restore_opts;
- }
} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -5468,15 +6605,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
- ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
- "dax flag with busy inodes while remounting");
- sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
+ if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) &&
+ !test_opt(sb, DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount");
+ err = -EINVAL;
+ goto restore_opts;
}
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
- ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
-
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -5484,19 +6619,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal) {
ext4_init_journal_params(sb, sbi->s_journal);
- set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
}
- if (*flags & SB_LAZYTIME)
- sb->s_flags |= SB_LAZYTIME;
+ /* Flush outstanding errors before changing fs state */
+ flush_work(&sbi->s_sb_upd_work);
- if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
+ if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
+ if (ext4_emergency_state(sb)) {
err = -EROFS;
goto restore_opts;
}
- if (*flags & SB_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
err = sync_filesystem(sb);
if (err < 0)
goto restore_opts;
@@ -5519,10 +6654,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
(sbi->s_mount_state & EXT4_VALID_FS))
es->s_state = cpu_to_le16(sbi->s_mount_state);
- if (sbi->s_journal)
+ if (sbi->s_journal) {
+ /*
+ * We let remount-ro finish even if marking fs
+ * as clean failed...
+ */
ext4_mark_recovery_complete(sb, es);
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ }
} else {
/* Make sure we can mount this feature set readwrite */
if (ext4_has_feature_readonly(sb) ||
@@ -5553,7 +6691,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* around from a previously readonly bdev mount,
* require a full umount/remount for now.
*/
- if (es->s_last_orphan) {
+ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
ext4_msg(sb, KERN_WARNING, "Couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
@@ -5568,48 +6706,49 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* been changed by e2fsck since we originally mounted
* the partition.)
*/
- if (sbi->s_journal)
- ext4_clear_journal_err(sb, es);
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ if (sbi->s_journal) {
+ err = ext4_clear_journal_err(sb, es);
+ if (err)
+ goto restore_opts;
+ }
+ sbi->s_mount_state = (le16_to_cpu(es->s_state) &
+ ~EXT4_FC_REPLAY);
err = ext4_setup_super(sb, es, 0);
if (err)
goto restore_opts;
sb->s_flags &= ~SB_RDONLY;
- if (ext4_has_feature_mmp(sb))
- if (ext4_multi_mount_protect(sb,
- le64_to_cpu(es->s_mmp_block))) {
- err = -EROFS;
+ if (ext4_has_feature_mmp(sb)) {
+ err = ext4_multi_mount_protect(sb,
+ le64_to_cpu(es->s_mmp_block));
+ if (err)
goto restore_opts;
- }
+ }
+#ifdef CONFIG_QUOTA
enable_quota = 1;
+#endif
}
}
/*
- * Reinitialize lazy itable initialization thread based on
- * current settings
+ * Handle creation of system zone data early because it can fail.
+ * Releasing of existing data is done when we are sure remount will
+ * succeed.
*/
- if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
- ext4_unregister_li_request(sb);
- else {
- ext4_group_t first_not_zeroed;
- first_not_zeroed = ext4_has_uninit_itable(sb);
- ext4_register_li_request(sb, first_not_zeroed);
+ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
+ err = ext4_setup_system_zone(sb);
+ if (err)
+ goto restore_opts;
}
- ext4_setup_system_zone(sb);
if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
- err = ext4_commit_super(sb, 1);
+ err = ext4_commit_super(sb);
if (err)
goto restore_opts;
}
#ifdef CONFIG_QUOTA
- /* Release old quota file names */
- for (i = 0; i < EXT4_MAXQUOTAS; i++)
- kfree(old_opts.s_qf_names[i]);
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
@@ -5619,14 +6758,48 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
}
+ /* Release old quota file names */
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(old_opts.s_qf_names[i]);
#endif
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
+ ext4_release_system_zone(sb);
+
+ /*
+ * Reinitialize lazy itable initialization thread based on
+ * current settings
+ */
+ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
+ ext4_unregister_li_request(sb);
+ else {
+ ext4_group_t first_not_zeroed;
+ first_not_zeroed = ext4_has_uninit_itable(sb);
+ ext4_register_li_request(sb, first_not_zeroed);
+ }
+
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
+
+ /*
+ * Handle aborting the filesystem as the last thing during remount to
+ * avoid obsure errors during remount when some option changes fail to
+ * apply due to shutdown filesystem.
+ */
+ if (test_opt2(sb, ABORT))
+ ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
- ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
- kfree(orig_data);
return 0;
restore_opts:
+ /*
+ * If there was a failing r/w to ro transition, we may need to
+ * re-enable quota
+ */
+ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
+ sb_any_quota_suspended(sb))
+ dquot_resume(sb, -1);
+
+ alloc_ctx = ext4_writepages_down_write(sb);
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -5635,6 +6808,10 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ ext4_writepages_up_write(sb, alloc_ctx);
+
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
+ ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
@@ -5645,10 +6822,34 @@ restore_opts:
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(to_free[i]);
#endif
- kfree(orig_data);
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
return err;
}
+static int ext4_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ int ret;
+ bool old_ro = sb_rdonly(sb);
+
+ fc->s_fs_info = EXT4_SB(sb);
+
+ ret = ext4_check_opt_consistency(fc, sb);
+ if (ret < 0)
+ return ret;
+
+ ret = __ext4_remount(fc, sb);
+ if (ret < 0)
+ return ret;
+
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.",
+ &sb->s_uuid,
+ (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : "");
+
+ return 0;
+}
+
#ifdef CONFIG_QUOTA
static int ext4_statfs_project(struct super_block *sb,
kprojid_t projid, struct kstatfs *buf)
@@ -5668,22 +6869,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
- if (limit && buf->f_blocks > limit) {
+ if (limit) {
+ uint64_t remaining = 0;
+
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
@@ -5698,7 +6906,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_fsblk_t overhead = 0, resv_blocks;
- u64 fsid;
s64 bfree;
resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
@@ -5719,10 +6926,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
buf->f_namelen = EXT4_NAME_LEN;
- fsid = le64_to_cpup((void *)es->s_uuid) ^
- le64_to_cpup((void *)es->s_uuid + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = uuid_to_fsid(es->s_uuid);
#ifdef CONFIG_QUOTA
if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
@@ -5756,6 +6960,10 @@ static int ext4_write_dquot(struct dquot *dquot)
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to commit dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -5772,6 +6980,10 @@ static int ext4_acquire_dquot(struct dquot *dquot)
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_acquire(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to acquire dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -5782,29 +6994,47 @@ static int ext4_release_dquot(struct dquot *dquot)
{
int ret, err;
handle_t *handle;
+ bool freeze_protected = false;
+
+ /*
+ * Trying to sb_start_intwrite() in a running transaction
+ * can result in a deadlock. Further, running transactions
+ * are already protected from freezing.
+ */
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(dquot->dq_sb);
+ freeze_protected = true;
+ }
handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to release dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
+
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
+
return ret;
}
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
struct super_block *sb = dquot->dq_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- /* Are we journaling quotas? */
- if (ext4_has_feature_quota(sb) ||
- sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ if (ext4_is_quota_journalled(sb)) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
@@ -5818,7 +7048,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -5828,16 +7058,6 @@ static int ext4_write_info(struct super_block *sb, int type)
return ret;
}
-/*
- * Turn on quotas during mount time - we need to find
- * the quota file and such...
- */
-static int ext4_quota_on_mount(struct super_block *sb, int type)
-{
- return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
- EXT4_SB(sb)->s_jquota_fmt, type);
-}
-
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -5866,6 +7086,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
/* Quotafile not on the same filesystem? */
if (path->dentry->d_sb != sb)
return -EXDEV;
+
+ /* Quota already enabled for this file? */
+ if (IS_NOQUOTA(d_inode(path->dentry)))
+ return -EBUSY;
+
/* Journaling quota? */
if (EXT4_SB(sb)->s_qf_names[type]) {
/* Quotafile not in fs root? */
@@ -5882,29 +7107,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
}
- /*
- * When we journal data on quota file, we have to flush journal to see
- * all updates to the file when we bypass pagecache...
- */
- if (EXT4_SB(sb)->s_journal &&
- ext4_should_journal_data(d_inode(path->dentry))) {
- /*
- * We don't need to lock updates but journal_flush() could
- * otherwise be livelocked...
- */
- jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
- if (err)
- return err;
- }
-
lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
err = dquot_quota_on(sb, type, format_id, path);
- if (err) {
- lockdep_set_quota_inode(path->dentry->d_inode,
- I_DATA_SEM_NORMAL);
- } else {
+ if (!err) {
struct inode *inode = d_inode(path->dentry);
handle_t *handle;
@@ -5924,10 +7129,29 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
ext4_journal_stop(handle);
unlock_inode:
inode_unlock(inode);
+ if (err)
+ dquot_quota_off(sb, type);
}
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
return err;
}
+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
+{
+ switch (type) {
+ case USRQUOTA:
+ return qf_inum == EXT4_USR_QUOTA_INO;
+ case GRPQUOTA:
+ return qf_inum == EXT4_GRP_QUOTA_INO;
+ case PRJQUOTA:
+ return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
+ default:
+ BUG();
+ }
+}
+
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
@@ -5944,9 +7168,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
if (!qf_inums[type])
return -EPERM;
+ if (!ext4_check_quota_inum(type, qf_inums[type])) {
+ ext4_error(sb, "Bad quota inum: %lu, type: %d",
+ qf_inums[type], type);
+ return -EUCLEAN;
+ }
+
qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
if (IS_ERR(qf_inode)) {
- ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ ext4_error(sb, "Bad quota inode: %lu, type: %d",
+ qf_inums[type], type);
return PTR_ERR(qf_inode);
}
@@ -5962,7 +7193,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
}
/* Enable usage tracking for all quota types. */
-static int ext4_enable_quotas(struct super_block *sb)
+int ext4_enable_quotas(struct super_block *sb)
{
int type, err = 0;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
@@ -5985,11 +7216,11 @@ static int ext4_enable_quotas(struct super_block *sb)
if (err) {
ext4_warning(sb,
"Failed to enable quota tracking "
- "(type=%d, err=%d). Please run "
- "e2fsck to fix.", type, err);
- for (type--; type >= 0; type--)
- dquot_quota_off(sb, type);
+ "(type=%d, err=%d, ino=%lu). "
+ "Please run e2fsck to fix.", type,
+ err, qf_inums[type]);
+ ext4_quotas_off(sb, type);
return err;
}
}
@@ -6014,6 +7245,13 @@ static int ext4_quota_off(struct super_block *sb, int type)
err = dquot_quota_off(sb, type);
if (err || ext4_has_feature_quota(sb))
goto out_put;
+ /*
+ * When the filesystem was remounted read-only first, we cannot cleanup
+ * inode flags here. Bad luck but people should be using QUOTA feature
+ * these days anyway.
+ */
+ if (sb_rdonly(sb))
+ goto out_put;
inode_lock(inode);
/*
@@ -6028,7 +7266,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
@@ -6062,8 +7300,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
bh = ext4_bread(NULL, inode, blk, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -6092,7 +7329,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- if (EXT4_SB(sb)->s_journal && !handle) {
+ if (!handle) {
ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
" cancelled because transaction is not started",
(unsigned long long)off, (unsigned long long)len);
@@ -6120,14 +7357,14 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
if (!bh)
goto out;
BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
if (err) {
brelse(bh);
return err;
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, len);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -6143,12 +7380,6 @@ out:
}
#endif
-static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
-}
-
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static inline void register_as_ext2(void)
{
@@ -6205,32 +7436,38 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
return 1;
}
+static void ext4_kill_sb(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL;
+
+ kill_block_super(sb);
+
+ if (bdev_file)
+ bdev_fput(bdev_file);
+}
+
static struct file_system_type ext4_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext4",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext4",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = ext4_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+ FS_LBS,
};
MODULE_ALIAS_FS("ext4");
-/* Shared across all ext4 file systems */
-wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
static int __init ext4_init_fs(void)
{
- int i, err;
+ int err;
ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
- mutex_init(&ext4_li_mtx);
/* Build-time check for flags consistency */
ext4_check_flag_values();
- for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
- init_waitqueue_head(&ext4__ioend_wq[i]);
-
err = ext4_init_es();
if (err)
return err;
@@ -6261,6 +7498,11 @@ static int __init ext4_init_fs(void)
err = init_inodecache();
if (err)
goto out1;
+
+ err = ext4_fc_init_dentry_cache();
+ if (err)
+ goto out05;
+
register_as_ext3();
register_as_ext2();
err = register_filesystem(&ext4_fs_type);
@@ -6271,6 +7513,8 @@ static int __init ext4_init_fs(void)
out:
unregister_as_ext2();
unregister_as_ext3();
+ ext4_fc_destroy_dentry_cache();
+out05:
destroy_inodecache();
out1:
ext4_exit_mballoc();
@@ -6296,6 +7540,7 @@ static void __exit ext4_exit_fs(void)
unregister_as_ext2();
unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
+ ext4_fc_destroy_dentry_cache();
destroy_inodecache();
ext4_exit_mballoc();
ext4_exit_sysfs();
@@ -6309,6 +7554,5 @@ static void __exit ext4_exit_fs(void)
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
-MODULE_SOFTDEP("pre: crc32c");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index dd05af983092..645240cc0229 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -27,7 +27,7 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *cpage = NULL;
+ struct buffer_head *bh = NULL;
const void *caddr;
unsigned int max_size;
const char *paddr;
@@ -39,28 +39,90 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
caddr = EXT4_I(inode)->i_data;
max_size = sizeof(EXT4_I(inode)->i_data);
} else {
- cpage = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(cpage))
- return ERR_CAST(cpage);
- caddr = page_address(cpage);
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ caddr = bh->b_data;
max_size = inode->i_sb->s_blocksize;
}
paddr = fscrypt_get_symlink(inode, caddr, max_size, done);
- if (cpage)
- put_page(cpage);
+ brelse(bh);
return paddr;
}
+static int ext4_encrypted_symlink_getattr(struct mnt_idmap *idmap,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ ext4_getattr(idmap, path, stat, request_mask, query_flags);
+
+ return fscrypt_symlink_getattr(path, stat);
+}
+
+static void ext4_free_link(void *bh)
+{
+ brelse(bh);
+}
+
+static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ struct buffer_head *bh;
+ char *inline_link;
+
+ /*
+ * Create a new inlined symlink is not supported, just provide a
+ * method to read the leftovers.
+ */
+ if (ext4_has_inline_data(inode)) {
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ inline_link = ext4_read_inline_link(inode);
+ if (!IS_ERR(inline_link))
+ set_delayed_call(callback, kfree_link, inline_link);
+ return inline_link;
+ }
+
+ if (!dentry) {
+ bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
+ if (IS_ERR(bh) || !bh)
+ return ERR_PTR(-ECHILD);
+ if (!ext4_buffer_uptodate(bh)) {
+ brelse(bh);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ }
+
+ set_delayed_call(callback, ext4_free_link, bh);
+ nd_terminate_link(bh->b_data, inode->i_size,
+ inode->i_sb->s_blocksize - 1);
+ return bh->b_data;
+}
+
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
- .getattr = ext4_getattr,
+ .getattr = ext4_encrypted_symlink_getattr,
.listxattr = ext4_listxattr,
};
const struct inode_operations ext4_symlink_inode_operations = {
- .get_link = page_get_link,
+ .get_link = ext4_get_link,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6c9fc9e21c13..0018e09b867e 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -24,11 +24,15 @@ typedef enum {
attr_session_write_kbytes,
attr_lifetime_write_kbytes,
attr_reserved_clusters,
+ attr_sra_exceeded_retry_limit,
attr_inode_readahead,
attr_trigger_test_error,
attr_first_error_time,
attr_last_error_time,
+ attr_clusters_in_group,
+ attr_mb_order,
attr_feature,
+ attr_pointer_pi,
attr_pointer_ui,
attr_pointer_ul,
attr_pointer_u64,
@@ -62,11 +66,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
+ return sysfs_emit(buf, "%lu\n",
+ (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
sbi->s_sectors_written_start) >> 1);
}
@@ -74,12 +75,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
@@ -109,7 +107,7 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
int ret;
ret = kstrtoull(skip_spaces(buf), 0, &val);
- if (ret || val >= clusters)
+ if (ret || val >= clusters || (s64)val < 0)
return -EINVAL;
atomic64_set(&sbi->s_resv_clusters, val);
@@ -135,8 +133,8 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
{
if (!sbi->s_journal)
- return snprintf(buf, PAGE_SIZE, "<none>\n");
- return snprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "<none>\n");
+ return sysfs_emit(buf, "%d\n",
task_pid_vnr(sbi->s_journal->j_task));
}
@@ -183,12 +181,18 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \
EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
+#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname)
+
#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -205,27 +209,34 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
EXT4_ATTR_FUNC(session_write_kbytes, 0444);
EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
ext4_sb_info, s_inode_readahead_blks);
+EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
+ ext4_sb_info, s_mb_group_prealloc);
+EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
+ ext4_sb_info, s_mb_best_avail_max_trim_order);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
+EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count);
+EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode);
EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode);
@@ -240,6 +251,11 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
+EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
+EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
+EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -249,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
+ ATTR_LIST(sra_exceeded_retry_limit),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -257,6 +274,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(mb_max_linear_groups),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
@@ -266,7 +284,10 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(warning_ratelimit_burst),
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
+ ATTR_LIST(mb_best_avail_max_trim_order),
ATTR_LIST(errors_count),
+ ATTR_LIST(warning_count),
+ ATTR_LIST(msg_count),
ATTR_LIST(first_error_ino),
ATTR_LIST(last_error_ino),
ATTR_LIST(first_error_block),
@@ -283,6 +304,11 @@ static struct attribute *ext4_attrs[] = {
#ifdef CONFIG_EXT4_DEBUG
ATTR_LIST(simulate_fail),
#endif
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
+ ATTR_LIST(last_trim_minblks),
+ ATTR_LIST(sb_update_sec),
+ ATTR_LIST(sb_update_kb),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -295,13 +321,20 @@ EXT4_ATTR_FEATURE(meta_bg_resize);
EXT4_ATTR_FEATURE(encryption);
EXT4_ATTR_FEATURE(test_dummy_encryption_v2);
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
EXT4_ATTR_FEATURE(casefold);
#endif
#ifdef CONFIG_FS_VERITY
EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
+EXT4_ATTR_FEATURE(fast_commit);
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+EXT4_ATTR_FEATURE(encrypted_casefold);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+EXT4_ATTR_FEATURE(blocksize_gt_pagesize);
+#endif
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
@@ -311,13 +344,20 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(encryption),
ATTR_LIST(test_dummy_encryption_v2),
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(casefold),
#endif
#ifdef CONFIG_FS_VERITY
ATTR_LIST(verity),
#endif
ATTR_LIST(metadata_csum_seed),
+ ATTR_LIST(fast_commit),
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+ ATTR_LIST(encrypted_casefold),
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ATTR_LIST(blocksize_gt_pagesize),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(ext4_feat);
@@ -337,24 +377,56 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
{
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
((time64_t)hi << 32) + le32_to_cpu(lo));
}
#define print_tstamp(buf, es, tstamp) \
__print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
+static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ void *ptr = calc_ptr(a, sbi);
+
+ if (!ptr)
+ return 0;
+
+ switch (a->attr_id) {
+ case attr_inode_readahead:
+ case attr_clusters_in_group:
+ case attr_mb_order:
+ case attr_pointer_pi:
+ case attr_pointer_ui:
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
+ return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
+ case attr_pointer_ul:
+ return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr));
+ case attr_pointer_u8:
+ return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr));
+ case attr_pointer_u64:
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr));
+ return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr));
+ case attr_pointer_string:
+ return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr);
+ case attr_pointer_atomic:
+ return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr));
+ }
+ return 0;
+}
+
static ssize_t ext4_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
s_kobj);
struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
- void *ptr = calc_ptr(a, sbi);
switch (a->attr_id) {
case attr_delayed_allocation_blocks:
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(s64) EXT4_C2B(sbi,
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
case attr_session_write_kbytes:
@@ -362,79 +434,49 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
case attr_lifetime_write_kbytes:
return lifetime_write_kbytes_show(sbi, buf);
case attr_reserved_clusters:
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)
atomic64_read(&sbi->s_resv_clusters));
- case attr_inode_readahead:
- case attr_pointer_ui:
- if (!ptr)
- return 0;
- if (a->attr_ptr == ptr_ext4_super_block_offset)
- return snprintf(buf, PAGE_SIZE, "%u\n",
- le32_to_cpup(ptr));
- else
- return snprintf(buf, PAGE_SIZE, "%u\n",
- *((unsigned int *) ptr));
- case attr_pointer_ul:
- if (!ptr)
- return 0;
- return snprintf(buf, PAGE_SIZE, "%lu\n",
- *((unsigned long *) ptr));
- case attr_pointer_u8:
- if (!ptr)
- return 0;
- return snprintf(buf, PAGE_SIZE, "%u\n",
- *((unsigned char *) ptr));
- case attr_pointer_u64:
- if (!ptr)
- return 0;
- if (a->attr_ptr == ptr_ext4_super_block_offset)
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- le64_to_cpup(ptr));
- else
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- *((unsigned long long *) ptr));
- case attr_pointer_string:
- if (!ptr)
- return 0;
- return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size,
- (char *) ptr);
- case attr_pointer_atomic:
- if (!ptr)
- return 0;
- return snprintf(buf, PAGE_SIZE, "%d\n",
- atomic_read((atomic_t *) ptr));
+ case attr_sra_exceeded_retry_limit:
+ return sysfs_emit(buf, "%llu\n",
+ (unsigned long long)
+ percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
case attr_feature:
- return snprintf(buf, PAGE_SIZE, "supported\n");
+ return sysfs_emit(buf, "supported\n");
case attr_first_error_time:
return print_tstamp(buf, sbi->s_es, s_first_error_time);
case attr_last_error_time:
return print_tstamp(buf, sbi->s_es, s_last_error_time);
case attr_journal_task:
return journal_task_show(sbi, buf);
+ default:
+ return ext4_generic_attr_show(a, sbi, buf);
}
-
- return 0;
}
-static ssize_t ext4_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
+static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t len)
{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
- void *ptr = calc_ptr(a, sbi);
- unsigned long t;
int ret;
+ unsigned int t;
+ unsigned long lt;
+ void *ptr = calc_ptr(a, sbi);
+
+ if (!ptr)
+ return 0;
switch (a->attr_id) {
- case attr_reserved_clusters:
- return reserved_clusters_store(sbi, buf, len);
+ case attr_pointer_pi:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if ((int)t < 0)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
case attr_pointer_ui:
- if (!ptr)
- return 0;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
if (ret)
return ret;
if (a->attr_ptr == ptr_ext4_super_block_offset)
@@ -442,20 +484,50 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
else
*((unsigned int *) ptr) = t;
return len;
+ case attr_mb_order:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t > 64)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
+ case attr_clusters_in_group:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t > sbi->s_clusters_per_group)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
case attr_pointer_ul:
- if (!ptr)
- return 0;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtoul(skip_spaces(buf), 0, &lt);
if (ret)
return ret;
- *((unsigned long *) ptr) = t;
+ *((unsigned long *) ptr) = lt;
return len;
+ }
+ return 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ switch (a->attr_id) {
+ case attr_reserved_clusters:
+ return reserved_clusters_store(sbi, buf, len);
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
return trigger_test_error(sbi, buf, len);
+ default:
+ return ext4_generic_attr_store(a, sbi, buf, len);
}
- return 0;
}
static void ext4_sb_release(struct kobject *kobj)
@@ -465,23 +537,33 @@ static void ext4_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
+static void ext4_feat_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
static const struct sysfs_ops ext4_attr_ops = {
.show = ext4_attr_show,
.store = ext4_attr_store,
};
-static struct kobj_type ext4_sb_ktype = {
+static const struct kobj_type ext4_sb_ktype = {
.default_groups = ext4_groups,
.sysfs_ops = &ext4_attr_ops,
.release = ext4_sb_release,
};
-static struct kobj_type ext4_feat_ktype = {
+static const struct kobj_type ext4_feat_ktype = {
.default_groups = ext4_feat_groups,
.sysfs_ops = &ext4_attr_ops,
- .release = (void (*)(struct kobject *))kfree,
+ .release = ext4_feat_release,
};
+void ext4_notify_error_sysfs(struct ext4_sb_info *sbi)
+{
+ sysfs_notify(&sbi->s_kobj, NULL, "errors_count");
+}
+
static struct kobject *ext4_root;
static struct kobject *ext4_feat;
@@ -508,8 +590,14 @@ int ext4_register_sysfs(struct super_block *sb)
proc_create_single_data("es_shrinker_info", S_IRUGO,
sbi->s_proc, ext4_seq_es_shrinker_info_show,
sb);
+ proc_create_single_data("fc_info", 0444, sbi->s_proc,
+ ext4_fc_info_show, sb);
proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_ops, sb);
+ proc_create_single_data("mb_stats", 0444, sbi->s_proc,
+ ext4_seq_mb_stats_show, sb);
+ proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc,
+ &ext4_mb_seq_structs_summary_ops, sb);
}
return 0;
}
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index bcbe3668c1d4..ce84aa2786c7 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,14 +11,16 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ struct address_space *mapping = inode->i_mapping;
+
/*
* We don't need to call ext4_break_layouts() because the blocks we
* are truncating were never visible to userspace.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
- truncate_inode_pages(inode->i_mapping, inode->i_size);
+ filemap_invalidate_lock(mapping);
+ truncate_inode_pages(mapping, inode->i_size);
ext4_truncate(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
}
/*
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index dec1244dd062..415d9c4d8a32 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -42,21 +42,16 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
loff_t pos)
{
while (count) {
- size_t n = min_t(size_t, count,
- PAGE_SIZE - offset_in_page(pos));
- struct page *page;
- void *addr;
+ struct folio *folio;
+ size_t n;
- page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+ folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT,
NULL);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- addr = kmap_atomic(page);
- memcpy(buf, addr + offset_in_page(pos), n);
- kunmap_atomic(addr);
-
- put_page(page);
+ n = memcpy_from_file_folio(buf, folio, pos, count);
+ folio_put(folio);
buf += n;
pos += n;
@@ -72,28 +67,26 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
static int pagecache_write(struct inode *inode, const void *buf, size_t count,
loff_t pos)
{
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+
if (pos + count > inode->i_sb->s_maxbytes)
return -EFBIG;
while (count) {
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
- struct page *page;
- void *fsdata;
- void *addr;
+ struct folio *folio;
+ void *fsdata = NULL;
int res;
- res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
- &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata);
if (res)
return res;
- addr = kmap_atomic(page);
- memcpy(addr + offset_in_page(pos), buf, n);
- kunmap_atomic(addr);
+ memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n);
- res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
- page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata);
if (res < 0)
return res;
if (res != n)
@@ -113,6 +106,9 @@ static int ext4_begin_enable_verity(struct file *filp)
handle_t *handle;
int err;
+ if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EINVAL;
+
if (ext4_verity_in_progress(inode))
return -EBUSY;
@@ -198,55 +194,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
struct inode *inode = file_inode(filp);
const int credits = 2; /* superblock and inode for ext4_orphan_del() */
handle_t *handle;
+ struct ext4_iloc iloc;
int err = 0;
- int err2;
-
- if (desc != NULL) {
- /* Succeeded; write the verity descriptor. */
- err = ext4_write_verity_descriptor(inode, desc, desc_size,
- merkle_tree_size);
- /* Write all pages before clearing VERITY_IN_PROGRESS. */
- if (!err)
- err = filemap_write_and_wait(inode->i_mapping);
- }
+ /*
+ * If an error already occurred (which fs/verity/ signals by passing
+ * desc == NULL), then only clean-up is needed.
+ */
+ if (desc == NULL)
+ goto cleanup;
- /* If we failed, truncate anything we wrote past i_size. */
- if (desc == NULL || err)
- ext4_truncate(inode);
+ /* Append the verity descriptor. */
+ err = ext4_write_verity_descriptor(inode, desc, desc_size,
+ merkle_tree_size);
+ if (err)
+ goto cleanup;
/*
- * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
- * deleting the inode from the orphan list, even if something failed.
- * If everything succeeded, we'll also set the verity bit in the same
- * transaction.
+ * Write all pages (both data and verity metadata). Note that this must
+ * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages
+ * beyond i_size won't be written properly. For crash consistency, this
+ * also must happen before the verity inode flag gets persisted.
*/
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto cleanup;
- ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ /*
+ * Finally, set the verity inode flag and remove the inode from the
+ * orphan list (in a single transaction).
+ */
handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto cleanup;
}
- err2 = ext4_orphan_del(handle, inode);
- if (err2)
- goto out_stop;
+ err = ext4_orphan_del(handle, inode);
+ if (err)
+ goto stop_and_cleanup;
- if (desc != NULL && !err) {
- struct ext4_iloc iloc;
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto stop_and_cleanup;
+
+ ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+ ext4_set_inode_flags(inode, false);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (err)
+ goto stop_and_cleanup;
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out_stop;
- ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
- ext4_set_inode_flags(inode);
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
- }
-out_stop:
ext4_journal_stop(handle);
- return err ?: err2;
+
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ return 0;
+
+stop_and_cleanup:
+ ext4_journal_stop(handle);
+cleanup:
+ /*
+ * Verity failed to be enabled, so clean up by truncating any verity
+ * metadata that was written beyond i_size (both from cache and from
+ * disk), removing the inode from the orphan list (if it wasn't done
+ * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS.
+ */
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+ ext4_orphan_del(NULL, inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ return err;
}
static int ext4_get_verity_descriptor_location(struct inode *inode,
@@ -279,16 +296,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
last_extent = path[path->p_depth].p_ext;
if (!last_extent) {
EXT4_ERROR_INODE(inode, "verity file has no extents");
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return -EFSCORRUPTED;
}
end_lblk = le32_to_cpu(last_extent->ee_block) +
ext4_ext_get_actual_len(last_extent);
- desc_size_pos = (u64)end_lblk << inode->i_blkbits;
- ext4_ext_drop_refs(path);
- kfree(path);
+ desc_size_pos = EXT4_LBLK_TO_B(inode, end_lblk);
+ ext4_free_ext_path(path);
if (desc_size_pos < sizeof(desc_size_disk))
goto bad;
@@ -346,31 +361,36 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- struct page *page;
+ struct folio *folio;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
- page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
- if (!page || !PageUptodate(page)) {
- if (page)
- put_page(page);
+ folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
+ if (!IS_ERR(folio))
+ folio_put(folio);
else if (num_ra_pages > 1)
- page_cache_readahead_unbounded(inode->i_mapping, NULL,
- index, num_ra_pages, 0);
- page = read_mapping_page(inode->i_mapping, index, NULL);
+ page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
+ folio = read_mapping_folio(inode->i_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
- return page;
+ return folio_file_page(folio, index);
}
static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
- u64 index, int log_blocksize)
+ u64 pos, unsigned int size)
{
- loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize);
+ pos += ext4_verity_metadata_pos(inode);
- return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+ return pagecache_write(inode, buf, size, pos);
}
const struct fsverity_operations ext4_verityops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.begin_enable_verity = ext4_begin_enable_verity,
.end_enable_verity = ext4_end_enable_verity,
.get_verity_descriptor = ext4_get_verity_descriptor,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9b29a40738ac..2e02efbddaac 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,30 +81,30 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
struct mb_cache_entry **);
static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
size_t value_count);
+static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value,
+ size_t value_count);
static void ext4_xattr_rehash(struct ext4_xattr_header *);
static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
- [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
+ [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
#endif
[EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
+ [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
-const struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler * const ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
+ &ext4_xattr_hurd_handler,
NULL
};
@@ -121,7 +121,11 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
#ifdef CONFIG_LOCKDEP
void ext4_xattr_inode_set_class(struct inode *ea_inode)
{
+ struct ext4_inode_info *ei = EXT4_I(ea_inode);
+
lockdep_set_subclass(&ea_inode->i_rwsem, 1);
+ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */
+ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA);
}
#endif
@@ -135,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
__u32 dummy_csum = 0;
int offset = offsetof(struct ext4_xattr_header, h_checksum);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(csum, (__u8 *)hdr, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
offset += sizeof(dummy_csum);
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+ csum = ext4_chksum(csum, (__u8 *)hdr + offset,
EXT4_BLOCK_SIZE(inode->i_sb) - offset);
return cpu_to_le32(csum);
@@ -152,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
struct ext4_xattr_header *hdr = BHDR(bh);
int ret = 1;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
lock_buffer(bh);
ret = (hdr->h_checksum == ext4_xattr_block_csum(inode,
bh->b_blocknr, hdr));
@@ -164,43 +168,97 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
static void ext4_xattr_block_csum_set(struct inode *inode,
struct buffer_head *bh)
{
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
bh->b_blocknr, BHDR(bh));
}
-static inline const struct xattr_handler *
-ext4_xattr_handler(int name_index)
+static inline const char *ext4_xattr_prefix(int name_index,
+ struct dentry *dentry)
{
const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
handler = ext4_xattr_handler_map[name_index];
- return handler;
+
+ if (!xattr_handler_can_list(handler, dentry))
+ return NULL;
+
+ return xattr_prefix(handler);
}
static int
-ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
- void *value_start)
+check_xattrs(struct inode *inode, struct buffer_head *bh,
+ struct ext4_xattr_entry *entry, void *end, void *value_start,
+ const char *function, unsigned int line)
{
struct ext4_xattr_entry *e = entry;
+ int err = -EFSCORRUPTED;
+ char *err_str;
+
+ if (bh) {
+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+ BHDR(bh)->h_blocks != cpu_to_le32(1)) {
+ err_str = "invalid header";
+ goto errout;
+ }
+ if (buffer_verified(bh))
+ return 0;
+ if (!ext4_xattr_block_csum_verify(inode, bh)) {
+ err = -EFSBADCRC;
+ err_str = "invalid checksum";
+ goto errout;
+ }
+ } else {
+ struct ext4_xattr_ibody_header *header = value_start;
+
+ header -= 1;
+ if (end - (void *)header < sizeof(*header) + sizeof(u32)) {
+ err_str = "in-inode xattr block too small";
+ goto errout;
+ }
+ if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ err_str = "bad magic number in in-inode xattr";
+ goto errout;
+ }
+ }
/* Find the end of the names list */
while (!IS_LAST_ENTRY(e)) {
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
- if ((void *)next >= end)
- return -EFSCORRUPTED;
- if (strnlen(e->e_name, e->e_name_len) != e->e_name_len)
- return -EFSCORRUPTED;
+ if ((void *)next >= end) {
+ err_str = "e_name out of bounds";
+ goto errout;
+ }
+ if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) {
+ err_str = "bad e_name length";
+ goto errout;
+ }
e = next;
}
/* Check the values */
while (!IS_LAST_ENTRY(entry)) {
u32 size = le32_to_cpu(entry->e_value_size);
+ unsigned long ea_ino = le32_to_cpu(entry->e_value_inum);
- if (size > EXT4_XATTR_SIZE_MAX)
- return -EFSCORRUPTED;
+ if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) {
+ err_str = "ea_inode specified without ea_inode feature enabled";
+ goto errout;
+ }
+ if (ea_ino && ((ea_ino == EXT4_ROOT_INO) ||
+ !ext4_valid_inum(inode->i_sb, ea_ino))) {
+ err_str = "invalid ea_ino";
+ goto errout;
+ }
+ if (ea_ino && !size) {
+ err_str = "invalid size in ea xattr";
+ goto errout;
+ }
+ if (size > EXT4_XATTR_SIZE_MAX) {
+ err_str = "e_value size too large";
+ goto errout;
+ }
if (size != 0 && entry->e_value_inum == 0) {
u16 offs = le16_to_cpu(entry->e_value_offs);
@@ -212,71 +270,56 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
* the padded and unpadded sizes, since the size may
* overflow to 0 when adding padding.
*/
- if (offs > end - value_start)
- return -EFSCORRUPTED;
+ if (offs > end - value_start) {
+ err_str = "e_value out of bounds";
+ goto errout;
+ }
value = value_start + offs;
if (value < (void *)e + sizeof(u32) ||
size > end - value ||
- EXT4_XATTR_SIZE(size) > end - value)
- return -EFSCORRUPTED;
+ EXT4_XATTR_SIZE(size) > end - value) {
+ err_str = "overlapping e_value ";
+ goto errout;
+ }
}
entry = EXT4_XATTR_NEXT(entry);
}
-
+ if (bh)
+ set_buffer_verified(bh);
return 0;
+
+errout:
+ if (bh)
+ __ext4_error_inode(inode, function, line, 0, -err,
+ "corrupted xattr block %llu: %s",
+ (unsigned long long) bh->b_blocknr,
+ err_str);
+ else
+ __ext4_error_inode(inode, function, line, 0, -err,
+ "corrupted in-inode xattr: %s", err_str);
+ return err;
}
static inline int
__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
const char *function, unsigned int line)
{
- int error = -EFSCORRUPTED;
-
- if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
- BHDR(bh)->h_blocks != cpu_to_le32(1))
- goto errout;
- if (buffer_verified(bh))
- return 0;
-
- error = -EFSBADCRC;
- if (!ext4_xattr_block_csum_verify(inode, bh))
- goto errout;
- error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size,
- bh->b_data);
-errout:
- if (error)
- __ext4_error_inode(inode, function, line, 0, -error,
- "corrupted xattr block %llu",
- (unsigned long long) bh->b_blocknr);
- else
- set_buffer_verified(bh);
- return error;
+ return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size,
+ bh->b_data, function, line);
}
#define ext4_xattr_check_block(inode, bh) \
__ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
-static int
+int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
- int error = -EFSCORRUPTED;
-
- if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
- (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
- goto errout;
- error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
-errout:
- if (error)
- __ext4_error_inode(inode, function, line, 0, -error,
- "corrupted in-inode xattr");
- return error;
+ return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header),
+ function, line);
}
-#define xattr_check_inode(inode, header, end) \
- __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
-
static int
xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
void *end, int name_index, const char *name, int sorted)
@@ -299,7 +342,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
cmp = name_len - entry->e_name_len;
if (!cmp)
cmp = memcmp(name, entry->e_name, name_len);
- if (cmp <= 0 && (sorted || cmp == 0))
+ if (!cmp || (cmp < 0 && sorted))
break;
}
*pentry = entry;
@@ -309,29 +352,29 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
static u32
ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
{
- return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+ return ext4_chksum(sbi->s_csum_seed, buffer, size);
}
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64)ea_inode->i_ctime.tv_sec << 32) |
+ return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
{
- ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
+ inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0);
inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff);
}
static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
{
- return (u32)ea_inode->i_atime.tv_sec;
+ return (u32) inode_get_atime_sec(ea_inode);
}
static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
{
- ea_inode->i_atime.tv_sec = hash;
+ inode_set_atime(ea_inode, hash, 0);
}
/*
@@ -376,7 +419,7 @@ free_bhs:
return ret;
}
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
u32 ea_inode_hash, struct inode **ea_inode)
@@ -384,7 +427,18 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
struct inode *inode;
int err;
- inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
+ /*
+ * We have to check for this corruption early as otherwise
+ * iget_locked() could wait indefinitely for the state of our
+ * parent inode.
+ */
+ if (parent->i_ino == ea_ino) {
+ ext4_error(parent->i_sb,
+ "Parent and EA inode have the same ino %lu", ea_ino);
+ return -EFSCORRUPTED;
+ }
+
+ inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
ext4_error(parent->i_sb,
@@ -392,23 +446,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
err);
return err;
}
-
- if (is_bad_inode(inode)) {
- ext4_error(parent->i_sb,
- "error while reading EA inode %lu is_bad_inode",
- ea_ino);
- err = -EIO;
- goto error;
- }
-
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
- ext4_error(parent->i_sb,
- "EA inode %lu does not have EXT4_EA_INODE_FL flag",
- ea_ino);
- err = -EINVAL;
- goto error;
- }
-
ext4_xattr_inode_set_class(inode);
/*
@@ -422,16 +459,28 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
ext4_xattr_inode_set_ref(inode, 1);
} else {
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_XATTR);
inode->i_flags |= S_NOQUOTA;
inode_unlock(inode);
}
*ea_inode = inode;
return 0;
-error:
- iput(inode);
- return err;
+}
+
+/* Remove entry from mbcache when EA inode is getting evicted */
+void ext4_evict_ea_inode(struct inode *inode)
+{
+ struct mb_cache_entry *oe;
+
+ if (!EA_INODE_CACHE(inode))
+ return;
+ /* Wait for entry to get unused so that we can remove it */
+ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
+ ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
+ }
}
static int
@@ -453,8 +502,22 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
tmp_data = cpu_to_le32(hash);
e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len,
&tmp_data, 1);
+ /* All good? */
+ if (e_hash == entry->e_hash)
+ return 0;
+
+ /*
+ * Not good. Maybe the entry hash was calculated
+ * using the buggy signed char version?
+ */
+ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len,
+ &tmp_data, 1);
+ /* Still no match - bad */
if (e_hash != entry->e_hash)
return -EFSCORRUPTED;
+
+ /* Let people know about old hash */
+ pr_warn_once("ext4: filesystem with signed xattr name hash");
}
return 0;
}
@@ -587,10 +650,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
+ end = ITAIL(inode, raw_inode);
entry = IFIRST(header);
error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
if (error)
@@ -639,7 +699,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
{
int error;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (strlen(name) > 255)
@@ -662,11 +722,10 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
size_t rest = buffer_size;
for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
- const struct xattr_handler *handler =
- ext4_xattr_handler(entry->e_name_index);
+ const char *prefix;
- if (handler && (!handler->list || handler->list(dentry))) {
- const char *prefix = handler->prefix ?: handler->name;
+ prefix = ext4_xattr_prefix(entry->e_name_index, dentry);
+ if (prefix) {
size_t prefix_len = strlen(prefix);
size_t size = prefix_len + entry->e_name_len + 1;
@@ -722,7 +781,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
- void *end;
int error;
if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
@@ -732,14 +790,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
-cleanup:
brelse(iloc.bh);
return error;
}
@@ -789,9 +842,13 @@ static void ext4_xattr_update_super_block(handle_t *handle,
return;
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
+ if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE) == 0) {
+ lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_set_feature_xattr(sb);
- ext4_handle_dirty_super(handle, sb);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
}
}
@@ -803,7 +860,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
qsize_t ea_inode_refs = 0;
- void *end;
int ret;
lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
@@ -814,10 +870,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
goto out;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- ret = xattr_check_inode(inode, header, end);
- if (ret)
- goto out;
for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry))
@@ -914,7 +966,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
* so we need to reserve credits for this eventuality
*/
if (inode && ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
+ credits += ext4_chunk_trans_extent(inode, 1) + 1;
/* We are done if ea_inode feature is not enabled. */
if (!ext4_has_feature_ea_inode(sb))
@@ -970,45 +1022,36 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
- struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
struct ext4_iloc iloc;
- s64 ref_count;
- u32 hash;
+ u64 ref_count;
int ret;
- inode_lock(ea_inode);
+ inode_lock_nested(ea_inode, I_MUTEX_XATTR);
ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
if (ret)
goto out;
ref_count = ext4_xattr_inode_get_ref(ea_inode);
+ if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) {
+ ext4_error_inode(ea_inode, __func__, __LINE__, 0,
+ "EA inode %lu ref wraparound: ref_count=%lld ref_change=%d",
+ ea_inode->i_ino, ref_count, ref_change);
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
ref_count += ref_change;
ext4_xattr_inode_set_ref(ea_inode, ref_count);
if (ref_change > 0) {
- WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
- ea_inode->i_ino, ref_count);
-
if (ref_count == 1) {
WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
ea_inode->i_ino, ea_inode->i_nlink);
set_nlink(ea_inode, 1);
ext4_orphan_del(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_create(ea_inode_cache,
- GFP_NOFS, hash,
- ea_inode->i_ino,
- true /* reusable */);
- }
}
} else {
- WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
- ea_inode->i_ino, ref_count);
-
if (ref_count == 0) {
WARN_ONCE(ea_inode->i_nlink != 1,
"EA inode %lu i_nlink=%u",
@@ -1016,12 +1059,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
clear_nlink(ea_inode);
ext4_orphan_add(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_delete(ea_inode_cache, hash,
- ea_inode->i_ino);
- }
}
}
@@ -1127,15 +1164,28 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
{
struct inode *ea_inode;
struct ext4_xattr_entry *entry;
+ struct ext4_iloc iloc;
bool dirty = false;
unsigned int ea_ino;
int err;
int credits;
+ void *end;
+
+ if (block_csum)
+ end = (void *)bh->b_data + bh->b_size;
+ else {
+ err = ext4_get_inode_loc(parent, &iloc);
+ if (err) {
+ EXT4_ERROR_INODE(parent, "parent inode loc (error %d)", err);
+ return;
+ }
+ end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
+ }
/* One credit for dec ref on ea_inode, one for orphan list addition, */
credits = 2 + extra_credits;
- for (entry = first; !IS_LAST_ENTRY(entry);
+ for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
if (!entry->e_value_inum)
continue;
@@ -1164,7 +1214,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
continue;
}
if (err > 0) {
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle,
+ parent->i_sb, bh, EXT4_JTR_NONE);
if (err) {
ext4_warning_inode(ea_inode,
"Re-get write access err=%d",
@@ -1225,10 +1276,12 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
int error = 0;
BUFFER_TRACE(bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
+retry_ref:
lock_buffer(bh);
hash = le32_to_cpu(BHDR(bh)->h_hash);
ref = le32_to_cpu(BHDR(bh)->h_refcount);
@@ -1238,9 +1291,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
* This must happen under buffer lock for
* ext4_xattr_block_set() to reliably detect freed block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (oe) {
+ unlock_buffer(bh);
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto retry_ref;
+ }
+ }
get_bh(bh);
unlock_buffer(bh);
@@ -1264,7 +1326,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ce = mb_cache_entry_get(ea_block_cache, hash,
bh->b_blocknr);
if (ce) {
- ce->e_reusable = 1;
+ set_bit(MBE_REUSABLE_B, &ce->e_flags);
mb_cache_entry_put(ea_block_cache, ce);
}
}
@@ -1354,8 +1416,7 @@ retry:
block = 0;
while (wsize < bufsize) {
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
csize = (bufsize - wsize) > blocksize ? blocksize :
bufsize - wsize;
bh = ext4_getblk(handle, ea_inode, block, 0);
@@ -1367,11 +1428,18 @@ retry:
"ext4_getblk() return bh = NULL");
return -EFSCORRUPTED;
}
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (ret)
goto out;
memcpy(bh->b_data, buf, csize);
+ /*
+ * Zero out block tail to avoid writing uninitialized memory
+ * to disk.
+ */
+ if (csize < blocksize)
+ memset(bh->b_data + csize, 0, blocksize - csize);
set_buffer_uptodate(bh);
ext4_handle_dirty_metadata(handle, ea_inode, bh);
@@ -1405,6 +1473,13 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
int err;
+ if (inode->i_sb->s_root == NULL) {
+ ext4_warning(inode->i_sb,
+ "refuse to create EA inode when umounting");
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Let the next inode be the goal, so we try and allocate the EA inode
* in the same group, or nearby one.
@@ -1424,6 +1499,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
if (!err)
err = ext4_inode_attach_jinode(ea_inode);
if (err) {
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode,
+ "cleanup dec ref error %d", err);
iput(ea_inode);
return ERR_PTR(err);
}
@@ -1458,7 +1536,10 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
if (!ce)
return NULL;
- ea_data = kvmalloc(value_len, GFP_KERNEL);
+ WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
+ !(current->flags & PF_MEMALLOC_NOFS));
+
+ ea_data = kvmalloc(value_len, GFP_NOFS);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
return NULL;
@@ -1466,11 +1547,11 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
while (ce) {
ea_inode = ext4_iget(inode->i_sb, ce->e_value,
- EXT4_IGET_NORMAL);
- if (!IS_ERR(ea_inode) &&
- !is_bad_inode(ea_inode) &&
- (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
- i_size_read(ea_inode) == value_len &&
+ EXT4_IGET_EA_INODE);
+ if (IS_ERR(ea_inode))
+ goto next_entry;
+ ext4_xattr_inode_set_class(ea_inode);
+ if (i_size_read(ea_inode) == value_len &&
!ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
!ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data,
value_len) &&
@@ -1480,9 +1561,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
kvfree(ea_data);
return ea_inode;
}
-
- if (!IS_ERR(ea_inode))
- iput(ea_inode);
+ iput(ea_inode);
+ next_entry:
ce = mb_cache_entry_find_next(ea_inode_cache, ce);
}
kvfree(ea_data);
@@ -1492,45 +1572,49 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
/*
* Add value of the EA in an inode.
*/
-static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
- const void *value, size_t value_len,
- struct inode **ret_inode)
+static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
+ struct inode *inode, const void *value, size_t value_len)
{
struct inode *ea_inode;
u32 hash;
int err;
+ /* Account inode & space to quota even if sharing... */
+ err = ext4_xattr_inode_alloc_quota(inode, value_len);
+ if (err)
+ return ERR_PTR(err);
+
hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
if (ea_inode) {
err = ext4_xattr_inode_inc_ref(handle, ea_inode);
- if (err) {
- iput(ea_inode);
- return err;
- }
-
- *ret_inode = ea_inode;
- return 0;
+ if (err)
+ goto out_err;
+ return ea_inode;
}
/* Create an inode for the EA value */
ea_inode = ext4_xattr_inode_create(handle, inode, hash);
- if (IS_ERR(ea_inode))
- return PTR_ERR(ea_inode);
+ if (IS_ERR(ea_inode)) {
+ ext4_xattr_inode_free_quota(inode, NULL, value_len);
+ return ea_inode;
+ }
err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
if (err) {
- ext4_xattr_inode_dec_ref(handle, ea_inode);
- iput(ea_inode);
- return err;
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
+ goto out_err;
}
if (EA_INODE_CACHE(inode))
mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
ea_inode->i_ino, true /* reusable */);
-
- *ret_inode = ea_inode;
- return 0;
+ return ea_inode;
+out_err:
+ iput(ea_inode);
+ ext4_xattr_inode_free_quota(inode, NULL, value_len);
+ return ERR_PTR(err);
}
/*
@@ -1542,6 +1626,7 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
struct ext4_xattr_search *s,
handle_t *handle, struct inode *inode,
+ struct inode *new_ea_inode,
bool is_block)
{
struct ext4_xattr_entry *last, *next;
@@ -1549,7 +1634,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
size_t min_offs = s->end - s->base, name_len = strlen(i->name);
int in_inode = i->in_inode;
struct inode *old_ea_inode = NULL;
- struct inode *new_ea_inode = NULL;
size_t old_size, new_size;
int ret;
@@ -1610,7 +1694,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
* If storing the value in an external inode is an option,
* reserve space for xattr entries/names in the external
* attribute block so that a long value does not occupy the
- * whole space and prevent futher entries being added.
+ * whole space and prevent further entries being added.
*/
if (ext4_has_feature_ea_inode(inode->i_sb) &&
new_size && is_block &&
@@ -1634,43 +1718,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
old_ea_inode = NULL;
goto out;
}
- }
- if (i->value && in_inode) {
- WARN_ON_ONCE(!i->value_len);
- ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
- if (ret)
- goto out;
-
- ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
- i->value_len,
- &new_ea_inode);
- if (ret) {
- new_ea_inode = NULL;
- ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
- goto out;
- }
- }
-
- if (old_ea_inode) {
/* We are ready to release ref count on the old_ea_inode. */
ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
- if (ret) {
- /* Release newly required ref count on new_ea_inode. */
- if (new_ea_inode) {
- int err;
-
- err = ext4_xattr_inode_dec_ref(handle,
- new_ea_inode);
- if (err)
- ext4_warning_inode(new_ea_inode,
- "dec ref new_ea_inode err=%d",
- err);
- ext4_xattr_inode_free_quota(inode, new_ea_inode,
- i->value_len);
- }
+ if (ret)
goto out;
- }
ext4_xattr_inode_free_quota(inode, old_ea_inode,
le32_to_cpu(here->e_value_size));
@@ -1708,6 +1760,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
memmove(here, (void *)here + size,
(void *)last - (void *)here + sizeof(__u32));
memset(last, 0, size);
+
+ /*
+ * Update i_inline_off - moved ibody region might contain
+ * system.data attribute. Handling a failure here won't
+ * cause other complications for setting an xattr.
+ */
+ if (!is_block && ext4_has_inline_data(inode)) {
+ ret = ext4_find_inline_data_nolock(inode);
+ if (ret) {
+ ext4_warning_inode(inode,
+ "unable to update i_inline_off");
+ goto out;
+ }
+ }
} else if (s->not_found) {
/* Insert new name. */
size_t size = EXT4_XATTR_LEN(name_len);
@@ -1780,7 +1846,6 @@ update_hash:
ret = 0;
out:
iput(old_ea_inode);
- iput(new_ea_inode);
return ret;
}
@@ -1843,14 +1908,30 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
size_t old_ea_inode_quota = 0;
unsigned int ea_ino;
-
#define header(x) ((struct ext4_xattr_header *)(x))
+ /* If we need EA inode, prepare it before locking the buffer */
+ if (i->value && i->in_inode) {
+ WARN_ON_ONCE(!i->value_len);
+
+ ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+ i->value, i->value_len);
+ if (IS_ERR(ea_inode)) {
+ error = PTR_ERR(ea_inode);
+ ea_inode = NULL;
+ goto cleanup;
+ }
+ }
+
if (s->base) {
+ int offset = (char *)s->here - bs->bh->b_data;
+
BUFFER_TRACE(bs->bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, bs->bh);
+ error = ext4_journal_get_write_access(handle, sb, bs->bh,
+ EXT4_JTR_NONE);
if (error)
goto cleanup;
+
lock_buffer(bs->bh);
if (header(s->base)->h_refcount == cpu_to_le32(1)) {
@@ -1861,12 +1942,23 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* ext4_xattr_block_set() to reliably detect modified
* block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bs->bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache,
+ hash, bs->bh->b_blocknr);
+ if (oe) {
+ /*
+ * Xattr block is getting reused. Leave
+ * it alone.
+ */
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto clone_block;
+ }
+ }
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s, handle, inode,
- true /* is_block */);
+ ea_inode, true /* is_block */);
ext4_xattr_block_csum_set(inode, bs->bh);
unlock_buffer(bs->bh);
if (error == -EFSCORRUPTED)
@@ -1878,55 +1970,51 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (error)
goto cleanup;
goto inserted;
- } else {
- int offset = (char *)s->here - bs->bh->b_data;
+ }
+clone_block:
+ unlock_buffer(bs->bh);
+ ea_bdebug(bs->bh, "cloning");
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+ s->end = s->base + bs->bh->b_size;
- unlock_buffer(bs->bh);
- ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
- error = -ENOMEM;
- if (s->base == NULL)
+ /*
+ * If existing entry points to an xattr inode, we need
+ * to prevent ext4_xattr_set_entry() from decrementing
+ * ref count on it because the reference belongs to the
+ * original block. In this case, make the entry look
+ * like it has an empty value.
+ */
+ if (!s->not_found && s->here->e_value_inum) {
+ ea_ino = le32_to_cpu(s->here->e_value_inum);
+ error = ext4_xattr_inode_iget(inode, ea_ino,
+ le32_to_cpu(s->here->e_hash),
+ &tmp_inode);
+ if (error)
goto cleanup;
- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
- s->first = ENTRY(header(s->base)+1);
- header(s->base)->h_refcount = cpu_to_le32(1);
- s->here = ENTRY(s->base + offset);
- s->end = s->base + bs->bh->b_size;
-
- /*
- * If existing entry points to an xattr inode, we need
- * to prevent ext4_xattr_set_entry() from decrementing
- * ref count on it because the reference belongs to the
- * original block. In this case, make the entry look
- * like it has an empty value.
- */
- if (!s->not_found && s->here->e_value_inum) {
- ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino,
- le32_to_cpu(s->here->e_hash),
- &tmp_inode);
- if (error)
- goto cleanup;
-
- if (!ext4_test_inode_state(tmp_inode,
- EXT4_STATE_LUSTRE_EA_INODE)) {
- /*
- * Defer quota free call for previous
- * inode until success is guaranteed.
- */
- old_ea_inode_quota = le32_to_cpu(
- s->here->e_value_size);
- }
- iput(tmp_inode);
- s->here->e_value_inum = 0;
- s->here->e_value_size = 0;
+ if (!ext4_test_inode_state(tmp_inode,
+ EXT4_STATE_LUSTRE_EA_INODE)) {
+ /*
+ * Defer quota free call for previous
+ * inode until success is guaranteed.
+ */
+ old_ea_inode_quota = le32_to_cpu(
+ s->here->e_value_size);
}
+ iput(tmp_inode);
+
+ s->here->e_value_inum = 0;
+ s->here->e_value_size = 0;
}
} else {
/* Allocate a buffer where we construct the new block. */
s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
- /* assert(header == s->base) */
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
@@ -1938,33 +2026,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
s->end = s->base + sb->s_blocksize;
}
- error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */);
+ error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
+ true /* is_block */);
if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
- if (i->value && s->here->e_value_inum) {
- /*
- * A ref count on ea_inode has been taken as part of the call to
- * ext4_xattr_set_entry() above. We would like to drop this
- * extra ref but we have to wait until the xattr block is
- * initialized and has its own ref count on the ea_inode.
- */
- ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino,
- le32_to_cpu(s->here->e_hash),
- &ea_inode);
- if (error) {
- ea_inode = NULL;
+inserted:
+ if (!IS_LAST_ENTRY(s->first)) {
+ new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce);
+ if (IS_ERR(new_bh)) {
+ error = PTR_ERR(new_bh);
+ new_bh = NULL;
goto cleanup;
}
- }
-inserted:
- if (!IS_LAST_ENTRY(s->first)) {
- new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
- &ce);
if (new_bh) {
/* We found an identical block in the cache. */
if (new_bh == bs->bh)
@@ -1972,8 +2049,9 @@ inserted:
else {
u32 ref;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -1981,25 +2059,21 @@ inserted:
if (error)
goto cleanup;
BUFFER_TRACE(new_bh, "get_write_access");
- error = ext4_journal_get_write_access(handle,
- new_bh);
+ error = ext4_journal_get_write_access(
+ handle, sb, new_bh,
+ EXT4_JTR_NONE);
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
/*
* We have to be careful about races with
- * freeing, rehashing or adding references to
- * xattr block. Once we hold buffer lock xattr
- * block's state is stable so we can check
- * whether the block got freed / rehashed or
- * not. Since we unhash mbcache entry under
- * buffer lock when freeing / rehashing xattr
- * block, checking whether entry is still
- * hashed is reliable. Same rules hold for
- * e_reusable handling.
+ * adding references to xattr block. Once we
+ * hold buffer lock xattr block's state is
+ * stable so we can check the additional
+ * reference fits.
*/
- if (hlist_bl_unhashed(&ce->e_hash_list) ||
- !ce->e_reusable) {
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ if (ref > EXT4_XATTR_REFCOUNT_MAX) {
/*
* Undo everything and check mbcache
* again.
@@ -2014,10 +2088,9 @@ inserted:
new_bh = NULL;
goto inserted;
}
- ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
- if (ref >= EXT4_XATTR_REFCOUNT_MAX)
- ce->e_reusable = 0;
+ if (ref == EXT4_XATTR_REFCOUNT_MAX)
+ clear_bit(MBE_REUSABLE_B, &ce->e_flags);
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
ext4_xattr_block_csum_set(inode, new_bh);
@@ -2041,23 +2114,16 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal, block;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
-
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
-
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
if (error)
goto cleanup;
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
-
ea_idebug(inode, "creating block %llu",
(unsigned long long)block);
@@ -2086,7 +2152,8 @@ getblk_failed:
}
lock_buffer(new_bh);
- error = ext4_journal_get_create_access(handle, new_bh);
+ error = ext4_journal_get_create_access(handle, sb,
+ new_bh, EXT4_JTR_NONE);
if (error) {
unlock_buffer(new_bh);
error = -EIO;
@@ -2123,17 +2190,16 @@ getblk_failed:
cleanup:
if (ea_inode) {
- int error2;
-
- error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
- if (error2)
- ext4_warning_inode(ea_inode, "dec ref error=%d",
- error2);
+ if (error) {
+ int error2;
- /* If there was an error, revert the quota charge. */
- if (error)
+ error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (error2)
+ ext4_warning_inode(ea_inode, "dec ref error=%d",
+ error2);
ext4_xattr_inode_free_quota(inode, ea_inode,
i_size_read(ea_inode));
+ }
iput(ea_inode);
}
if (ce)
@@ -2163,17 +2229,15 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_inode *raw_inode;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return 0;
+
raw_inode = ext4_raw_inode(&is->iloc);
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
- is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ is->s.end = ITAIL(inode, raw_inode);
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = xattr_check_inode(inode, header, is->s.end);
- if (error)
- return error;
/* Find the named attribute. */
error = xattr_find_entry(inode, &is->s.here, is->s.end,
i->name_index, i->name, 0);
@@ -2184,43 +2248,44 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_search *s = &is->s;
+ struct inode *ea_inode = NULL;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return -ENOSPC;
- error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
- if (error)
- return error;
- header = IHDR(inode, ext4_raw_inode(&is->iloc));
- if (!IS_LAST_ENTRY(s->first)) {
- header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- } else {
- header->h_magic = cpu_to_le32(0);
- ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+
+ /* If we need EA inode, prepare it before locking the buffer */
+ if (i->value && i->in_inode) {
+ WARN_ON_ONCE(!i->value_len);
+
+ ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+ i->value, i->value_len);
+ if (IS_ERR(ea_inode))
+ return PTR_ERR(ea_inode);
}
- return 0;
-}
+ error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
+ false /* is_block */);
+ if (error) {
+ if (ea_inode) {
+ int error2;
-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
-{
- struct ext4_xattr_ibody_header *header;
- struct ext4_xattr_search *s = &is->s;
- int error;
+ error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (error2)
+ ext4_warning_inode(ea_inode, "dec ref error=%d",
+ error2);
- if (EXT4_I(inode)->i_extra_isize == 0)
- return -ENOSPC;
- error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
- if (error)
+ ext4_xattr_inode_free_quota(inode, ea_inode,
+ i_size_read(ea_inode));
+ iput(ea_inode);
+ }
return error;
+ }
header = IHDR(inode, ext4_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
@@ -2229,6 +2294,7 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
header->h_magic = cpu_to_le32(0);
ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
}
+ iput(ea_inode);
return 0;
}
@@ -2324,6 +2390,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
error = -ENOSPC;
goto cleanup;
}
+ WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
}
error = ext4_reserve_inode_write(handle, inode, &is.iloc);
@@ -2397,7 +2464,7 @@ retry_inode:
* external inode if possible.
*/
if (ext4_has_feature_ea_inode(inode->i_sb) &&
- !i.in_inode) {
+ i.value_len && !i.in_inode) {
i.in_inode = 1;
goto retry_inode;
}
@@ -2406,7 +2473,8 @@ retry_inode:
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
if (!value)
no_expand = 0;
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -2418,6 +2486,7 @@ retry_inode:
if (IS_SYNC(inode))
ext4_handle_sync(handle);
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
cleanup:
brelse(is.iloc.bh);
@@ -2488,6 +2557,8 @@ retry:
error = ext4_xattr_set_handle(handle, inode, name_index, name,
value, value_len, flags);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
+ handle);
error2 = ext4_journal_stop(handle);
if (error == -ENOSPC &&
ext4_should_retry_alloc(sb, &retries))
@@ -2543,13 +2614,13 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
.in_inode = !!entry->e_value_inum,
};
struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
+ int needs_kvfree = 0;
int error;
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
- buffer = kmalloc(value_size, GFP_NOFS);
b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
- if (!is || !bs || !buffer || !b_entry_name) {
+ if (!is || !bs || !b_entry_name) {
error = -ENOMEM;
goto out;
}
@@ -2561,12 +2632,18 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
/* Save the entry name and the entry value */
if (entry->e_value_inum) {
+ buffer = kvmalloc(value_size, GFP_NOFS);
+ if (!buffer) {
+ error = -ENOMEM;
+ goto out;
+ }
+ needs_kvfree = 1;
error = ext4_xattr_inode_get(inode, entry, buffer, value_size);
if (error)
goto out;
} else {
size_t value_offs = le16_to_cpu(entry->e_value_offs);
- memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+ buffer = (void *)IFIRST(header) + value_offs;
}
memcpy(b_entry_name, entry->e_name, entry->e_name_len);
@@ -2581,25 +2658,26 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
if (error)
goto out;
- /* Remove the chosen entry from the inode */
- error = ext4_xattr_ibody_set(handle, inode, &i, is);
- if (error)
- goto out;
-
i.value = buffer;
i.value_len = value_size;
error = ext4_xattr_block_find(inode, &i, bs);
if (error)
goto out;
- /* Add entry which was removed from the inode into the block */
+ /* Move ea entry from the inode into the block */
error = ext4_xattr_block_set(handle, inode, &i, bs);
if (error)
goto out;
- error = 0;
+
+ /* Remove the chosen entry from the inode */
+ i.value = NULL;
+ i.value_len = 0;
+ error = ext4_xattr_ibody_set(handle, inode, &i, is);
+
out:
kfree(b_entry_name);
- kfree(buffer);
+ if (needs_kvfree && buffer)
+ kvfree(buffer);
if (is)
brelse(is->iloc.bh);
if (bs)
@@ -2706,14 +2784,10 @@ retry:
*/
base = IFIRST(header);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ end = ITAIL(inode, raw_inode);
min_offs = end - base;
total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
-
ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
if (ifree >= isize_diff)
goto shift;
@@ -2774,6 +2848,9 @@ shift:
(void *)header, total_ino);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ if (ext4_has_inline_data(inode))
+ error = ext4_find_inline_data_nolock(inode);
+
cleanup:
if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) {
ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.",
@@ -2797,33 +2874,31 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
if (*ea_inode_array == NULL) {
/*
* Start with 15 inodes, so it fits into a power-of-two size.
- * If *ea_inode_array is NULL, this is essentially offsetof()
*/
- (*ea_inode_array) =
- kmalloc(offsetof(struct ext4_xattr_inode_array,
- inodes[EIA_MASK]),
- GFP_NOFS);
+ (*ea_inode_array) = kmalloc(
+ struct_size(*ea_inode_array, inodes, EIA_MASK),
+ GFP_NOFS);
if (*ea_inode_array == NULL)
return -ENOMEM;
(*ea_inode_array)->count = 0;
} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
/* expand the array once all 15 + n * 16 slots are full */
struct ext4_xattr_inode_array *new_array = NULL;
- int count = (*ea_inode_array)->count;
- /* if new_array is NULL, this is essentially offsetof() */
new_array = kmalloc(
- offsetof(struct ext4_xattr_inode_array,
- inodes[count + EIA_INCR]),
- GFP_NOFS);
+ struct_size(*ea_inode_array, inodes,
+ (*ea_inode_array)->count + EIA_INCR),
+ GFP_NOFS);
if (new_array == NULL)
return -ENOMEM;
memcpy(new_array, *ea_inode_array,
- offsetof(struct ext4_xattr_inode_array, inodes[count]));
+ struct_size(*ea_inode_array, inodes,
+ (*ea_inode_array)->count));
kfree(*ea_inode_array);
*ea_inode_array = new_array;
}
- (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode;
+ (*ea_inode_array)->count++;
+ (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
return 0;
}
@@ -2863,7 +2938,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb,
+ iloc.bh, EXT4_JTR_NONE);
if (error) {
EXT4_ERROR_INODE(inode, "write access (error %d)",
error);
@@ -2927,6 +3003,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
error);
goto cleanup;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
}
error = 0;
cleanup:
@@ -2952,8 +3029,6 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
*
* Create a new entry in the extended attribute block cache, and insert
* it unless such an entry is already in the cache.
- *
- * Returns 0, or a negative error number on failure.
*/
static void
ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
@@ -2981,8 +3056,7 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
*
* Compare two extended attribute blocks for equality.
*
- * Returns 0 if the blocks are equal, 1 if they differ, and
- * a negative error number on errors.
+ * Returns 0 if the blocks are equal, 1 if they differ.
*/
static int
ext4_xattr_cmp(struct ext4_xattr_header *header1,
@@ -3021,8 +3095,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
*
* Find an identical extended attribute block.
*
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
+ * Returns a pointer to the block found, or NULL if such a block was not
+ * found, or an error pointer if an error occurred while reading ea block.
*/
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *inode,
@@ -3044,11 +3118,11 @@ ext4_xattr_block_cache_find(struct inode *inode,
bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
if (IS_ERR(bh)) {
- if (PTR_ERR(bh) == -ENOMEM)
- return NULL;
- bh = NULL;
- EXT4_ERROR_INODE(inode, "block %lu read error",
- (unsigned long)ce->e_value);
+ if (PTR_ERR(bh) != -ENOMEM)
+ EXT4_ERROR_INODE(inode, "block %lu read error",
+ (unsigned long)ce->e_value);
+ mb_cache_entry_put(ea_block_cache, ce);
+ return bh;
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce;
return bh;
@@ -3075,7 +3149,29 @@ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
while (name_len--) {
hash = (hash << NAME_HASH_SHIFT) ^
(hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
- *name++;
+ (unsigned char)*name++;
+ }
+ while (value_count--) {
+ hash = (hash << VALUE_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+ le32_to_cpu(*value++);
+ }
+ return cpu_to_le32(hash);
+}
+
+/*
+ * ext4_xattr_hash_entry_signed()
+ *
+ * Compute the hash of an extended attribute incorrectly.
+ */
+static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count)
+{
+ __u32 hash = 0;
+
+ while (name_len--) {
+ hash = (hash << NAME_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+ (signed char)*name++;
}
while (value_count--) {
hash = (hash << VALUE_HASH_SHIFT) ^
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ffe21ac77f78..1fedf44d4fb6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -32,8 +32,7 @@ struct ext4_xattr_header {
__le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */
- __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
- /* id = inum if refcount=1, blknum otherwise */
+ __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */
__u32 h_reserved[3]; /* zero right now */
};
@@ -68,6 +67,9 @@ struct ext4_xattr_entry {
((void *)raw_inode + \
EXT4_GOOD_OLD_INODE_SIZE + \
EXT4_I(inode)->i_extra_isize))
+#define ITAIL(inode, raw_inode) \
+ ((void *)(raw_inode) + \
+ EXT4_SB((inode)->i_sb)->s_inode_size)
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
/*
@@ -84,7 +86,7 @@ struct ext4_xattr_entry {
/*
* The minimum size of EA value when you start storing it in an external inode
* size of block - size of header - size of 1 entry - 4 null bytes
-*/
+ */
#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \
((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
@@ -95,6 +97,19 @@ struct ext4_xattr_entry {
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+/*
+ * If we want to add an xattr to the inode, we should make sure that
+ * i_extra_isize is not 0 and that the inode size is not less than
+ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
+ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data
+ * |--------------------------|------------|------|---------|---|-------|
+ */
+#define EXT4_INODE_HAS_XATTR_SPACE(inode) \
+ ((EXT4_I(inode)->i_extra_isize != 0) && \
+ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \
+ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \
+ EXT4_INODE_SIZE((inode)->i_sb)))
+
struct ext4_xattr_info {
const char *name;
const void *value;
@@ -117,13 +132,14 @@ struct ext4_xattr_ibody_find {
};
struct ext4_xattr_inode_array {
- unsigned int count; /* # of used items in the array */
- struct inode *inodes[];
+ unsigned int count;
+ struct inode *inodes[] __counted_by(count);
};
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_hurd_handler;
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
@@ -177,21 +193,29 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
+extern void ext4_evict_ea_inode(struct inode *inode);
-extern const struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler * const ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
const char *name,
void *buffer, size_t buffer_size);
-extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
+extern int
+__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
+ void *end, const char *function, unsigned int line);
+
+#define xattr_check_inode(inode, header, end) \
+ __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
+
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
new file mode 100644
index 000000000000..8a5842e4cd95
--- /dev/null
+++ b/fs/ext4/xattr_hurd.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/xattr_hurd.c
+ * Handler for extended gnu attributes for the Hurd.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static bool
+ext4_xattr_hurd_list(struct dentry *dentry)
+{
+ return test_opt(dentry->d_sb, XATTR_USER);
+}
+
+static int
+ext4_xattr_hurd_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD,
+ name, buffer, size);
+}
+
+static int
+ext4_xattr_hurd_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_hurd_handler = {
+ .prefix = XATTR_HURD_PREFIX,
+ .list = ext4_xattr_hurd_list,
+ .get = ext4_xattr_hurd_get,
+ .set = ext4_xattr_hurd_set,
+};
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 197a9d8a15ef..776cf11d24ca 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -23,6 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
static int
ext4_xattr_security_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index e9389e5d75c3..9811eb0ab276 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -30,6 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
static int
ext4_xattr_trusted_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d4546184b34b..4b70bf4e7626 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -31,6 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
static int
ext4_xattr_user_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)