summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
authorMark Brown <broonie@kernel.org>2020-08-25 11:01:46 +0100
committerMark Brown <broonie@kernel.org>2020-08-25 11:01:46 +0100
commit3bec5b6aae830355e786e204b20a7cea38c3a8ed (patch)
treefd597b87faf55ceb2a207ee94f4feca6276696db /fs/ext4
parenta577f3456c0a2fac3dee037c483753e6e68f3e49 (diff)
parentd012a7190fc1fd72ed48911e77ca97ba4521bccd (diff)
Merge tag 'v5.9-rc2' into regulator-5.9
Linux 5.9-rc2
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/Makefile3
-rw-r--r--fs/ext4/balloc.c16
-rw-r--r--fs/ext4/block_validity.c159
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h114
-rw-r--r--fs/ext4/ext4_jbd2.c25
-rw-r--r--fs/ext4/extents.c44
-rw-r--r--fs/ext4/file.c13
-rw-r--r--fs/ext4/hash.c4
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/indirect.c20
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c60
-rw-r--r--fs/ext4/ioctl.c99
-rw-r--r--fs/ext4/mballoc.c291
-rw-r--r--fs/ext4/mballoc.h4
-rw-r--r--fs/ext4/move_extent.c4
-rw-r--r--fs/ext4/namei.c66
-rw-r--r--fs/ext4/page-io.c6
-rw-r--r--fs/ext4/readpage.c15
-rw-r--r--fs/ext4/super.c404
-rw-r--r--fs/ext4/sysfs.c13
-rw-r--r--fs/ext4/verity.c5
-rw-r--r--fs/ext4/xattr.c5
-rw-r--r--fs/ext4/xattr.h1
-rw-r--r--fs/ext4/xattr_hurd.c51
27 files changed, 1022 insertions, 424 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 1afa5a4bcb5f..619dd35ddd48 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -110,7 +110,7 @@ config EXT4_KUNIT_TESTS
This builds the ext4 KUnit tests.
KUnit tests run during boot and output the results to the debug log
- in TAP format (http://testanything.org/). Only useful for kernel devs
+ in TAP format (https://testanything.org/). Only useful for kernel devs
running KUnit test harness and are not for inclusion into a production
build.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 4ccb3c9189d8..2e42f47a7f98 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -9,7 +9,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+ xattr_user.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ba46d87cdf1..48c3df47748d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -413,7 +413,8 @@ verified:
* Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -441,6 +442,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
return ERR_PTR(-ENOMEM);
}
+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, return if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
if (bitmap_uptodate(bh))
goto verify;
@@ -487,10 +494,11 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
set_buffer_new(bh);
- trace_ext4_read_block_bitmap_load(sb, block_group);
+ trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0), bh);
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
int err;
- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 16e9b2fda03a..c54ba52f2dd4 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -24,6 +24,7 @@ struct ext4_system_zone {
struct rb_node node;
ext4_fsblk_t start_blk;
unsigned int count;
+ u32 ino;
};
static struct kmem_cache *ext4_system_zone_cachep;
@@ -45,7 +46,8 @@ void ext4_exit_system_zone(void)
static inline int can_merge(struct ext4_system_zone *entry1,
struct ext4_system_zone *entry2)
{
- if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+ if ((entry1->start_blk + entry1->count) == entry2->start_blk &&
+ entry1->ino == entry2->ino)
return 1;
return 0;
}
@@ -66,9 +68,9 @@ static void release_system_zone(struct ext4_system_blocks *system_blks)
*/
static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
- unsigned int count)
+ unsigned int count, u32 ino)
{
- struct ext4_system_zone *new_entry = NULL, *entry;
+ struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
@@ -79,30 +81,21 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
n = &(*n)->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
- else {
- if (start_blk + count > (entry->start_blk +
- entry->count))
- entry->count = (start_blk + count -
- entry->start_blk);
- new_node = *n;
- new_entry = rb_entry(new_node, struct ext4_system_zone,
- node);
- break;
- }
+ else /* Unexpected overlap of system zones. */
+ return -EFSCORRUPTED;
}
- if (!new_entry) {
- new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
- GFP_KERNEL);
- if (!new_entry)
- return -ENOMEM;
- new_entry->start_blk = start_blk;
- new_entry->count = count;
- new_node = &new_entry->node;
-
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &system_blks->root);
- }
+ new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+ GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+ new_entry->start_blk = start_blk;
+ new_entry->count = count;
+ new_entry->ino = ino;
+ new_node = &new_entry->node;
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &system_blks->root);
/* Can we merge to the left? */
node = rb_prev(new_node);
@@ -151,40 +144,6 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_CONT "\n");
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
-static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
- struct ext4_system_blocks *system_blks,
- ext4_fsblk_t start_blk,
- unsigned int count)
-{
- struct ext4_system_zone *entry;
- struct rb_node *n;
-
- if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
- (start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es)))
- return 0;
-
- if (system_blks == NULL)
- return 1;
-
- n = system_blks->root.rb_node;
- while (n) {
- entry = rb_entry(n, struct ext4_system_zone, node);
- if (start_blk + count - 1 < entry->start_blk)
- n = n->rb_left;
- else if (start_blk >= (entry->start_blk + entry->count))
- n = n->rb_right;
- else
- return 0;
- }
- return 1;
-}
-
static int ext4_protect_reserved_inode(struct super_block *sb,
struct ext4_system_blocks *system_blks,
u32 ino)
@@ -214,19 +173,18 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
if (n == 0) {
i++;
} else {
- if (!ext4_data_block_valid_rcu(sbi, system_blks,
- map.m_pblk, n)) {
- err = -EFSCORRUPTED;
- __ext4_error(sb, __func__, __LINE__, -err,
- map.m_pblk, "blocks %llu-%llu "
- "from inode %u overlap system zone",
- map.m_pblk,
- map.m_pblk + map.m_len - 1, ino);
+ err = add_system_zone(system_blks, map.m_pblk, n, ino);
+ if (err < 0) {
+ if (err == -EFSCORRUPTED) {
+ __ext4_error(sb, __func__, __LINE__,
+ -err, map.m_pblk,
+ "blocks %llu-%llu from inode %u overlap system zone",
+ map.m_pblk,
+ map.m_pblk + map.m_len - 1,
+ ino);
+ }
break;
}
- err = add_system_zone(system_blks, map.m_pblk, n);
- if (err < 0)
- break;
i += n;
}
}
@@ -262,14 +220,6 @@ int ext4_setup_system_zone(struct super_block *sb)
int flex_size = ext4_flex_bg_size(sbi);
int ret;
- if (!test_opt(sb, BLOCK_VALIDITY)) {
- if (sbi->system_blks)
- ext4_release_system_zone(sb);
- return 0;
- }
- if (sbi->system_blks)
- return 0;
-
system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
if (!system_blks)
return -ENOMEM;
@@ -277,22 +227,25 @@ int ext4_setup_system_zone(struct super_block *sb)
for (i=0; i < ngroups; i++) {
cond_resched();
if (ext4_bg_has_super(sb, i) &&
- ((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(system_blks,
+ ((i < 5) || ((i % flex_size) == 0))) {
+ ret = add_system_zone(system_blks,
ext4_group_first_block_no(sb, i),
- ext4_bg_num_gdb(sb, i) + 1);
+ ext4_bg_num_gdb(sb, i) + 1, 0);
+ if (ret)
+ goto err;
+ }
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(system_blks,
- ext4_block_bitmap(sb, gdp), 1);
+ ext4_block_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
- ext4_inode_bitmap(sb, gdp), 1);
+ ext4_inode_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group);
+ sbi->s_itb_per_group, 0);
if (ret)
goto err;
}
@@ -341,11 +294,24 @@ void ext4_release_system_zone(struct super_block *sb)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
unsigned int count)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_system_blocks *system_blks;
- int ret;
+ struct ext4_system_zone *entry;
+ struct rb_node *n;
+ int ret = 1;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es)))
+ return 0;
/*
* Lock the system zone to prevent it being released concurrently
@@ -354,8 +320,22 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
*/
rcu_read_lock();
system_blks = rcu_dereference(sbi->system_blks);
- ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
- count);
+ if (system_blks == NULL)
+ goto out_rcu;
+
+ n = system_blks->root.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ ret = (entry->ino == inode->i_ino);
+ break;
+ }
+ }
+out_rcu:
rcu_read_unlock();
return ret;
}
@@ -374,8 +354,7 @@ int ext4_check_blockref(const char *function, unsigned int line,
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
if (blk &&
- unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- blk, 1))) {
+ unlikely(!ext4_inode_block_valid(inode, blk, 1))) {
ext4_error_inode(inode, function, line, blk,
"invalid block");
return -EFSCORRUPTED;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c654205f648d..1d82336b1cd4 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -675,6 +675,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
struct qstr qstr = {.name = str, .len = len };
const struct dentry *parent = READ_ONCE(dentry->d_parent);
const struct inode *inode = READ_ONCE(parent->d_inode);
+ char strbuf[DNAME_INLINE_LEN];
if (!inode || !IS_CASEFOLDED(inode) ||
!EXT4_SB(inode->i_sb)->s_encoding) {
@@ -683,6 +684,21 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
return memcmp(str, name->name, len);
}
+ /*
+ * If the dentry name is stored in-line, then it may be concurrently
+ * modified by a rename. If this happens, the VFS will eventually retry
+ * the lookup, so it doesn't matter what ->d_compare() returns.
+ * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * string. Therefore, we have to copy the name into a temporary buffer.
+ */
+ if (len <= DNAME_INLINE_LEN - 1) {
+ memcpy(strbuf, str, len);
+ strbuf[len] = 0;
+ qstr.name = strbuf;
+ /* prevent compiler from optimizing out the temporary buffer */
+ barrier();
+ }
+
return ext4_ci_compare(inode, name, &qstr, false);
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b08841f70b69..523e00d7b392 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -426,28 +426,59 @@ struct flex_groups {
#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
+
+#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */
+
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
-
-/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
+/* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \
+ EXT4_UNRM_FL | \
+ EXT4_COMPR_FL | \
+ EXT4_SYNC_FL | \
+ EXT4_IMMUTABLE_FL | \
+ EXT4_APPEND_FL | \
+ EXT4_NODUMP_FL | \
+ EXT4_NOATIME_FL | \
+ EXT4_JOURNAL_DATA_FL | \
+ EXT4_NOTAIL_FL | \
+ EXT4_DIRSYNC_FL | \
+ EXT4_TOPDIR_FL | \
+ EXT4_EXTENTS_FL | \
+ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
+ EXT4_DAX_FL | \
+ EXT4_PROJINHERIT_FL | \
+ EXT4_CASEFOLD_FL)
+
+/* User visible flags */
+#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \
+ EXT4_DIRTY_FL | \
+ EXT4_COMPRBLK_FL | \
+ EXT4_NOCOMPR_FL | \
+ EXT4_ENCRYPT_FL | \
+ EXT4_INDEX_FL | \
+ EXT4_VERITY_FL | \
+ EXT4_INLINE_DATA_FL)
+
+/* Flags we can manipulate with through FS_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
EXT4_IMMUTABLE_FL | \
EXT4_APPEND_FL | \
EXT4_NODUMP_FL | \
EXT4_NOATIME_FL | \
- EXT4_PROJINHERIT_FL)
+ EXT4_PROJINHERIT_FL | \
+ EXT4_DAX_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
- EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
+ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
+ EXT4_DAX_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
@@ -459,6 +490,10 @@ struct flex_groups {
/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+/* Flags which are mutually exclusive to DAX */
+#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
+ EXT4_JOURNAL_DATA_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -499,6 +534,7 @@ enum {
EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
+ EXT4_INODE_DAX = 25, /* Inode is DAX */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */
@@ -659,8 +695,6 @@ enum {
/*
* ioctl commands
*/
-#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
@@ -677,17 +711,11 @@ enum {
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
-#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
-#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
-#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
/* ioctl codes 19--39 are reserved for fscrypt */
#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
-#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
-#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
-
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
/*
@@ -712,8 +740,6 @@ enum {
/*
* ioctl commands in 32 bit emulation
*/
-#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
@@ -1044,6 +1070,7 @@ struct ext4_inode_info {
struct timespec64 i_crtime;
/* mballoc */
+ atomic_t i_prealloc_active;
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
@@ -1135,9 +1162,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
-#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */
#else
-#define EXT4_MOUNT_DAX 0
+#define EXT4_MOUNT_DAX_ALWAYS 0
#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
@@ -1162,6 +1189,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
+#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1180,6 +1208,8 @@ struct ext4_inode_info {
blocks */
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
+#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
@@ -1489,10 +1519,13 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_mb_max_inode_prealloc;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -1560,6 +1593,8 @@ struct ext4_sb_info {
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+ atomic_t s_warning_count;
+ atomic_t s_msg_count;
/* Encryption context for '-o test_dummy_encryption' */
struct fscrypt_dummy_context s_dummy_enc_ctx;
@@ -1573,6 +1608,9 @@ struct ext4_sb_info {
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
#endif
+ /* Record the errseq of the backing block device */
+ errseq_t s_bdev_wb_err;
+ spinlock_t s_bdev_wb_lock;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1992,6 +2030,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
*/
#define EXT4_FLAGS_RESIZING 0
#define EXT4_FLAGS_SHUTDOWN 1
+#define EXT4_FLAGS_BDEV_IS_DAX 2
static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
{
@@ -2300,9 +2339,15 @@ struct ext4_lazy_init {
struct mutex li_list_mtx;
};
+enum ext4_li_mode {
+ EXT4_LI_MODE_PREFETCH_BBITMAP,
+ EXT4_LI_MODE_ITABLE,
+};
+
struct ext4_li_request {
struct super_block *lr_super;
- struct ext4_sb_info *lr_sbi;
+ enum ext4_li_mode lr_mode;
+ ext4_group_t lr_first_not_zeroed;
ext4_group_t lr_next_group;
struct list_head lr_request;
unsigned long lr_next_sched;
@@ -2433,7 +2478,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -2638,9 +2684,15 @@ extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_discard_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
+extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int nr, int *cnt);
+extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr);
+
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
@@ -2705,7 +2757,7 @@ extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
@@ -2752,8 +2804,7 @@ extern int ext4_search_dir(struct buffer_head *bh,
struct ext4_filename *fname,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir);
-extern int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+extern int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2911,12 +2962,6 @@ do { \
#endif
-extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
- __u32 compat);
-extern int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat);
-extern int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat);
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -3132,6 +3177,7 @@ struct ext4_group_info {
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3146,6 +3192,8 @@ struct ext4_group_info {
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
@@ -3350,9 +3398,9 @@ extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
-extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
- ext4_fsblk_t start_blk,
- unsigned int count);
+extern int ext4_inode_block_valid(struct inode *inode,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 0c76cdd44d90..760b9ee49dc0 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -195,6 +195,28 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
jbd2_journal_abort_handle(handle);
}
+static void ext4_check_bdev_write_error(struct super_block *sb)
+{
+ struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * If the block device has write error flag, it may have failed to
+ * async write out metadata buffers in the background. In this case,
+ * we could read old data from disk and write it out again, which
+ * may lead to on-disk filesystem inconsistency.
+ */
+ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) {
+ spin_lock(&sbi->s_bdev_wb_lock);
+ err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err);
+ spin_unlock(&sbi->s_bdev_wb_lock);
+ if (err)
+ ext4_error_err(sb, -err,
+ "Error while async write back metadata");
+ }
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
@@ -202,6 +224,9 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
+ if (bh->b_bdev->bd_super)
+ ext4_check_bdev_write_error(bh->b_bdev->bd_super);
+
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d088ff1e902..a0481582187a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* i_mutex. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
@@ -340,7 +340,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
*/
if (lblock + len <= lblock)
return 0;
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
+ return ext4_inode_block_valid(inode, block, len);
}
static int ext4_valid_extent_idx(struct inode *inode,
@@ -348,7 +348,7 @@ static int ext4_valid_extent_idx(struct inode *inode,
{
ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
+ return ext4_inode_block_valid(inode, block, 1);
}
static int ext4_valid_extent_entries(struct inode *inode,
@@ -507,14 +507,10 @@ __read_extent_tree_block(const char *function, unsigned int line,
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- if (!ext4_has_feature_journal(inode->i_sb) ||
- (inode->i_ino !=
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
- if (err)
- goto errout;
- }
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
set_buffer_verified(bh);
/*
* If this is a leaf block, cache all of its entries
@@ -693,10 +689,8 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path)
return;
depth = path->p_depth;
for (i = 0; i <= depth; i++, path++) {
- if (path->p_bh) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
+ brelse(path->p_bh);
+ path->p_bh = NULL;
}
}
@@ -1915,7 +1909,7 @@ out:
/*
* ext4_ext_insert_extent:
- * tries to merge requsted extent into the existing extent or
+ * tries to merge requested extent into the existing extent or
* inserts requested extent as new one into the tree,
* creating new leaf in the no-space case.
*/
@@ -2844,7 +2838,7 @@ again:
* in use to avoid freeing it when removing blocks.
*/
if (sbi->s_cluster_ratio > 1) {
- pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
@@ -3125,7 +3119,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
*
*
* Splits extent [a, b] into two extents [a, @split) and [@split, b], states
- * of which are deterimined by split_flag.
+ * of which are determined by split_flag.
*
* There are two cases:
* a> the extent are splitted into two extent.
@@ -3650,7 +3644,7 @@ static int ext4_split_convert_extents(handle_t *handle,
eof_block = map->m_lblk + map->m_len;
/*
* It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
+ * zeroout only if extent is fully inside i_size or new_size.
*/
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -4272,7 +4266,7 @@ got_allocated_blocks:
* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free().
*/
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
ext4_free_blocks(handle, inode, NULL, newblock,
@@ -4495,7 +4489,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Round up offset. This is not fallocate, we neet to zero out
+ * Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
* range.
@@ -5299,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
}
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, punch_start,
EXT_MAX_BLOCKS - punch_start);
@@ -5313,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
@@ -5445,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
@@ -5579,7 +5573,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
}
ex1 = path1[path1->p_depth].p_ext;
ex2 = path2[path2->p_depth].p_ext;
- /* Do we have somthing to swap ? */
+ /* Do we have something to swap ? */
if (unlikely(!ex2 || !ex1))
goto finish;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a01e31a032c..7d61069531d3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -145,10 +145,9 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks)
- {
+ !EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
@@ -428,6 +427,10 @@ restart:
*/
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
!ext4_overwrite_io(inode, offset, count))) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
inode_unlock_shared(inode);
*ilock_shared = false;
inode_lock(inode);
@@ -544,6 +547,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
is_sync_kiocb(iocb) || unaligned_io || extend);
+ if (ret == -ENOTBLK)
+ ret = 0;
if (extend)
ret = ext4_handle_inode_extension(inode, offset, ret, count);
@@ -810,7 +815,7 @@ out:
return err;
}
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3e133793a5a3..2924261226e0 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -233,7 +233,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -247,7 +247,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 54d324e80fe5..df25d38d6539 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1116,7 +1116,7 @@ got:
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index be2b66eb65f7..80c9f33800be 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* i_mutex. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
@@ -858,8 +858,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
else if (ext4_should_journal_data(inode))
flags |= EXT4_FREE_BLOCKS_FORGET;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
- count)) {
+ if (!ext4_inode_block_valid(inode, block_to_free, count)) {
EXT4_ERROR_INODE(inode, "attempt to clear invalid "
"blocks %llu len %lu",
(unsigned long long) block_to_free, count);
@@ -1004,8 +1003,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!nr)
continue; /* A hole */
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- nr, 1)) {
+ if (!ext4_inode_block_valid(inode, nr, 1)) {
EXT4_ERROR_INODE(inode,
"invalid indirect mapped "
"block %lu (level %d)",
@@ -1182,21 +1180,21 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
@@ -1436,7 +1434,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
if (++n >= n2)
break;
@@ -1445,7 +1443,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
if (++n >= n2)
break;
@@ -1454,7 +1452,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c3a1ad2db122..75c97bca0815 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -276,7 +276,7 @@ static int ext4_create_inline_data(handle_t *handle,
len = 0;
}
- /* Insert the the xttr entry. */
+ /* Insert the xttr entry. */
i.value = value;
i.value_len = len;
@@ -1706,7 +1706,7 @@ int ext4_delete_inline_entry(handle_t *handle,
if (err)
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ err = ext4_generic_delete_entry(dir, de_del, bh,
inline_start, inline_size, 0);
if (err)
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 40ec5c7ef0d3..3a196d81f594 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
}
static int __check_block_validity(struct inode *inode, const char *func,
@@ -394,8 +394,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
(inode->i_ino ==
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
return 0;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
- map->m_len)) {
+ if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
"(length %d)", (unsigned long) map->m_lblk,
@@ -1096,7 +1095,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
}
if (unlikely(err)) {
page_zero_new_buffers(page, from, to);
- } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+ } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -3288,7 +3287,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
if (PageChecked(page))
return 0;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return jbd2_journal_try_to_free_buffers(journal, page);
else
return try_to_free_buffers(page);
}
@@ -3737,7 +3736,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
@@ -4056,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (stop_block > first_block) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
@@ -4163,7 +4162,7 @@ int ext4_truncate(struct inode *inode)
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return 0;
+ goto out_trace;
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4172,16 +4171,14 @@ int ext4_truncate(struct inode *inode)
int has_inline = 1;
err = ext4_inline_data_truncate(inode, &has_inline);
- if (err)
- return err;
- if (has_inline)
- return 0;
+ if (err || has_inline)
+ goto out_trace;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
if (ext4_inode_attach_jinode(inode) < 0)
- return 0;
+ goto out_trace;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4190,8 +4187,10 @@ int ext4_truncate(struct inode *inode)
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_trace;
+ }
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4211,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
@@ -4242,6 +4241,7 @@ out_stop:
err = err2;
ext4_journal_stop(handle);
+out_trace:
trace_ext4_truncate_exit(inode);
return err;
}
@@ -4403,9 +4403,11 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
-static bool ext4_should_use_dax(struct inode *inode)
+static bool ext4_should_enable_dax(struct inode *inode)
{
- if (!test_opt(inode->i_sb, DAX))
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (test_opt2(inode->i_sb, DAX_NEVER))
return false;
if (!S_ISREG(inode->i_mode))
return false;
@@ -4417,14 +4419,21 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
return false;
- return true;
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+ return false;
+ if (test_opt(inode->i_sb, DAX_ALWAYS))
+ return true;
+
+ return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
{
unsigned int flags = EXT4_I(inode)->i_flags;
unsigned int new_fl = 0;
+ WARN_ON_ONCE(IS_DAX(inode) && init);
+
if (flags & EXT4_SYNC_FL)
new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
@@ -4435,8 +4444,13 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (ext4_should_use_dax(inode))
+
+ /* Because of the way inode_set_flags() works we must preserve S_DAX
+ * here if already set. */
+ new_fl |= (inode->i_flags & S_DAX);
+ if (init && ext4_should_enable_dax(inode))
new_fl |= S_DAX;
+
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
@@ -4650,7 +4664,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
@@ -4746,7 +4760,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = 0;
if (ei->i_file_acl &&
- !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad extended attribute block %llu",
ei->i_file_acl);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2162db0c747d..36eca3bc036a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
reset_inode_seed(inode);
reset_inode_seed(inode_bl);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
@@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
return 0;
}
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (test_opt2(inode->i_sb, DAX_NEVER) ||
+ test_opt(inode->i_sb, DAX_ALWAYS))
+ return;
+
+ if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+ d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+ unsigned int flags)
+{
+ if (flags & EXT4_DAX_FL) {
+ if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+ ext4_test_inode_state(inode,
+ EXT4_STATE_VERITY_IN_PROGRESS)) {
+ return false;
+ }
+ }
+
+ if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+ return false;
+
+ return true;
+}
+
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -300,7 +332,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
- unsigned int jflag;
struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
@@ -309,9 +340,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
oldflags = ei->i_flags;
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
if (err)
goto flags_out;
@@ -320,10 +348,16 @@ static int ext4_ioctl_setflags(struct inode *inode,
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
*/
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
+
+ if (!dax_compatible(inode, oldflags, flags)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
@@ -369,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
if (err)
goto flags_err;
+ ext4_dax_dontcache(inode, flags);
+
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
@@ -381,7 +417,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_clear_inode_flag(inode, i);
}
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
+
inode->i_ctime = current_time(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -390,17 +427,18 @@ flags_err:
if (err)
goto flags_out;
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
/*
* Changes to the journaling mode can cause unsafe changes to
- * S_DAX if we are using the DAX mount option.
+ * S_DAX if the inode is DAX
*/
- if (test_opt(inode->i_sb, DAX)) {
+ if (IS_DAX(inode)) {
err = -EBUSY;
goto flags_out;
}
- err = ext4_change_inode_journal_flag(inode, jflag);
+ err = ext4_change_inode_journal_flag(inode,
+ flags & EXT4_JOURNAL_DATA_FL);
if (err)
goto flags_out;
}
@@ -527,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
xflags |= FS_XFLAG_NOATIME;
if (iflags & EXT4_PROJINHERIT_FL)
xflags |= FS_XFLAG_PROJINHERIT;
+ if (iflags & EXT4_DAX_FL)
+ xflags |= FS_XFLAG_DAX;
return xflags;
}
#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \
+ FS_XFLAG_DAX)
/* Transfer xflags flags to internal */
static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
@@ -551,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
iflags |= EXT4_NOATIME_FL;
if (xflags & FS_XFLAG_PROJINHERIT)
iflags |= EXT4_PROJINHERIT_FL;
+ if (xflags & FS_XFLAG_DAX)
+ iflags |= EXT4_DAX_FL;
return iflags;
}
@@ -776,12 +819,12 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case FS_IOC_GETFSMAP:
return ext4_ioc_getfsmap(sb, (void __user *)arg);
- case EXT4_IOC_GETFLAGS:
+ case FS_IOC_GETFLAGS:
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (S_ISREG(inode->i_mode))
flags &= ~EXT4_PROJINHERIT_FL;
return put_user(flags, (int __user *) arg);
- case EXT4_IOC_SETFLAGS: {
+ case FS_IOC_SETFLAGS: {
int err;
if (!inode_owner_or_capable(inode))
@@ -1086,12 +1129,12 @@ resizefs_out:
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+ case FS_IOC_GET_ENCRYPTION_PWSALT: {
#ifdef CONFIG_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1131,7 +1174,7 @@ resizefs_out:
return -EOPNOTSUPP;
#endif
}
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
@@ -1193,7 +1236,7 @@ resizefs_out:
case EXT4_IOC_GET_ES_CACHE:
return ext4_ioctl_get_es_cache(filp, arg);
- case EXT4_IOC_FSGETXATTR:
+ case FS_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -1204,7 +1247,7 @@ resizefs_out:
return -EFAULT;
return 0;
}
- case EXT4_IOC_FSSETXATTR:
+ case FS_IOC_FSSETXATTR:
{
struct fsxattr fa, old_fa;
int err;
@@ -1270,11 +1313,11 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case EXT4_IOC32_GETFLAGS:
- cmd = EXT4_IOC_GETFLAGS;
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
break;
- case EXT4_IOC32_SETFLAGS:
- cmd = EXT4_IOC_SETFLAGS;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
break;
case EXT4_IOC32_GETVERSION:
cmd = EXT4_IOC_GETVERSION;
@@ -1318,9 +1361,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_RESIZE_FS:
case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
- case EXT4_IOC_GET_ENCRYPTION_PWSALT:
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
case FS_IOC_GET_ENCRYPTION_POLICY_EX:
case FS_IOC_ADD_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY:
@@ -1334,8 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
- case EXT4_IOC_FSGETXATTR:
- case EXT4_IOC_FSSETXATTR:
+ case FS_IOC_FSGETXATTR:
+ case FS_IOC_FSSETXATTR:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9083113a8c0..132c118d12e1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
@@ -1279,9 +1279,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
-
return 0;
err:
@@ -1743,10 +1740,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
}
-/*
- * regular allocator, for general purposes allocation
- */
-
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b,
int finish_group)
@@ -2119,13 +2112,11 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
- free = grp->bb_free;
- if (free == 0)
- return false;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return false;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ free = grp->bb_free;
+ if (free == 0)
return false;
fragments = grp->bb_fragments;
@@ -2142,8 +2133,10 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
((group % flex_size) == 0))
return false;
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
- (free / fragments) >= ac->ac_g_ex.fe_len)
+ if (free < ac->ac_g_ex.fe_len)
+ return false;
+
+ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
return true;
if (grp->bb_largest_free_order < ac->ac_2order)
@@ -2177,6 +2170,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
{
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
ext4_grpblk_t free;
int ret = 0;
@@ -2195,7 +2189,25 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, group, NULL);
+ int ret;
+
+ /* cr=0/1 is a very optimistic search to find large
+ * good chunks almost for free. If buddy data is not
+ * ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg,
+ * since this gets used for metadata block allocation,
+ * and we want to make sure we locate metadata blocks
+ * in the first block group in the flex_bg if possible.
+ */
+ if (cr < 2 &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
if (ret)
return ret;
}
@@ -2209,15 +2221,95 @@ out:
return ret;
}
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid getblk() call, which can
+ * be expensive.
+ */
+ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if it was not initiated by
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ grp = ext4_get_group_info(sb, group);
+
+ if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
int cr = -1;
int err = 0, first_err = 0;
+ unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
+ int lost;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
@@ -2237,8 +2329,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
goto out;
/*
- * ac->ac2_order is set only if the fe_len is a power of 2
- * if ac2_order is set we also set criteria to 0 so that we
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to 0 so that we
* try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
@@ -2282,6 +2374,7 @@ repeat:
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ prefetch_grp = group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2293,6 +2386,29 @@ repeat:
if (group >= ngroups)
group = 0;
+ /*
+ * Batch reads of the block allocation bitmaps
+ * to get multiple READs in flight; limit
+ * prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups
+ */
+ if ((prefetch_grp == group) &&
+ (cr > 1 ||
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
+ unsigned int curr_ios = prefetch_ios;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ nr = (group / sbi->s_mb_prefetch) *
+ sbi->s_mb_prefetch;
+ nr = nr + sbi->s_mb_prefetch - group;
+ }
+ prefetch_grp = ext4_mb_prefetch(sb, group,
+ nr, &prefetch_ios);
+ if (prefetch_ios == curr_ios)
+ nr = 0;
+ }
+
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
@@ -2341,22 +2457,24 @@ repeat:
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
-
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
*/
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = 3;
- atomic_inc(&sbi->s_mb_lost_chunks);
goto repeat;
}
}
@@ -2367,6 +2485,10 @@ out:
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
ac->ac_flags, cr, err);
+
+ if (nr)
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
return err;
}
@@ -2439,7 +2561,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_printf(seq, " ]\n");
+ seq_puts(seq, " ]\n");
return 0;
}
@@ -2613,6 +2735,26 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO */
+ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* now many real IOs to prefetch within a single allocation at cr=0
+ * given cr=0 is an CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4
+ */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;
err_freebuddy:
@@ -2736,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -3090,7 +3233,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (!ext4_data_block_valid(sbi, block, len)) {
+ if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata", block, block+len);
/* File system mounted not to panic on error
@@ -3674,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
}
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_inode_info *ei;
+
+ if (pa->pa_deleted) {
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+ pa->pa_len);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+
+ if (pa->pa_type == MB_INODE_PA) {
+ ei = EXT4_I(pa->pa_inode);
+ atomic_dec(&ei->i_prealloc_active);
+ }
+}
+
static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
@@ -3706,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
return;
}
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
@@ -3830,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
+ atomic_inc(&ei->i_prealloc_active);
}
/*
@@ -4040,7 +4204,7 @@ repeat:
}
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -4103,7 +4267,7 @@ out_dbg:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -4121,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
- trace_ext4_discard_preallocations(inode);
+ trace_ext4_discard_preallocations(inode,
+ atomic_read(&ei->i_prealloc_active), needed);
INIT_LIST_HEAD(&list);
+ if (needed == 0)
+ needed = UINT_MAX;
+
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list)) {
- pa = list_entry(ei->i_prealloc_list.next,
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
+ pa = list_entry(ei->i_prealloc_list.prev,
struct ext4_prealloc_space, pa_inode_list);
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
spin_lock(&pa->pa_lock);
@@ -4146,10 +4314,11 @@ repeat:
}
if (pa->pa_deleted == 0) {
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &list);
+ needed--;
continue;
}
@@ -4399,7 +4568,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
- /* we have to define context: we'll we work with a file or
+ /* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
@@ -4450,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
@@ -4549,10 +4718,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
}
/*
+ * if per-inode prealloc list is too long, trim some PA
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int count, delta;
+
+ count = atomic_read(&ei->i_prealloc_active);
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
+ count -= sbi->s_mb_max_inode_prealloc;
+ ext4_discard_preallocations(inode, count);
+ }
+}
+
+/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct inode *inode = ac->ac_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -4564,21 +4752,31 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
+
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- }
- if (pa) {
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+
+ if (pa->pa_type == MB_INODE_PA) {
+ /*
+ * treat per-inode prealloc list as a lru list, then try
+ * to trim the least recently used PA.
+ */
spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
}
+
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -4588,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
+ ext4_mb_trim_inode_pa(inode);
return 0;
}
@@ -4708,7 +4907,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
- seq = *this_cpu_ptr(&discard_pa_seq);
+ seq = this_cpu_read(discard_pa_seq);
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
@@ -4915,7 +5114,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_data_block_valid(sbi, block, count)) {
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
"block = %llu, count = %lu", block, count);
goto error_return;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6b4d17c2935d..e75b4749aa1c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -73,6 +73,10 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
+/*
+ * maximum length of inode prealloc list
+ */
+#define MB_DEFAULT_MAX_INODE_PREALLOC 512
struct ext4_free_data {
/* this links the free block information from sb_info */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1ed86fb6c302..0d601b822875 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
out:
if (*moved_len) {
- ext4_discard_preallocations(orig_inode);
- ext4_discard_preallocations(donor_inode);
+ ext4_discard_preallocations(orig_inode, 0);
+ ext4_discard_preallocations(donor_inode, 0);
}
ext4_ext_drop_refs(path);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 56738b538ddf..153a9fbe1dd0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1396,8 +1396,8 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
- bh->b_size, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
+ buf_size, offset))
return -1;
*res_dir = de;
return 1;
@@ -1858,7 +1858,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
blocksize, hinfo, map);
map -= count;
dx_sort_map(map, count);
- /* Split the existing block in the middle, size-wise */
+ /* Ensure that neither split block is over half full */
size = 0;
move = 0;
for (i = count-1; i >= 0; i--) {
@@ -1868,8 +1868,18 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
size += map[i].size;
move++;
}
- /* map index at which we will split */
- split = count - move;
+ /*
+ * map index at which we will split
+ *
+ * If the sum of active entries didn't exceed half the block size, just
+ * split it in half by count; each resulting block will have at least
+ * half the space free.
+ */
+ if (i > 0)
+ split = count - move;
+ else
+ split = count/2;
+
hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
@@ -2455,8 +2465,7 @@ cleanup:
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
*/
-int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2472,7 +2481,7 @@ int ext4_generic_delete_entry(handle_t *handle,
de = (struct ext4_dir_entry_2 *)entry_buf;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data, bh->b_size, i))
+ entry_buf, buf_size, i))
return -EFSCORRUPTED;
if (de == de_del) {
if (pde)
@@ -2517,8 +2526,7 @@ static int ext4_delete_entry(handle_t *handle,
if (unlikely(err))
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del,
- bh, bh->b_data,
+ err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
dir->i_sb->s_blocksize, csum_size);
if (err)
goto out;
@@ -3193,30 +3201,33 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
* in separate transaction */
retval = dquot_initialize(dir);
if (retval)
- return retval;
+ goto out_trace;
retval = dquot_initialize(d_inode(dentry));
if (retval)
- return retval;
+ goto out_trace;
- retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
- if (!bh)
- goto end_unlink;
+ if (IS_ERR(bh)) {
+ retval = PTR_ERR(bh);
+ goto out_trace;
+ }
+ if (!bh) {
+ retval = -ENOENT;
+ goto out_trace;
+ }
inode = d_inode(dentry);
- retval = -EFSCORRUPTED;
- if (le32_to_cpu(de->inode) != inode->i_ino)
- goto end_unlink;
+ if (le32_to_cpu(de->inode) != inode->i_ino) {
+ retval = -EFSCORRUPTED;
+ goto out_bh;
+ }
handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- handle = NULL;
- goto end_unlink;
+ goto out_bh;
}
if (IS_DIRSYNC(dir))
@@ -3224,12 +3235,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
- goto end_unlink;
+ goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
- goto end_unlink;
+ goto out_handle;
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
dentry->d_name.len, dentry->d_name.name);
@@ -3251,10 +3262,11 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
d_invalidate(dentry);
#endif
-end_unlink:
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
brelse(bh);
- if (handle)
- ext4_journal_stop(handle);
+out_trace:
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index de6fe969f773..defd2e10dfd1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -402,6 +402,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
@@ -418,7 +419,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
{
int ret;
- if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+ if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
+ !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
ext4_io_submit(io);
}
@@ -506,7 +508,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* (e.g. holes) to be unnecessarily encrypted, but this is rare and
* can't happen in the common case of blocksize == PAGE_SIZE.
*/
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5761e9961682..f014c5e473a9 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -140,7 +140,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
case STEP_VERITY:
if (ctx->enabled_steps & (1 << STEP_VERITY)) {
INIT_WORK(&ctx->work, verity_work);
@@ -148,7 +148,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
default:
__read_end_io(ctx->bio);
}
@@ -195,7 +195,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
{
unsigned int post_read_steps = 0;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (ext4_need_verity(inode, first_idx))
@@ -230,6 +230,7 @@ int ext4_mpage_readpages(struct inode *inode,
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ sector_t next_block;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -258,7 +259,8 @@ int ext4_mpage_readpages(struct inode *inode,
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ block_in_file = next_block =
+ (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
@@ -358,7 +360,8 @@ int ext4_mpage_readpages(struct inode *inode,
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ if (bio && (last_block_in_bio != blocks[0] - 1 ||
+ !fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
bio = NULL;
@@ -370,6 +373,8 @@ int ext4_mpage_readpages(struct inode *inode,
*/
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
+ fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
+ GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c668f6b42374..ea425b49b345 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -66,10 +66,10 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static int ext4_commit_super(struct super_block *sb, int sync);
-static void ext4_mark_recovery_complete(struct super_block *sb,
+static int ext4_mark_recovery_complete(struct super_block *sb,
struct ext4_super_block *es);
-static void ext4_clear_journal_err(struct super_block *sb,
- struct ext4_super_block *es);
+static int ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -522,9 +522,6 @@ static void ext4_handle_error(struct super_block *sb)
smp_wmb();
sb->s_flags |= SB_RDONLY;
} else if (test_opt(sb, ERRORS_PANIC)) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
@@ -725,23 +722,20 @@ void __ext4_abort(struct super_block *sb, const char *function,
va_end(args);
if (sb_rdonly(sb) == 0) {
- ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ if (EXT4_SB(sb)->s_journal)
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* Make sure updated value of ->s_mount_flags will be visible
* before ->s_flags update
*/
smp_wmb();
sb->s_flags |= SB_RDONLY;
- if (EXT4_SB(sb)->s_journal)
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
}
- if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
+ if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
panic("EXT4-fs panic from previous error\n");
- }
}
void __ext4_msg(struct super_block *sb,
@@ -750,6 +744,7 @@ void __ext4_msg(struct super_block *sb,
struct va_format vaf;
va_list args;
+ atomic_inc(&EXT4_SB(sb)->s_msg_count);
if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
return;
@@ -760,9 +755,12 @@ void __ext4_msg(struct super_block *sb,
va_end(args);
}
-#define ext4_warning_ratelimit(sb) \
- ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
- "EXT4-fs warning")
+static int ext4_warning_ratelimit(struct super_block *sb)
+{
+ atomic_inc(&EXT4_SB(sb)->s_warning_count);
+ return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning");
+}
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
@@ -1129,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
+ atomic_set(&ei->i_prealloc_active, 0);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
@@ -1222,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
{
invalidate_inode_buffers(inode);
clear_inode(inode);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -1294,8 +1293,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
if (!page_has_buffers(page))
return 0;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_DIRECT_RECLAIM);
+ return jbd2_journal_try_to_free_buffers(journal, page);
+
return try_to_free_buffers(page);
}
@@ -1324,6 +1323,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
return -EINVAL;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
res = ext4_convert_inline_data(inode);
if (res)
return res;
@@ -1349,7 +1351,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
}
return res;
}
@@ -1376,7 +1378,7 @@ retry:
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
@@ -1511,10 +1513,12 @@ enum {
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+ Opt_inlinecrypt,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+ Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
Opt_nowarn_on_error, Opt_mblk_io_submit,
Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
@@ -1523,6 +1527,7 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+ Opt_prefetch_block_bitmaps,
};
static const match_table_t tokens = {
@@ -1581,6 +1586,9 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_dax, "dax"},
+ {Opt_dax_always, "dax=always"},
+ {Opt_dax_inode, "dax=inode"},
+ {Opt_dax_never, "dax=never"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_warn_on_error, "warn_on_error"},
@@ -1609,8 +1617,10 @@ static const match_table_t tokens = {
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_inlinecrypt, "inlinecrypt"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
+ {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1729,6 +1739,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_NO_EXT3 0x0200
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING 0x0400
+#define MOPT_SKIP 0x0800
static const struct mount_opts {
int token;
@@ -1778,7 +1789,13 @@ static const struct mount_opts {
{Opt_min_batch_time, 0, MOPT_GTE0},
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
- {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
+ {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
+ {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
@@ -1821,6 +1838,8 @@ static const struct mount_opts {
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_test_dummy_encryption, 0, MOPT_STRING},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
+ MOPT_SET},
{Opt_err, 0, 0}
};
@@ -1938,6 +1957,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_nolazytime:
sb->s_flags &= ~SB_LAZYTIME;
return 1;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ sb->s_flags |= SB_INLINECRYPT;
+#else
+ ext4_msg(sb, KERN_ERR, "inline encryption not supported");
+#endif
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -2123,13 +2149,56 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
- } else if (token == Opt_dax) {
+ } else if (token == Opt_dax || token == Opt_dax_always ||
+ token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
- ext4_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
+ switch (token) {
+ case Opt_dax:
+ case Opt_dax_always:
+ if (is_remount &&
+ (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+ fail_dax_change_remount:
+ ext4_msg(sb, KERN_ERR, "can't change "
+ "dax mount option while remounting");
+ return -1;
+ }
+ if (is_remount &&
+ (test_opt(sb, DATA_FLAGS) ==
+ EXT4_MOUNT_JOURNAL_DATA)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -1;
+ }
+ ext4_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ break;
+ case Opt_dax_never:
+ if (is_remount &&
+ (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ break;
+ case Opt_dax_inode:
+ if (is_remount &&
+ ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ /* Strictly for printing options */
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
+ break;
+ }
#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
return -1;
#endif
} else if (token == Opt_data_err_abort) {
@@ -2293,7 +2362,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
- (m->flags & MOPT_CLEAR_ERR))
+ (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
@@ -2353,6 +2422,20 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
fscrypt_show_test_dummy_encryption(seq, sep, sb);
+ if (sb->s_flags & SB_INLINECRYPT)
+ SEQ_OPTS_PUTS("inlinecrypt");
+
+ if (test_opt(sb, DAX_ALWAYS)) {
+ if (IS_EXT2_SB(sb))
+ SEQ_OPTS_PUTS("dax");
+ else
+ SEQ_OPTS_PUTS("dax=always");
+ } else if (test_opt2(sb, DAX_NEVER)) {
+ SEQ_OPTS_PUTS("dax=never");
+ } else if (test_opt2(sb, DAX_INODE)) {
+ SEQ_OPTS_PUTS("dax=inode");
+ }
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -2383,6 +2466,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
err = -EROFS;
+ goto done;
}
if (read_only)
goto done;
@@ -3138,15 +3222,34 @@ static void print_daily_error_info(struct timer_list *t)
static int ext4_run_li_request(struct ext4_li_request *elr)
{
struct ext4_group_desc *gdp = NULL;
- ext4_group_t group, ngroups;
- struct super_block *sb;
+ struct super_block *sb = elr->lr_super;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t group = elr->lr_next_group;
unsigned long timeout = 0;
+ unsigned int prefetch_ios = 0;
int ret = 0;
- sb = elr->lr_super;
- ngroups = EXT4_SB(sb)->s_groups_count;
+ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
+ elr->lr_next_group = ext4_mb_prefetch(sb, group,
+ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
+ if (prefetch_ios)
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group,
+ prefetch_ios);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
+ prefetch_ios);
+ if (group >= elr->lr_next_group) {
+ ret = 1;
+ if (elr->lr_first_not_zeroed != ngroups &&
+ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ elr->lr_next_group = elr->lr_first_not_zeroed;
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ ret = 0;
+ }
+ }
+ return ret;
+ }
- for (group = elr->lr_next_group; group < ngroups; group++) {
+ for (; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
ret = 1;
@@ -3164,9 +3267,10 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
timeout = jiffies;
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
+ trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
timeout = (jiffies - timeout) *
- elr->lr_sbi->s_li_wait_mult;
+ EXT4_SB(elr->lr_super)->s_li_wait_mult;
elr->lr_timeout = timeout;
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -3181,15 +3285,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
- struct ext4_sb_info *sbi;
-
if (!elr)
return;
- sbi = elr->lr_sbi;
-
list_del(&elr->lr_request);
- sbi->s_li_request = NULL;
+ EXT4_SB(elr->lr_super)->s_li_request = NULL;
kfree(elr);
}
@@ -3398,7 +3498,6 @@ static int ext4_li_info_new(void)
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
ext4_group_t start)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
@@ -3406,8 +3505,13 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
return NULL;
elr->lr_super = sb;
- elr->lr_sbi = sbi;
- elr->lr_next_group = start;
+ elr->lr_first_not_zeroed = start;
+ if (test_opt(sb, PREFETCH_BLOCK_BITMAPS))
+ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
+ else {
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ elr->lr_next_group = start;
+ }
/*
* Randomize first schedule time of the request to
@@ -3437,8 +3541,9 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE))
+ if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || sb_rdonly(sb) ||
+ !test_opt(sb, INIT_INODE_TABLE)))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -4017,7 +4122,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and delalloc");
goto failed_mount;
}
- if (test_opt(sb, DAX)) {
+ if (test_opt(sb, DAX_ALWAYS)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and dax");
goto failed_mount;
@@ -4127,13 +4232,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (bdev_dax_supported(sb->s_bdev, blocksize))
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
goto failed_mount;
}
- if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device.");
goto failed_mount;
@@ -4632,11 +4740,13 @@ no_journal:
ext4_set_resv_clusters(sb);
- err = ext4_setup_system_zone(sb);
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)", err);
- goto failed_mount4a;
+ if (test_opt(sb, BLOCK_VALIDITY)) {
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+ "zone (%d)", err);
+ goto failed_mount4a;
+ }
}
ext4_ext_init(sb);
@@ -4699,12 +4809,23 @@ no_journal:
}
#endif /* CONFIG_QUOTA */
+ /*
+ * Save the original bdev mapping's wb_err value which could be
+ * used to detect the metadata async write error.
+ */
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+ if (!sb_rdonly(sb))
+ errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
+ sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
if (needs_recovery) {
ext4_msg(sb, KERN_INFO, "recovery complete");
- ext4_mark_recovery_complete(sb, es);
+ err = ext4_mark_recovery_complete(sb, es);
+ if (err)
+ goto failed_mount8;
}
if (EXT4_SB(sb)->s_journal) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -4738,6 +4859,8 @@ no_journal:
ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+ atomic_set(&sbi->s_warning_count, 0);
+ atomic_set(&sbi->s_msg_count, 0);
kfree(orig_data);
return 0;
@@ -4747,10 +4870,8 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
-#ifdef CONFIG_QUOTA
failed_mount8:
ext4_unregister_sysfs(sb);
-#endif
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -4890,7 +5011,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return NULL;
journal_inode = ext4_get_journal_inode(sb, journal_inum);
if (!journal_inode)
@@ -4920,7 +5042,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
struct ext4_super_block *es;
struct block_device *bdev;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return NULL;
bdev = ext4_blkdev_get(j_dev, sb);
if (bdev == NULL)
@@ -5011,8 +5134,10 @@ static int ext4_load_journal(struct super_block *sb,
dev_t journal_dev;
int err = 0;
int really_read_only;
+ int journal_dev_ro;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return -EFSCORRUPTED;
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -5022,7 +5147,31 @@ static int ext4_load_journal(struct super_block *sb,
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
- really_read_only = bdev_read_only(sb->s_bdev);
+ if (journal_inum && journal_dev) {
+ ext4_msg(sb, KERN_ERR,
+ "filesystem has both journal inode and journal device!");
+ return -EINVAL;
+ }
+
+ if (journal_inum) {
+ journal = ext4_get_journal(sb, journal_inum);
+ if (!journal)
+ return -EINVAL;
+ } else {
+ journal = ext4_get_dev_journal(sb, journal_dev);
+ if (!journal)
+ return -EINVAL;
+ }
+
+ journal_dev_ro = bdev_read_only(journal->j_dev);
+ really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
+
+ if (journal_dev_ro && !sb_rdonly(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "journal device read-only, try mounting with '-o ro'");
+ err = -EROFS;
+ goto err_out;
+ }
/*
* Are we loading a blank journal or performing recovery after a
@@ -5037,27 +5186,14 @@ static int ext4_load_journal(struct super_block *sb,
ext4_msg(sb, KERN_ERR, "write access "
"unavailable, cannot proceed "
"(try mounting with noload)");
- return -EROFS;
+ err = -EROFS;
+ goto err_out;
}
ext4_msg(sb, KERN_INFO, "write access will "
"be enabled during recovery");
}
}
- if (journal_inum && journal_dev) {
- ext4_msg(sb, KERN_ERR, "filesystem has both journal "
- "and inode journals!");
- return -EINVAL;
- }
-
- if (journal_inum) {
- if (!(journal = ext4_get_journal(sb, journal_inum)))
- return -EINVAL;
- } else {
- if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
- return -EINVAL;
- }
-
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
@@ -5077,12 +5213,16 @@ static int ext4_load_journal(struct super_block *sb,
if (err) {
ext4_msg(sb, KERN_ERR, "error loading journal");
- jbd2_journal_destroy(journal);
- return err;
+ goto err_out;
}
EXT4_SB(sb)->s_journal = journal;
- ext4_clear_journal_err(sb, es);
+ err = ext4_clear_journal_err(sb, es);
+ if (err) {
+ EXT4_SB(sb)->s_journal = NULL;
+ jbd2_journal_destroy(journal);
+ return err;
+ }
if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -5093,6 +5233,10 @@ static int ext4_load_journal(struct super_block *sb,
}
return 0;
+
+err_out:
+ jbd2_journal_destroy(journal);
+ return err;
}
static int ext4_commit_super(struct super_block *sb, int sync)
@@ -5105,13 +5249,6 @@ static int ext4_commit_super(struct super_block *sb, int sync)
return error;
/*
- * The superblock bh should be mapped, but it might not be if the
- * device was hot-removed. Not much we can do but fail the I/O.
- */
- if (!buffer_mapped(sbh))
- return error;
-
- /*
* If the file system is mounted read-only, don't update the
* superblock write time. This avoids updating the superblock
* write time when we are mounting the root file system
@@ -5178,26 +5315,32 @@ static int ext4_commit_super(struct super_block *sb, int sync)
* remounting) the filesystem readonly, then we will end up with a
* consistent fs on disk. Record that fact.
*/
-static void ext4_mark_recovery_complete(struct super_block *sb,
- struct ext4_super_block *es)
+static int ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es)
{
+ int err;
journal_t *journal = EXT4_SB(sb)->s_journal;
if (!ext4_has_feature_journal(sb)) {
- BUG_ON(journal != NULL);
- return;
+ if (journal != NULL) {
+ ext4_error(sb, "Journal got removed while the fs was "
+ "mounted!");
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
jbd2_journal_lock_updates(journal);
- if (jbd2_journal_flush(journal) < 0)
+ err = jbd2_journal_flush(journal);
+ if (err < 0)
goto out;
if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
ext4_clear_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
}
-
out:
jbd2_journal_unlock_updates(journal);
+ return err;
}
/*
@@ -5205,14 +5348,17 @@ out:
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext4_clear_journal_err(struct super_block *sb,
+static int ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es)
{
journal_t *journal;
int j_errno;
const char *errstr;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (!ext4_has_feature_journal(sb)) {
+ ext4_error(sb, "Journal got removed while the fs was mounted!");
+ return -EFSCORRUPTED;
+ }
journal = EXT4_SB(sb)->s_journal;
@@ -5237,6 +5383,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
jbd2_journal_clear_err(journal);
jbd2_journal_update_sb_errno(journal);
}
+ return 0;
}
/*
@@ -5379,7 +5526,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned long old_sb_flags;
+ unsigned long old_sb_flags, vfs_flags;
struct ext4_mount_options old_opts;
int enable_quota = 0;
ext4_group_t g;
@@ -5422,6 +5569,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ /*
+ * Some options can be enabled by ext4 and/or by VFS mount flag
+ * either way we need to make sure it matches in both *flags and
+ * s_flags. Copy those selected flags from *flags to s_flags
+ */
+ vfs_flags = SB_LAZYTIME | SB_I_VERSION;
+ sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
+
if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
goto restore_opts;
@@ -5447,12 +5602,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- err = -EINVAL;
- goto restore_opts;
- }
} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -5468,12 +5617,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
- ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
- "dax flag with busy inodes while remounting");
- sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
- }
-
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
@@ -5487,9 +5630,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
}
- if (*flags & SB_LAZYTIME)
- sb->s_flags |= SB_LAZYTIME;
-
if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
@@ -5519,8 +5659,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
(sbi->s_mount_state & EXT4_VALID_FS))
es->s_state = cpu_to_le16(sbi->s_mount_state);
- if (sbi->s_journal)
+ if (sbi->s_journal) {
+ /*
+ * We let remount-ro finish even if marking fs
+ * as clean failed...
+ */
ext4_mark_recovery_complete(sb, es);
+ }
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
} else {
@@ -5563,13 +5708,24 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
/*
+ * Update the original bdev mapping's wb_err value
+ * which could be used to detect the metadata async
+ * write error.
+ */
+ errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
+
+ /*
* Mounting a RDONLY partition read-write, so reread
* and store the current valid flag. (It may have
* been changed by e2fsck since we originally mounted
* the partition.)
*/
- if (sbi->s_journal)
- ext4_clear_journal_err(sb, es);
+ if (sbi->s_journal) {
+ err = ext4_clear_journal_err(sb, es);
+ if (err)
+ goto restore_opts;
+ }
sbi->s_mount_state = le16_to_cpu(es->s_state);
err = ext4_setup_super(sb, es, 0);
@@ -5599,7 +5755,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_register_li_request(sb, first_not_zeroed);
}
- ext4_setup_system_zone(sb);
+ /*
+ * Handle creation of system zone data early because it can fail.
+ * Releasing of existing data is done when we are sure remount will
+ * succeed.
+ */
+ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) {
+ err = ext4_setup_system_zone(sb);
+ if (err)
+ goto restore_opts;
+ }
+
if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
err = ext4_commit_super(sb, 1);
if (err)
@@ -5620,8 +5786,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ ext4_release_system_zone(sb);
+
+ /*
+ * Some options can be enabled by ext4 and/or by VFS mount flag
+ * either way we need to make sure it matches in both *flags and
+ * s_flags. Copy those selected flags from s_flags to *flags
+ */
+ *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
return 0;
@@ -5635,6 +5809,8 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6c9fc9e21c13..bfabb799fa45 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -189,6 +189,9 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -215,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -226,6 +230,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
+EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count);
+EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode);
EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode);
@@ -240,6 +246,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -257,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(mb_max_inode_prealloc),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
@@ -267,6 +276,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
ATTR_LIST(errors_count),
+ ATTR_LIST(warning_count),
+ ATTR_LIST(msg_count),
ATTR_LIST(first_error_ino),
ATTR_LIST(last_error_ino),
ATTR_LIST(first_error_block),
@@ -283,6 +294,8 @@ static struct attribute *ext4_attrs[] = {
#ifdef CONFIG_EXT4_DEBUG
ATTR_LIST(simulate_fail),
#endif
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index dec1244dd062..bbd5e7e0632b 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp)
handle_t *handle;
int err;
+ if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EINVAL;
+
if (ext4_verity_in_progress(inode))
return -EBUSY;
@@ -241,7 +244,7 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
if (err)
goto out_stop;
ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
out_stop:
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9b29a40738ac..cba4b877c606 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
+ [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
const struct xattr_handler *ext4_xattr_handlers[] = {
@@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
+ &ext4_xattr_hurd_handler,
NULL
};
@@ -1354,8 +1356,7 @@ retry:
block = 0;
while (wsize < bufsize) {
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
csize = (bufsize - wsize) > blocksize ? blocksize :
bufsize - wsize;
bh = ext4_getblk(handle, ea_inode, block, 0);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ffe21ac77f78..730b91fa0dd7 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -124,6 +124,7 @@ struct ext4_xattr_inode_array {
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_hurd_handler;
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
new file mode 100644
index 000000000000..8cfa74a56361
--- /dev/null
+++ b/fs/ext4/xattr_hurd.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/xattr_hurd.c
+ * Handler for extended gnu attributes for the Hurd.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static bool
+ext4_xattr_hurd_list(struct dentry *dentry)
+{
+ return test_opt(dentry->d_sb, XATTR_USER);
+}
+
+static int
+ext4_xattr_hurd_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD,
+ name, buffer, size);
+}
+
+static int
+ext4_xattr_hurd_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_hurd_handler = {
+ .prefix = XATTR_HURD_PREFIX,
+ .list = ext4_xattr_hurd_list,
+ .get = ext4_xattr_hurd_get,
+ .set = ext4_xattr_hurd_set,
+};