From efbe3c2493d2f7a1e1a753780fe727b34709ebd2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:33 -0700 Subject: fs: Remove unneeded IS_DAX() check in io_is_direct() Remove the check because DAX now has it's own read/write methods and file systems which support DAX check IS_DAX() prior to IOCB_DIRECT on their own. Therefore, it does not matter if the file state is DAX when the iocb flags are created. Also remove io_is_direct() as it is just a simple flag check. Reviewed-by: Dave Chinner Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Ira Weiny Signed-off-by: Darrick J. Wong --- drivers/block/loop.c | 6 +++--- include/linux/fs.h | 7 +------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..14372df0f354 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -634,8 +634,8 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) static inline void loop_update_dio(struct loop_device *lo) { - __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | - lo->use_dio); + __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | + lo->use_dio); } static void loop_reread_partitions(struct loop_device *lo, @@ -1028,7 +1028,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); - if (io_is_direct(lo->lo_backing_file) && inode->i_sb->s_bdev) { + if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) { /* In case of direct I/O, match underlying block size */ unsigned short bsize = bdev_logical_block_size( inode->i_sb->s_bdev); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..a87cc5845a02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3394,11 +3394,6 @@ extern void setattr_copy(struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); -static inline bool io_is_direct(struct file *filp) -{ - return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); -} - static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); @@ -3423,7 +3418,7 @@ static inline int iocb_flags(struct file *file) int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; - if (io_is_direct(file)) + if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) res |= IOCB_DSYNC; -- cgit From 712b2698e4c024b561694cbcc1abba13eb0fd9ce Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:34 -0700 Subject: fs/stat: Define DAX statx attribute In order for users to determine if a file is currently operating in DAX state (effective DAX). Define a statx attribute value and set that attribute if the effective DAX flag is set. To go along with this we propose the following addition to the statx man page: STATX_ATTR_DAX The file is in the DAX (cpu direct access) state. DAX state attempts to minimize software cache effects for both I/O and memory mappings of this file. It requires a file system which has been configured to support DAX. DAX generally assumes all accesses are via cpu load / store instructions which can minimize overhead for small accesses, but may adversely affect cpu utilization for large transfers. File I/O is done directly to/from user-space buffers and memory mapped I/O may be performed with direct memory mappings that bypass kernel page cache. While the DAX property tends to result in data being transferred synchronously, it does not give the same guarantees of O_SYNC where data and the necessary metadata are transferred together. A DAX file may support being mapped with the MAP_SYNC flag, which enables a program to use CPU cache flush instructions to persist CPU store operations without an explicit fsync(2). See mmap(2) for more information. Reviewed-by: Dave Chinner Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Ira Weiny Signed-off-by: Darrick J. Wong --- fs/stat.c | 3 +++ include/uapi/linux/stat.h | 1 + 2 files changed, 4 insertions(+) diff --git a/fs/stat.c b/fs/stat.c index 030008796479..894699c74dde 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -79,6 +79,9 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; + if (IS_DAX(inode)) + stat->attributes |= STATX_ATTR_DAX; + if (inode->i_op->getattr) return inode->i_op->getattr(path, stat, request_mask, query_flags); diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index ad80a5c885d5..e5f9d5517f6b 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -169,6 +169,7 @@ struct statx { #define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ #define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ +#define STATX_ATTR_DAX 0x00002000 /* [I] File is DAX */ #endif /* _UAPI_LINUX_STAT_H */ -- cgit From 83d9088659e8f113741bb197324bd9554d159657 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:34 -0700 Subject: Documentation/dax: Update Usage section Update the Usage section to reflect the new individual dax selection functionality. Signed-off-by: Ira Weiny Acked-by: Randy Dunlap Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- Documentation/filesystems/dax.txt | 142 +++++++++++++++++++++++++++++++++++++- 1 file changed, 139 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 679729442fd2..735fb4b54117 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -20,8 +20,144 @@ Usage If you have a block device which supports DAX, you can make a filesystem on it as usual. The DAX code currently only supports files with a block size equal to your kernel's PAGE_SIZE, so you may need to specify a block -size when creating the filesystem. When mounting it, use the "-o dax" -option on the command line or add 'dax' to the options in /etc/fstab. +size when creating the filesystem. + +Currently 3 filesystems support DAX: ext2, ext4 and xfs. Enabling DAX on them +is different. + +Enabling DAX on ext4 and ext2 +----------------------------- + +When mounting the filesystem, use the "-o dax" option on the command line or +add 'dax' to the options in /etc/fstab. This works to enable DAX on all files +within the filesystem. It is equivalent to the '-o dax=always' behavior below. + + +Enabling DAX on xfs +------------------- + +Summary +------- + + 1. There exists an in-kernel file access mode flag S_DAX that corresponds to + the statx flag STATX_ATTR_DAX. See the manpage for statx(2) for details + about this access mode. + + 2. There exists a persistent flag FS_XFLAG_DAX that can be applied to regular + files and directories. This advisory flag can be set or cleared at any + time, but doing so does not immediately affect the S_DAX state. + + 3. If the persistent FS_XFLAG_DAX flag is set on a directory, this flag will + be inherited by all regular files and subdirectories that are subsequently + created in this directory. Files and subdirectories that exist at the time + this flag is set or cleared on the parent directory are not modified by + this modification of the parent directory. + + 4. There exist dax mount options which can override FS_XFLAG_DAX in the + setting of the S_DAX flag. Given underlying storage which supports DAX the + following hold: + + "-o dax=inode" means "follow FS_XFLAG_DAX" and is the default. + + "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX." + + "-o dax=always" means "always set S_DAX ignore FS_XFLAG_DAX." + + "-o dax" is a legacy option which is an alias for "dax=always". + This may be removed in the future so "-o dax=always" is + the preferred method for specifying this behavior. + + NOTE: Modifications to and the inheritance behavior of FS_XFLAG_DAX remain + the same even when the filesystem is mounted with a dax option. However, + in-core inode state (S_DAX) will be overridden until the filesystem is + remounted with dax=inode and the inode is evicted from kernel memory. + + 5. The S_DAX policy can be changed via: + + a) Setting the parent directory FS_XFLAG_DAX as needed before files are + created + + b) Setting the appropriate dax="foo" mount option + + c) Changing the FS_XFLAG_DAX flag on existing regular files and + directories. This has runtime constraints and limitations that are + described in 6) below. + + 6. When changing the S_DAX policy via toggling the persistent FS_XFLAG_DAX flag, + the change in behaviour for existing regular files may not occur + immediately. If the change must take effect immediately, the administrator + needs to: + + a) stop the application so there are no active references to the data set + the policy change will affect + + b) evict the data set from kernel caches so it will be re-instantiated when + the application is restarted. This can be achieved by: + + i. drop-caches + ii. a filesystem unmount and mount cycle + iii. a system reboot + + +Details +------- + +There are 2 per-file dax flags. One is a persistent inode setting (FS_XFLAG_DAX) +and the other is a volatile flag indicating the active state of the feature +(S_DAX). + +FS_XFLAG_DAX is preserved within the filesystem. This persistent config +setting can be set, cleared and/or queried using the FS_IOC_FS[GS]ETXATTR ioctl +(see ioctl_xfs_fsgetxattr(2)) or an utility such as 'xfs_io'. + +New files and directories automatically inherit FS_XFLAG_DAX from +their parent directory _when_ _created_. Therefore, setting FS_XFLAG_DAX at +directory creation time can be used to set a default behavior for an entire +sub-tree. + +To clarify inheritance, here are 3 examples: + +Example A: + +mkdir -p a/b/c +xfs_io -c 'chattr +x' a +mkdir a/b/c/d +mkdir a/e + + dax: a,e + no dax: b,c,d + +Example B: + +mkdir a +xfs_io -c 'chattr +x' a +mkdir -p a/b/c/d + + dax: a,b,c,d + no dax: + +Example C: + +mkdir -p a/b/c +xfs_io -c 'chattr +x' c +mkdir a/b/c/d + + dax: c,d + no dax: a,b + + +The current enabled state (S_DAX) is set when a file inode is instantiated in +memory by the kernel. It is set based on the underlying media support, the +value of FS_XFLAG_DAX and the filesystem's dax mount option. + +statx can be used to query S_DAX. NOTE that only regular files will ever have +S_DAX set and therefore statx will never indicate that S_DAX is set on +directories. + +Setting the FS_XFLAG_DAX flag (specifically or through inheritance) occurs even +if the underlying media does not support dax and/or the filesystem is +overridden with a mount option. + Implementation Tips for Block Driver Writers @@ -94,7 +230,7 @@ sysadmins have an option to restore the lost data from a prior backup/inbuilt redundancy in the following ways: 1. Delete the affected file, and restore from a backup (sysadmin route): - This will free the file system blocks that were being used by the file, + This will free the filesystem blocks that were being used by the file, and the next time they're allocated, they will be zeroed first, which happens through the driver, and will clear bad sectors. -- cgit From dae2f8ed7992e88c8d62c54e8295ffc8475b4a80 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:37 -0700 Subject: fs: Lift XFS_IDONTCACHE to the VFS layer DAX effective mode (S_DAX) changes requires inode eviction. XFS has an advisory flag (XFS_IDONTCACHE) to prevent caching of the inode if no other additional references are taken. We lift this flag to the VFS layer and change the behavior slightly by allowing the flag to remain even if multiple references are taken. This will expedite the eviction of inodes to change S_DAX. Cc: Al Viro Reviewed-by: Dave Chinner Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 4 ++-- fs/xfs/xfs_inode.h | 3 +-- fs/xfs/xfs_super.c | 2 +- include/linux/fs.h | 6 +++++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8bf1d15be3f6..fa8554b3308d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -479,7 +479,7 @@ xfs_iget_cache_hit( xfs_ilock(ip, lock_flags); if (!(flags & XFS_IGET_INCORE)) - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + xfs_iflags_clear(ip, XFS_ISTALE); XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -561,7 +561,7 @@ xfs_iget_cache_miss( */ iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; + VFS_I(ip)->i_state |= I_DONTCACHE; ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c6a63f6764a6..95209e3256a2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -218,8 +218,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) -#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ -#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 424bb9a2d532..a7f8e387ebfe 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -702,7 +702,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); + return generic_drop_inode(inode); } static void diff --git a/include/linux/fs.h b/include/linux/fs.h index a87cc5845a02..44bd45af760f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2156,6 +2156,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_CREATING New object's inode in the middle of setting up. * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2178,6 +2180,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) +#define I_DONTCACHE (1 << 16) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) @@ -3049,7 +3052,8 @@ extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || inode_unhashed(inode); + return !inode->i_nlink || inode_unhashed(inode) || + (inode->i_state & I_DONTCACHE); } extern struct inode *ilookup5_nowait(struct super_block *sb, -- cgit From 2c567af418e3f9380c2051aada58b4e5a4b5c2ad Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:37 -0700 Subject: fs: Introduce DCACHE_DONTCACHE DCACHE_DONTCACHE indicates a dentry should not be cached on final dput(). Also add a helper function to mark DCACHE_DONTCACHE on all dentries pointing to a specific inode when that inode is being set I_DONTCACHE. This facilitates dropping dentry references to inodes sooner which require eviction to swap S_DAX mode. Cc: Al Viro Signed-off-by: Ira Weiny Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/dcache.c | 19 +++++++++++++++++++ fs/xfs/xfs_icache.c | 2 +- include/linux/dcache.h | 2 ++ include/linux/fs.h | 1 + 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..0d07fb335b78 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -647,6 +647,10 @@ static inline bool retain_dentry(struct dentry *dentry) if (dentry->d_op->d_delete(dentry)) return false; } + + if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) + return false; + /* retain; LRU fodder */ dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) @@ -656,6 +660,21 @@ static inline bool retain_dentry(struct dentry *dentry) return true; } +void d_mark_dontcache(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { + spin_lock(&de->d_lock); + de->d_flags |= DCACHE_DONTCACHE; + spin_unlock(&de->d_lock); + } + inode->i_state |= I_DONTCACHE; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_mark_dontcache); + /* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fa8554b3308d..d7deab608e2c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -561,7 +561,7 @@ xfs_iget_cache_miss( */ iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) - VFS_I(ip)->i_state |= I_DONTCACHE; + d_mark_dontcache(VFS_I(ip)); ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c1488cc84fd9..a81f0c3cf352 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -177,6 +177,8 @@ struct dentry_operations { #define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */ +#define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */ + #define DCACHE_CANT_MOUNT 0x00000100 #define DCACHE_GENOCIDE 0x00000200 #define DCACHE_SHRINK_LIST 0x00000400 diff --git a/include/linux/fs.h b/include/linux/fs.h index 44bd45af760f..7c3e8c0306e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3055,6 +3055,7 @@ static inline int generic_drop_inode(struct inode *inode) return !inode->i_nlink || inode_unhashed(inode) || (inode->i_state & I_DONTCACHE); } +extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), -- cgit From ff694ab60c29cfeba81b3d5068d3c908f22110ed Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:55 -0700 Subject: fs/ext4: Narrow scope of DAX check in setflags When preventing DAX and journaling on an inode. Use the effective DAX check rather than the mount option. This will be required to support per inode DAX flags. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-2-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfc1281fc4cb..5813e5e73eab 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -393,9 +393,9 @@ flags_err: if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to - * S_DAX if we are using the DAX mount option. + * S_DAX if the inode is DAX */ - if (test_opt(inode->i_sb, DAX)) { + if (IS_DAX(inode)) { err = -EBUSY; goto flags_out; } -- cgit From 6c0d077ff8200a7b8e7131ca8e25315dec243a60 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:56 -0700 Subject: fs/ext4: Disallow verity if inode is DAX Verity and DAX are incompatible. Changing the DAX mode due to a verity flag change is wrong without a corresponding address_space_operations update. Make the 2 options mutually exclusive by returning an error if DAX was set first. (Setting DAX is already disabled if Verity is set first.) Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-3-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/verity.rst | 3 +++ fs/ext4/verity.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Documentation/filesystems/ext4/verity.rst b/Documentation/filesystems/ext4/verity.rst index 3e4c0ee0e068..e99ff3fd09f7 100644 --- a/Documentation/filesystems/ext4/verity.rst +++ b/Documentation/filesystems/ext4/verity.rst @@ -39,3 +39,6 @@ is encrypted as well as the data itself. Verity files cannot have blocks allocated past the end of the verity metadata. + +Verity and DAX are not compatible and attempts to set both of these flags +on a file will fail. diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..f05a09fb2ae4 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp) handle_t *handle; int err; + if (IS_DAX(inode)) + return -EINVAL; + if (ext4_verity_in_progress(inode)) return -EBUSY; -- cgit From fc626fe322f13ecdfac58cbfa6cfea797ed22623 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:57 -0700 Subject: fs/ext4: Change EXT4_MOUNT_DAX to EXT4_MOUNT_DAX_ALWAYS In prep for the new tri-state mount option which then introduces EXT4_MOUNT_DAX_NEVER. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-4-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/super.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..1a3daf2d18ef 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1123,9 +1123,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX -#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else -#define EXT4_MOUNT_DAX 0 +#define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..a10ff12194db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4400,7 +4400,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) static bool ext4_should_use_dax(struct inode *inode) { - if (!test_opt(inode->i_sb, DAX)) + if (!test_opt(inode->i_sb, DAX_ALWAYS)) return false; if (!S_ISREG(inode->i_mode)) return false; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..7b99c44d0a91 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1775,7 +1775,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -3982,7 +3982,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); goto failed_mount; @@ -4092,7 +4092,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); @@ -5412,7 +5412,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); err = -EINVAL; @@ -5433,10 +5433,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX; + sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- cgit From a8ab6d3885ef5e2300d683b79a9e1999403eefd9 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:58 -0700 Subject: fs/ext4: Update ext4_should_use_dax() S_DAX should only be enabled when the underlying block device supports dax. Cache the underlying support for DAX in the super block and modify ext4_should_use_dax() to check for device support prior to the over riding mount option. While we are at it change the function to ext4_should_enable_dax() as this better reflects the ask as well as matches xfs. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-5-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 15 ++++++++++----- fs/ext4/super.c | 5 ++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1a3daf2d18ef..0b4db9ce7756 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1979,6 +1979,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) */ #define EXT4_FLAGS_RESIZING 0 #define EXT4_FLAGS_SHUTDOWN 1 +#define EXT4_FLAGS_BDEV_IS_DAX 2 static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a10ff12194db..6532870f6a0b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4398,10 +4398,10 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } -static bool ext4_should_use_dax(struct inode *inode) +static bool ext4_should_enable_dax(struct inode *inode) { - if (!test_opt(inode->i_sb, DAX_ALWAYS)) - return false; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) @@ -4412,7 +4412,12 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; - return true; + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) + return false; + if (test_opt(inode->i_sb, DAX_ALWAYS)) + return true; + + return false; } void ext4_set_inode_flags(struct inode *inode) @@ -4430,7 +4435,7 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (ext4_should_use_dax(inode)) + if (ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7b99c44d0a91..f7d76dcaedfe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4092,13 +4092,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (bdev_dax_supported(sb->s_bdev, blocksize)) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); goto failed_mount; } - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); goto failed_mount; -- cgit From 043546e46dc70c25ff7e2cf6d09cbb0424fc9978 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:59 -0700 Subject: fs/ext4: Only change S_DAX on inode load To prevent complications with in memory inodes we only set S_DAX on inode load. FS_XFLAG_DAX can be changed at any time and S_DAX will change after inode eviction and reload. Add init bool to ext4_set_inode_flags() to indicate if the inode is being newly initialized. Assert that S_DAX is not set on an inode which is just being loaded. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-6-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 13 ++++++++++--- fs/ext4/ioctl.c | 3 ++- fs/ext4/super.c | 4 ++-- fs/ext4/verity.c | 2 +- 6 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0b4db9ce7756..f5291693ce6e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2693,7 +2693,7 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern void ext4_set_inode_flags(struct inode *); +extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..7941c140723f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1116,7 +1116,7 @@ got: ei->i_block_group = group; ei->i_last_alloc_group = ~0; - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6532870f6a0b..01636cf5f322 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4420,11 +4420,13 @@ static bool ext4_should_enable_dax(struct inode *inode) return false; } -void ext4_set_inode_flags(struct inode *inode) +void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; + WARN_ON_ONCE(IS_DAX(inode) && init); + if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) @@ -4435,8 +4437,13 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (ext4_should_enable_dax(inode)) + + /* Because of the way inode_set_flags() works we must preserve S_DAX + * here if already set. */ + new_fl |= (inode->i_flags & S_DAX); + if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) @@ -4650,7 +4657,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5813e5e73eab..145083e8cd1e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -381,7 +381,8 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_clear_inode_flag(inode, i); } - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f7d76dcaedfe..80eb814c47eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1348,7 +1348,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); } return res; } @@ -1375,7 +1375,7 @@ retry: * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index f05a09fb2ae4..89a155ece323 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -244,7 +244,7 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc, if (err) goto out_stop; ext4_set_inode_flag(inode, EXT4_INODE_VERITY); - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); err = ext4_mark_iloc_dirty(handle, inode, &iloc); } out_stop: -- cgit From 9cb20f94afcd2964944f9468e38da736ee855b19 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:00 -0700 Subject: fs/ext4: Make DAX mount option a tri-state We add 'always', 'never', and 'inode' (default). '-o dax' continues to operate the same which is equivalent to 'always'. This new functionality is limited to ext4 only. Specifically we introduce a 2nd DAX mount flag EXT4_MOUNT2_DAX_NEVER and set it and EXT4_MOUNT_DAX_ALWAYS appropriately for the mode. We also force EXT4_MOUNT2_DAX_NEVER if !CONFIG_FS_DAX. Finally, EXT4_MOUNT2_DAX_INODE is used solely to detect if the user specified that option for printing. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-7-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 2 ++ fs/ext4/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f5291693ce6e..65ffb831b2b9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1168,6 +1168,8 @@ struct ext4_inode_info { blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 01636cf5f322..68fac9289109 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4402,6 +4402,8 @@ static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + if (test_opt2(inode->i_sb, DAX_NEVER)) + return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80eb814c47eb..5e056aa20ce9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1512,7 +1512,8 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, @@ -1579,6 +1580,9 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_dax, "dax"}, + {Opt_dax_always, "dax=always"}, + {Opt_dax_inode, "dax=inode"}, + {Opt_dax_never, "dax=never"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_warn_on_error, "warn_on_error"}, @@ -1726,6 +1730,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_STRING 0x0400 +#define MOPT_SKIP 0x0800 static const struct mount_opts { int token; @@ -1775,7 +1780,13 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, + {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -2084,13 +2095,32 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif - } else if (token == Opt_dax) { + } else if (token == Opt_dax || token == Opt_dax_always || + token == Opt_dax_inode || token == Opt_dax_never) { #ifdef CONFIG_FS_DAX - ext4_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; + switch (token) { + case Opt_dax: + case Opt_dax_always: + ext4_msg(sb, KERN_WARNING, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + break; + case Opt_dax_never: + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + break; + case Opt_dax_inode: + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + /* Strictly for printing options */ + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE; + break; + } #else ext4_msg(sb, KERN_INFO, "dax option not supported"); + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; return -1; #endif } else if (token == Opt_data_err_abort) { @@ -2254,7 +2284,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || - (m->flags & MOPT_CLEAR_ERR)) + (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) continue; if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ @@ -2314,6 +2344,17 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (DUMMY_ENCRYPTION_ENABLED(sbi)) SEQ_OPTS_PUTS("test_dummy_encryption"); + if (test_opt(sb, DAX_ALWAYS)) { + if (IS_EXT2_SB(sb)) + SEQ_OPTS_PUTS("dax"); + else + SEQ_OPTS_PUTS("dax=always"); + } else if (test_opt2(sb, DAX_NEVER)) { + SEQ_OPTS_PUTS("dax=never"); + } else if (test_opt2(sb, DAX_INODE)) { + SEQ_OPTS_PUTS("dax=inode"); + } + ext4_show_quota_options(seq, sb); return 0; } @@ -5436,10 +5477,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS || + (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_NEVER || + (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_INODE) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " - "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS; + "dax mount option with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt |= old_opts.s_mount_opt & EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~(EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); + sbi->s_mount_opt2 |= old_opts.s_mount_opt2 & + (EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- cgit From fcebc7949cd2ff97407e5b77ed99a7211674c6de Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:01 -0700 Subject: fs/ext4: Remove jflag variable The jflag variable serves almost no purpose. Remove it. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-8-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 145083e8cd1e..779631e8e849 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -300,7 +300,6 @@ static int ext4_ioctl_setflags(struct inode *inode, int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; - unsigned int jflag; struct super_block *sb = inode->i_sb; /* Is it quota file? Do not allow user to mess with it */ @@ -309,9 +308,6 @@ static int ext4_ioctl_setflags(struct inode *inode, oldflags = ei->i_flags; - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - err = vfs_ioc_setflags_prepare(inode, oldflags, flags); if (err) goto flags_out; @@ -320,7 +316,7 @@ static int ext4_ioctl_setflags(struct inode *inode, * The JOURNAL_DATA flag can only be changed by * the relevant capability. */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } @@ -391,7 +387,7 @@ flags_err: if (err) goto flags_out; - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to * S_DAX if the inode is DAX @@ -401,7 +397,8 @@ flags_err: goto flags_out; } - err = ext4_change_inode_journal_flag(inode, jflag); + err = ext4_change_inode_journal_flag(inode, + flags & EXT4_JOURNAL_DATA_FL); if (err) goto flags_out; } -- cgit From b383a73f2b832491a2f9e6e8ada26aad53b5763d Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:02 -0700 Subject: fs/ext4: Introduce DAX inode flag Add a flag ([EXT4|FS]_DAX_FL) to preserve FS_XFLAG_DAX in the ext4 inode. Set the flag to be user visible and changeable. Set the flag to be inherited. Allow applications to change the flag at any time except if it conflicts with the set of mutually exclusive flags (Currently VERITY, ENCRYPT, JOURNAL_DATA). Furthermore, restrict setting any of the exclusive flags if DAX is set. While conceptually possible, we do not allow setting EXT4_DAX_FL while at the same time clearing exclusion flags (or vice versa) for 2 reasons: 1) The DAX flag does not take effect immediately which introduces quite a bit of complexity 2) There is no clear use case for being this flexible Finally, on regular files, flag the inode to not be cached to facilitate changing S_DAX on the next creation of the inode. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-9-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 18 ++++++++++++++---- fs/ext4/inode.c | 2 +- fs/ext4/ioctl.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/super.c | 3 +++ fs/ext4/verity.c | 2 +- include/uapi/linux/fs.h | 1 + 6 files changed, 66 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 65ffb831b2b9..598e00a9453f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -415,13 +415,16 @@ struct flex_groups { #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ @@ -429,14 +432,16 @@ struct flex_groups { EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ - EXT4_PROJINHERIT_FL) + EXT4_PROJINHERIT_FL | \ + EXT4_DAX_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ - EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ @@ -448,6 +453,10 @@ struct flex_groups { /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -488,6 +497,7 @@ enum { EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 68fac9289109..778b0dbe3da6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4419,7 +4419,7 @@ static bool ext4_should_enable_dax(struct inode *inode) if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; - return false; + return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 779631e8e849..1b520d07d371 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, return 0; } +static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (S_ISDIR(inode->i_mode)) + return; + + if (test_opt2(inode->i_sb, DAX_NEVER) || + test_opt(inode->i_sb, DAX_ALWAYS)) + return; + + if ((ei->i_flags ^ flags) & EXT4_DAX_FL) + d_mark_dontcache(inode); +} + +static bool dax_compatible(struct inode *inode, unsigned int oldflags, + unsigned int flags) +{ + if (flags & EXT4_DAX_FL) { + if ((oldflags & EXT4_DAX_MUT_EXCL) || + ext4_test_inode_state(inode, + EXT4_STATE_VERITY_IN_PROGRESS)) { + return false; + } + } + + if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) + return false; + + return true; +} + static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { @@ -320,6 +352,12 @@ static int ext4_ioctl_setflags(struct inode *inode, if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } + + if (!dax_compatible(inode, oldflags, flags)) { + err = -EOPNOTSUPP; + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; @@ -365,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode, if (err) goto flags_err; + ext4_dax_dontcache(inode, flags); + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; @@ -525,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) xflags |= FS_XFLAG_NOATIME; if (iflags & EXT4_PROJINHERIT_FL) xflags |= FS_XFLAG_PROJINHERIT; + if (iflags & EXT4_DAX_FL) + xflags |= FS_XFLAG_DAX; return xflags; } #define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ - FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \ + FS_XFLAG_DAX) /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) @@ -549,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) iflags |= EXT4_NOATIME_FL; if (xflags & FS_XFLAG_PROJINHERIT) iflags |= EXT4_PROJINHERIT_FL; + if (xflags & FS_XFLAG_DAX) + iflags |= EXT4_DAX_FL; return iflags; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e056aa20ce9..3658e3016999 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1323,6 +1323,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + res = ext4_convert_inline_data(inode); if (res) return res; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 89a155ece323..4fecb3e4e338 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -113,7 +113,7 @@ static int ext4_begin_enable_verity(struct file *filp) handle_t *handle; int err; - if (IS_DAX(inode)) + if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX)) return -EINVAL; if (ext4_verity_in_progress(inode)) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 379a612f8f1d..f44eb0a04afd 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -262,6 +262,7 @@ struct fsxattr { #define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#define FS_DAX_FL 0x02000000 /* Inode is DAX */ #define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define FS_CASEFOLD_FL 0x40000000 /* Folder is case insensitive */ -- cgit From 15ee65676f1e2bbb237e9ecd317b45d8eba29237 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:03 -0700 Subject: Documentation/dax: Update DAX enablement for ext4 Update the document to reflect ext4 and xfs now behave the same. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-10-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/dax.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 735fb4b54117..265c4f808dbf 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -25,7 +25,7 @@ size when creating the filesystem. Currently 3 filesystems support DAX: ext2, ext4 and xfs. Enabling DAX on them is different. -Enabling DAX on ext4 and ext2 +Enabling DAX on ext2 ----------------------------- When mounting the filesystem, use the "-o dax" option on the command line or @@ -33,8 +33,8 @@ add 'dax' to the options in /etc/fstab. This works to enable DAX on all files within the filesystem. It is equivalent to the '-o dax=always' behavior below. -Enabling DAX on xfs -------------------- +Enabling DAX on xfs and ext4 +---------------------------- Summary ------- -- cgit