summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-03 20:33:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-03 20:33:41 -0700
commit725737e7c21d2d25a4312c2aaa82a52bd03e3126 (patch)
tree85378eed2b05ec4c176afd8b0b33faab538084a3
parent5779aa2dac9a8dcad89b3774ee354de8b453ab21 (diff)
parent61a223df421f698c253143014cfd384255b3cf1e (diff)
Merge tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux
Pull STATX_DIOALIGN support from Eric Biggers: "Make statx() support reporting direct I/O (DIO) alignment information. This provides a generic interface for userspace programs to determine whether a file supports DIO, and if so with what alignment restrictions. Specifically, STATX_DIOALIGN works on block devices, and on regular files when their containing filesystem has implemented support. An interface like this has been requested for years, since the conditions for when DIO is supported in Linux have gotten increasingly complex over time. Today, DIO support and alignment requirements can be affected by various filesystem features such as multi-device support, data journalling, inline data, encryption, verity, compression, checkpoint disabling, log-structured mode, etc. Further complicating things, Linux v6.0 relaxed the traditional rule of DIO needing to be aligned to the block device's logical block size; now user buffers (but not file offsets) only need to be aligned to the DMA alignment. The approach of uplifting the XFS specific ioctl XFS_IOC_DIOINFO was discarded in favor of creating a clean new interface with statx(). For more information, see the individual commits and the man page update[1]" Link: https://lore.kernel.org/r/20220722074229.148925-1-ebiggers@kernel.org [1] * tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux: xfs: support STATX_DIOALIGN f2fs: support STATX_DIOALIGN f2fs: simplify f2fs_force_buffered_io() f2fs: move f2fs_force_buffered_io() into file.c ext4: support STATX_DIOALIGN fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN vfs: support STATX_DIOALIGN on block devices statx: add direct I/O alignment information
-rw-r--r--block/bdev.c23
-rw-r--r--fs/crypto/inline_crypt.c49
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/file.c37
-rw-r--r--fs/ext4/inode.c37
-rw-r--r--fs/f2fs/f2fs.h40
-rw-r--r--fs/f2fs/file.c43
-rw-r--r--fs/stat.c14
-rw-r--r--fs/xfs/xfs_iops.c10
-rw-r--r--include/linux/blkdev.h4
-rw-r--r--include/linux/fscrypt.h7
-rw-r--r--include/linux/stat.h2
-rw-r--r--include/uapi/linux/stat.h4
13 files changed, 188 insertions, 83 deletions
diff --git a/block/bdev.c b/block/bdev.c
index ce05175e71ce..d699ecdb3260 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -26,6 +26,7 @@
#include <linux/namei.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
+#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"
@@ -1069,3 +1070,25 @@ void sync_bdevs(bool wait)
spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
+
+/*
+ * Handle STATX_DIOALIGN for block devices.
+ *
+ * Note that the inode passed to this is the inode of a block device node file,
+ * not the block device's internal inode. Therefore it is *not* valid to use
+ * I_BDEV() here; the block device has to be looked up by i_rdev instead.
+ */
+void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+{
+ struct block_device *bdev;
+
+ bdev = blkdev_get_no_open(inode->i_rdev);
+ if (!bdev)
+ return;
+
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ stat->result_mask |= STATX_DIOALIGN;
+
+ blkdev_put_no_open(bdev);
+}
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index c40bd55bc781..cea8b14007e6 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -396,46 +396,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
/**
- * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
- * supported as far as encryption is concerned
- * @iocb: the file and position the I/O is targeting
- * @iter: the I/O data segment(s)
+ * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
+ * inode, as far as encryption is concerned
+ * @inode: the inode in question
*
* Return: %true if there are no encryption constraints that prevent DIO from
* being supported; %false if DIO is unsupported. (Note that in the
* %true case, the filesystem might have other, non-encryption-related
- * constraints that prevent DIO from actually being supported.)
+ * constraints that prevent DIO from actually being supported. Also, on
+ * encrypted files the filesystem is still responsible for only allowing
+ * DIO when requests are filesystem-block-aligned.)
*/
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+bool fscrypt_dio_supported(struct inode *inode)
{
- const struct inode *inode = file_inode(iocb->ki_filp);
- const unsigned int blocksize = i_blocksize(inode);
+ int err;
/* If the file is unencrypted, no veto from us. */
if (!fscrypt_needs_contents_encryption(inode))
return true;
- /* We only support DIO with inline crypto, not fs-layer crypto. */
- if (!fscrypt_inode_uses_inline_crypto(inode))
- return false;
-
/*
- * Since the granularity of encryption is filesystem blocks, the file
- * position and total I/O length must be aligned to the filesystem block
- * size -- not just to the block device's logical block size as is
- * traditionally the case for DIO on many filesystems.
+ * We only support DIO with inline crypto, not fs-layer crypto.
*
- * We require that the user-provided memory buffers be filesystem block
- * aligned too. It is simpler to have a single alignment value required
- * for all properties of the I/O, as is normally the case for DIO.
- * Also, allowing less aligned buffers would imply that data units could
- * cross bvecs, which would greatly complicate the I/O stack, which
- * assumes that bios can be split at any bvec boundary.
+ * To determine whether the inode is using inline crypto, we have to set
+ * up the key if it wasn't already done. This is because in the current
+ * design of fscrypt, the decision of whether to use inline crypto or
+ * not isn't made until the inode's encryption key is being set up. In
+ * the DIO read/write case, the key will always be set up already, since
+ * the file will be open. But in the case of statx(), the key might not
+ * be set up yet, as the file might not have been opened yet.
*/
- if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ err = fscrypt_require_key(inode);
+ if (err) {
+ /*
+ * Key unavailable or couldn't be set up. This edge case isn't
+ * worth worrying about; just report that DIO is unsupported.
+ */
return false;
-
- return true;
+ }
+ return fscrypt_inode_uses_inline_crypto(inode);
}
EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3bf9a6926798..e5f2f5ca5120 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2977,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct user_namespace *, struct dentry *,
struct iattr *);
+extern u32 ext4_dio_alignment(struct inode *inode);
extern int ext4_getattr(struct user_namespace *, const struct path *,
struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 109d07629f81..8bb1c35fd6dd 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,19 +36,34 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+/*
+ * Returns %true if the given DIO request should be attempted with DIO, or
+ * %false if it should fall back to buffered I/O.
+ *
+ * DIO isn't well specified; when it's unsupported (either due to the request
+ * being misaligned, or due to the file not supporting DIO at all), filesystems
+ * either fall back to buffered I/O or return EINVAL. For files that don't use
+ * any special features like encryption or verity, ext4 has traditionally
+ * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
+ * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
+ *
+ * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
+ * traditionally falls back to buffered I/O.
+ *
+ * This function implements the traditional ext4 behavior in all these cases.
+ */
+static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ u32 dio_align = ext4_dio_alignment(inode);
- if (!fscrypt_dio_supported(iocb, iter))
- return false;
- if (fsverity_active(inode))
+ if (dio_align == 0)
return false;
- if (ext4_should_journal_data(inode))
- return false;
- if (ext4_has_inline_data(inode))
- return false;
- return true;
+
+ if (dio_align == 1)
+ return true;
+
+ return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(iocb, to)) {
+ if (!ext4_should_use_dio(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(iocb, from)) {
+ if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 601214453c3a..364774230d87 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5550,6 +5550,22 @@ err_out:
return error;
}
+u32 ext4_dio_alignment(struct inode *inode)
+{
+ if (fsverity_active(inode))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ if (ext4_has_inline_data(inode))
+ return 0;
+ if (IS_ENCRYPTED(inode)) {
+ if (!fscrypt_dio_supported(inode))
+ return 0;
+ return i_blocksize(inode);
+ }
+ return 1; /* use the iomap defaults */
+}
+
int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -5565,6 +5581,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (dio_align == 1) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ /* iomap defaults */
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ } else {
+ stat->dio_mem_align = dio_align;
+ stat->dio_offset_align = dio_align;
+ }
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 3c7cdb70fe2e..aea816a133a8 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4471,17 +4471,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
-static inline int block_unaligned_IO(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
- unsigned int blocksize_mask = (1 << i_blkbits) - 1;
- loff_t offset = iocb->ki_pos;
- unsigned long align = offset | iov_iter_alignment(iter);
-
- return align & blocksize_mask;
-}
-
static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
int flag)
{
@@ -4492,35 +4481,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
return sbi->aligned_blksize;
}
-static inline bool f2fs_force_buffered_io(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int rw = iov_iter_rw(iter);
-
- if (!fscrypt_dio_supported(iocb, iter))
- return true;
- if (fsverity_active(inode))
- return true;
- if (f2fs_compressed_file(inode))
- return true;
-
- /* disallow direct IO if any of devices has unaligned blksize */
- if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
- return true;
-
- if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
- if (block_unaligned_IO(inode, iocb, iter))
- return true;
- if (F2FS_IO_ALIGNED(sbi))
- return true;
- }
- if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
- return true;
-
- return false;
-}
-
static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
{
return fsverity_active(inode) &&
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ce4905a073b3..791770507328 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -808,6 +808,29 @@ int f2fs_truncate(struct inode *inode)
return 0;
}
+static bool f2fs_force_buffered_io(struct inode *inode, int rw)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!fscrypt_dio_supported(inode))
+ return true;
+ if (fsverity_active(inode))
+ return true;
+ if (f2fs_compressed_file(inode))
+ return true;
+
+ /* disallow direct IO if any of devices has unaligned blksize */
+ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
+ return true;
+
+ if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
+ return true;
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ return true;
+
+ return false;
+}
+
int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -824,6 +847,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ *
+ * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN
+ * cannot represent that, so in that case we report no DIO support.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ unsigned int bsize = i_blocksize(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (!f2fs_force_buffered_io(inode, WRITE)) {
+ stat->dio_mem_align = bsize;
+ stat->dio_offset_align = bsize;
+ }
+ }
+
flags = fi->i_flags;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
@@ -4182,7 +4223,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
if (!(iocb->ki_flags & IOCB_DIRECT))
return false;
- if (f2fs_force_buffered_io(inode, iocb, iter))
+ if (f2fs_force_buffered_io(inode, iov_iter_rw(iter)))
return false;
/*
diff --git a/fs/stat.c b/fs/stat.c
index 9ced8860e0f3..ef50573c72a2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -5,6 +5,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
@@ -230,11 +231,22 @@ retry:
goto out;
error = vfs_getattr(&path, stat, request_mask, flags);
+
stat->mnt_id = real_mount(path.mnt)->mnt_id;
stat->result_mask |= STATX_MNT_ID;
+
if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+ /* Handle STATX_DIOALIGN for block devices. */
+ if (request_mask & STATX_DIOALIGN) {
+ struct inode *inode = d_backing_inode(path.dentry);
+
+ if (S_ISBLK(inode->i_mode))
+ bdev_statx_dioalign(inode, stat);
+ }
+
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -611,6 +623,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dev_major = MAJOR(stat->dev);
tmp.stx_dev_minor = MINOR(stat->dev);
tmp.stx_mnt_id = stat->mnt_id;
+ tmp.stx_dio_mem_align = stat->dio_mem_align;
+ tmp.stx_dio_offset_align = stat->dio_offset_align;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 45518b8c613c..f51c60d7e205 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -604,6 +604,16 @@ xfs_vn_getattr(
stat->blksize = BLKDEV_IOSIZE;
stat->rdev = inode->i_rdev;
break;
+ case S_IFREG:
+ if (request_mask & STATX_DIOALIGN) {
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 84b13fdd34a7..8038c5fbde40 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1498,6 +1498,7 @@ int sync_blockdev(struct block_device *bdev);
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
int sync_blockdev_nowait(struct block_device *bdev);
void sync_bdevs(bool wait);
+void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
void printk_all_partitions(void);
#else
static inline void invalidate_bdev(struct block_device *bdev)
@@ -1514,6 +1515,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev)
static inline void sync_bdevs(bool wait)
{
}
+static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+{
+}
static inline void printk_all_partitions(void)
{
}
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 1f12ebb4a69d..cad78b569c7e 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -764,7 +764,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
bool fscrypt_mergeable_bio_bh(struct bio *bio,
const struct buffer_head *next_bh);
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter);
+bool fscrypt_dio_supported(struct inode *inode);
u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);
@@ -797,11 +797,8 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio,
return true;
}
-static inline bool fscrypt_dio_supported(struct kiocb *iocb,
- struct iov_iter *iter)
+static inline bool fscrypt_dio_supported(struct inode *inode)
{
- const struct inode *inode = file_inode(iocb->ki_filp);
-
return !fscrypt_needs_contents_encryption(inode);
}
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 7df06931f25d..ff277ced50e9 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -50,6 +50,8 @@ struct kstat {
struct timespec64 btime; /* File creation time */
u64 blocks;
u64 mnt_id;
+ u32 dio_mem_align;
+ u32 dio_offset_align;
};
#endif
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 1500a0f58041..7cab2c65d3d7 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -124,7 +124,8 @@ struct statx {
__u32 stx_dev_minor;
/* 0x90 */
__u64 stx_mnt_id;
- __u64 __spare2;
+ __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */
+ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */
/* 0xa0 */
__u64 __spare3[12]; /* Spare space for future expansion */
/* 0x100 */
@@ -152,6 +153,7 @@ struct statx {
#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
#define STATX_BTIME 0x00000800U /* Want/got stx_btime */
#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
+#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */