summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-08-02 13:27:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-08-02 13:27:23 -0700
commit98e247464088a11ce2328a214fdb87d4c06f8db6 (patch)
tree1f876a9b06b189855adfbefd1da6b6753effd1bb /fs
parentb349b1181d24af1c151134a3c39725e94a5619dd (diff)
parent0dd316ba8692c2374fbb82cce57c0b23144f2977 (diff)
Merge tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block
Pull io_uring buffered writes support from Jens Axboe: "This contains support for buffered writes, specifically for XFS. btrfs is in progress, will be coming in the next release. io_uring does support buffered writes on any file type, but since the buffered write path just always -EAGAIN (or -EOPNOTSUPP) any attempt to do so if IOCB_NOWAIT is set, any buffered write will effectively be handled by io-wq offload. This isn't very efficient, and we even have specific code in io-wq to serialize buffered writes to the same inode to avoid further inefficiencies with thread offload. This is particularly sad since most buffered writes don't block, they simply copy data to a page and dirty it. With this pull request, we can handle buffered writes a lot more effiently. If balance_dirty_pages() needs to block, we back off on writes as indicated. This improves buffered write support by 2-3x. Jan Kara helped with the mm bits for this, and Stefan handled the fs/iomap/xfs/io_uring parts of it" * tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block: mm: honor FGP_NOWAIT for page cache page allocation xfs: Add async buffered write support xfs: Specify lockmode when calling xfs_ilock_for_iomap() io_uring: Add tracepoint for short writes io_uring: fix issue with io_write() not always undoing sb_start_write() io_uring: Add support for async buffered writes fs: Add async write file modification handling. fs: Split off inode_needs_update_time and __file_update_time fs: add __remove_file_privs() with flags parameter fs: add a FMODE_BUF_WASYNC flags for f_mode iomap: Return -EAGAIN from iomap_write_iter() iomap: Add async buffered write support iomap: Add flags parameter to iomap_page_create() mm: Add balance_dirty_pages_ratelimited_flags() function mm: Move updates of dirty_exceeded into one place mm: Move starting of background writeback into the main balancing loop
Diffstat (limited to 'fs')
-rw-r--r--fs/inode.c168
-rw-r--r--fs/iomap/buffered-io.c67
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/xfs/xfs_file.c11
-rw-r--r--fs/xfs/xfs_iomap.c11
5 files changed, 191 insertions, 70 deletions
diff --git a/fs/inode.c b/fs/inode.c
index bd4da9c5207e..259ebf438893 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns,
return notify_change(mnt_userns, dentry, &newattrs, NULL);
}
-/*
- * Remove special file priviledges (suid, capabilities) when file is written
- * to or truncated.
- */
-int file_remove_privs(struct file *file)
+static int __file_remove_privs(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
+ int error;
int kill;
- int error = 0;
- /*
- * Fast path for nothing security related.
- * As well for non-regular files, e.g. blkdev inodes.
- * For example, blkdev_write_iter() might get here
- * trying to remove privs which it is not allowed to.
- */
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
- if (kill < 0)
+ if (kill <= 0)
return kill;
- if (kill)
- error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
+
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (!error)
inode_has_no_xattr(inode);
return error;
}
-EXPORT_SYMBOL(file_remove_privs);
/**
- * file_update_time - update mtime and ctime time
- * @file: file accessed
+ * file_remove_privs - remove special file privileges (suid, capabilities)
+ * @file: file to remove privileges from
+ *
+ * When file is modified by a write or truncation ensure that special
+ * file privileges are removed.
*
- * Update the mtime and ctime members of an inode and mark the inode
- * for writeback. Note that this function is meant exclusively for
- * usage in the file write path of filesystems, and filesystems may
- * choose to explicitly ignore update via this function with the
- * S_NOCMTIME inode flag, e.g. for network filesystem where these
- * timestamps are handled by the server. This can return an error for
- * file systems who need to allocate space in order to update an inode.
+ * Return: 0 on success, negative errno on failure.
*/
+int file_remove_privs(struct file *file)
+{
+ return __file_remove_privs(file, 0);
+}
+EXPORT_SYMBOL(file_remove_privs);
-int file_update_time(struct file *file)
+static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
{
- struct inode *inode = file_inode(file);
- struct timespec64 now;
int sync_it = 0;
- int ret;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
+ if (!timespec64_equal(&inode->i_mtime, now))
sync_it = S_MTIME;
- if (!timespec64_equal(&inode->i_ctime, &now))
+ if (!timespec64_equal(&inode->i_ctime, now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2079,37 +2069,127 @@ int file_update_time(struct file *file)
if (!sync_it)
return 0;
- /* Finally allowed to write? Takes lock. */
- if (__mnt_want_write_file(file))
- return 0;
+ return sync_it;
+}
+
+static int __file_update_time(struct file *file, struct timespec64 *now,
+ int sync_mode)
+{
+ int ret = 0;
+ struct inode *inode = file_inode(file);
- ret = inode_update_time(inode, &now, sync_it);
- __mnt_drop_write_file(file);
+ /* try to update time settings */
+ if (!__mnt_want_write_file(file)) {
+ ret = inode_update_time(inode, now, sync_mode);
+ __mnt_drop_write_file(file);
+ }
return ret;
}
+
+/**
+ * file_update_time - update mtime and ctime time
+ * @file: file accessed
+ *
+ * Update the mtime and ctime members of an inode and mark the inode for
+ * writeback. Note that this function is meant exclusively for usage in
+ * the file write path of filesystems, and filesystems may choose to
+ * explicitly ignore updates via this function with the _NOCMTIME inode
+ * flag, e.g. for network filesystem where these imestamps are handled
+ * by the server. This can return an error for file systems who need to
+ * allocate space in order to update an inode.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int file_update_time(struct file *file)
+{
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
+
+ ret = inode_needs_update_time(inode, &now);
+ if (ret <= 0)
+ return ret;
+
+ return __file_update_time(file, &now, ret);
+}
EXPORT_SYMBOL(file_update_time);
-/* Caller must hold the file's inode lock */
-int file_modified(struct file *file)
+/**
+ * file_modified_flags - handle mandated vfs changes when modifying a file
+ * @file: file that was modified
+ * @flags: kiocb flags
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * If IOCB_NOWAIT is set, special file privileges will not be removed and
+ * time settings will not be updated. It will return -EAGAIN.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+static int file_modified_flags(struct file *file, int flags)
{
- int err;
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
/*
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
- err = file_remove_privs(file);
- if (err)
- return err;
+ ret = __file_remove_privs(file, flags);
+ if (ret)
+ return ret;
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
- return file_update_time(file);
+ ret = inode_needs_update_time(inode, &now);
+ if (ret <= 0)
+ return ret;
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ return __file_update_time(file, &now, ret);
+}
+
+/**
+ * file_modified - handle mandated vfs changes when modifying a file
+ * @file: file that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int file_modified(struct file *file)
+{
+ return file_modified_flags(file, 0);
}
EXPORT_SYMBOL(file_modified);
+/**
+ * kiocb_modified - handle mandated vfs changes when modifying a file
+ * @iocb: iocb that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kiocb_modified(struct kiocb *iocb)
+{
+ return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
+}
+EXPORT_SYMBOL_GPL(kiocb_modified);
+
int inode_needs_sync(struct inode *inode)
{
if (IS_SYNC(inode))
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d2a9f699e17e..c681eacc389b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
static struct bio_set iomap_ioend_bioset;
static struct iomap_page *
-iomap_page_create(struct inode *inode, struct folio *folio)
+iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
{
struct iomap_page *iop = to_iomap_page(folio);
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+ gfp_t gfp;
if (iop || nr_blocks <= 1)
return iop;
+ if (flags & IOMAP_NOWAIT)
+ gfp = GFP_NOWAIT;
+ else
+ gfp = GFP_NOFS | __GFP_NOFAIL;
+
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
- GFP_NOFS | __GFP_NOFAIL);
- spin_lock_init(&iop->uptodate_lock);
- if (folio_test_uptodate(folio))
- bitmap_fill(iop->uptodate, nr_blocks);
- folio_attach_private(folio, iop);
+ gfp);
+ if (iop) {
+ spin_lock_init(&iop->uptodate_lock);
+ if (folio_test_uptodate(folio))
+ bitmap_fill(iop->uptodate, nr_blocks);
+ folio_attach_private(folio, iop);
+ }
return iop;
}
@@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
- iop = iomap_page_create(iter->inode, folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
else
iop = to_iomap_page(folio);
@@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */
- iop = iomap_page_create(iter->inode, folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
@@ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct iomap_page *iop = iomap_page_create(iter->inode, folio);
+ struct iomap_page *iop;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
+ unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
@@ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0;
folio_clear_error(folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
+ if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
+ return -EAGAIN;
+
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
@@ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return -EIO;
folio_zero_segments(folio, poff, from, to, poff + plen);
} else {
- int status = iomap_read_folio_sync(block_start, folio,
+ int status;
+
+ if (iter->flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ status = iomap_read_folio_sync(block_start, folio,
poff, plen, srcmap);
if (status)
return status;
@@ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
+ if (iter->flags & IOMAP_NOWAIT)
+ fgp |= FGP_NOWAIT;
+
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
if (!folio) {
- status = -ENOMEM;
+ status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page;
}
if (pos + len > folio_pos(folio) + folio_size(folio))
@@ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t pos = iter->pos;
ssize_t written = 0;
long status = 0;
+ struct address_space *mapping = iter->inode->i_mapping;
+ unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do {
struct folio *folio;
@@ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
+ status = balance_dirty_pages_ratelimited_flags(mapping,
+ bdp_flags);
+ if (unlikely(status))
+ break;
+
if (bytes > length)
bytes = length;
@@ -760,6 +788,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
+ *
+ * For async buffered writes the assumption is that the user
+ * page has already been faulted in. This can be optimized by
+ * faulting the user page.
*/
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
@@ -771,7 +803,7 @@ again:
break;
page = folio_file_page(folio, pos >> PAGE_SHIFT);
- if (mapping_writably_mapped(iter->inode->i_mapping))
+ if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
@@ -796,10 +828,12 @@ again:
pos += status;
written += status;
length -= status;
-
- balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (iov_iter_count(i) && length);
+ if (status == -EAGAIN) {
+ iov_iter_revert(i, written);
+ return -EAGAIN;
+ }
return written ? written : status;
}
@@ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
};
int ret;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ iter.flags |= IOMAP_NOWAIT;
+
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
if (iter.pos == iocb->ki_pos)
@@ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos)
{
- struct iomap_page *iop = iomap_page_create(inode, folio);
+ struct iomap_page *iop = iomap_page_create(inode, folio, 0);
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio);
diff --git a/fs/read_write.c b/fs/read_write.c
index 397da0236607..0131d0df0476 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1663,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !((iocb->ki_flags & IOCB_DIRECT) ||
+ (file->f_mode & FMODE_BUF_WASYNC)))
return -EINVAL;
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5a171c0b244b..8d9b14d2b912 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -410,7 +410,7 @@ restart:
spin_unlock(&ip->i_flags_lock);
out:
- return file_modified(file);
+ return kiocb_modified(iocb);
}
static int
@@ -700,12 +700,11 @@ xfs_file_buffered_write(
bool cleared_space = false;
unsigned int iolock;
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
-
write_retry:
iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
@@ -1165,7 +1164,7 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5a393259a3a3..5d50fed291b4 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -664,7 +664,7 @@ xfs_ilock_for_iomap(
unsigned flags,
unsigned *lockmode)
{
- unsigned mode = XFS_ILOCK_SHARED;
+ unsigned int mode = *lockmode;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/*
@@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin(
bool eof = false, cow_eof = false, shared = false;
int allocfork = XFS_DATA_FORK;
int error = 0;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
@@ -1172,7 +1175,7 @@ xfs_read_iomap_begin(
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));