Diffstat (limited to 'fs/xfs/xfs_file.c')
 -rw-r--r--  fs/xfs/xfs_file.c | 1998
 1 file changed, 1396 insertions(+), 602 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8..6108612182e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -22,14 +10,11 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_error.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ioctl.h" @@ -39,60 +24,38 @@ #include "xfs_pnfs.h" #include "xfs_iomap.h" #include "xfs_reflink.h" +#include "xfs_file.h" +#include "xfs_aops.h" +#include "xfs_zone_alloc.h" +#include "xfs_error.h" +#include "xfs_errortag.h" -#include <linux/dcache.h> +#include <linux/dax.h> #include <linux/falloc.h> -#include <linux/pagevec.h> #include <linux/backing-dev.h> +#include <linux/mman.h> +#include <linux/fadvise.h> +#include <linux/mount.h> static const struct vm_operations_struct xfs_file_vm_ops; /* - * Clear the specified ranges to zero through either the pagecache or DAX. - * Holes and unwritten extents will be left as-is as they already are zeroed. + * Decide if the given file range is aligned to the size of the fundamental + * allocation unit for the file. 
*/ -int -xfs_zero_range( - struct xfs_inode *ip, - xfs_off_t pos, - xfs_off_t count, - bool *did_zero) -{ - return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); -} - -int -xfs_update_prealloc_flags( +bool +xfs_is_falloc_aligned( struct xfs_inode *ip, - enum xfs_prealloc_flags flags) + loff_t pos, + long long int len) { - struct xfs_trans *tp; - int error; + unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip); - error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, - 0, 0, 0, &tp); - if (error) - return error; + if (!is_power_of_2(alloc_unit)) + return isaligned_64(pos, alloc_unit) && + isaligned_64(len, alloc_unit); - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - if (!(flags & XFS_PREALLOC_INVISIBLE)) { - VFS_I(ip)->i_mode &= ~S_ISUID; - if (VFS_I(ip)->i_mode & S_IXGRP) - VFS_I(ip)->i_mode &= ~S_ISGID; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - - if (flags & XFS_PREALLOC_SET) - ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; - if (flags & XFS_PREALLOC_CLEAR) - ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flags & XFS_PREALLOC_SYNC) - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); + return !((pos | len) & (alloc_unit - 1)); } /* @@ -109,19 +72,52 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); + return xfs_log_force_inode(ip); +} + +/* + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number than holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. + * + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. 
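A quick illustration of the bit trick in xfs_is_falloc_aligned() above: for a power-of-two allocation unit, OR-ing pos and len before masking with (alloc_unit - 1) tests both values in one operation, because any low bit set in either operand survives the OR. A standalone sketch with illustrative names, not kernel code:

#include <stdbool.h>
#include <stdint.h>

/* Standalone sketch of the check in xfs_is_falloc_aligned() above. */
static bool falloc_aligned(uint64_t pos, uint64_t len, unsigned int alloc_unit)
{
	if (alloc_unit & (alloc_unit - 1))
		/* Not a power of two (e.g. some realtime extent sizes). */
		return pos % alloc_unit == 0 && len % alloc_unit == 0;

	/* Power of two: one mask test covers both values at once. */
	return !((pos | len) & (alloc_unit - 1));
}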
+ */ +static int +xfs_fsync_flush_log( + struct xfs_inode *ip, + bool datasync, + int *log_flushed) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); - if (!lsn) + if (!seq) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -131,12 +127,10 @@ xfs_file_fsync( loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(file->f_mapping->host); struct xfs_mount *mp = ip->i_mount; - int error = 0; + int error, err2; int log_flushed = 0; - xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -144,7 +138,7 @@ xfs_file_fsync( if (error) return error; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; xfs_iflags_clear(ip, XFS_ITRUNCATED); @@ -155,36 +149,20 @@ xfs_file_fsync( * ensure newly written file data make it to disk before logging the new * inode size in case of an extending write. */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) + error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); + error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * All metadata updates are logged, which means that we just have to - * flush the log up to the latest LSN that touched the inode. If we have - * concurrent fsync/fdatasync() calls, we need them to all block on the - * log force before we clear the ili_fsync_fields field. This ensures - * that we don't get a racing sync operation that does not wait for the - * metadata to hit the journal before returning. If we race with - * clearing the ili_fsync_fields, then all that will happen is the log - * force will do nothing as the lsn will already be on disk. We can't - * race with setting ili_fsync_fields because that is done under - * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared - * until after the ili_fsync_fields is cleared. + * If the inode has a inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking. */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - if (!datasync || - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = ip->i_itemp->ili_last_lsn; - } - - if (lsn) { - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); - ip->i_itemp->ili_fsync_fields = 0; + if (ip->i_itemp) { + err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); + if (err2 && !error) + error = err2; } - xfs_iunlock(ip, XFS_ILOCK_SHARED); /* * If we only have a single device, and the log force about was @@ -194,30 +172,77 @@ xfs_file_fsync( * commit. 
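For context, the userspace side of the datasync split above: fdatasync(2) can be satisfied by the potentially older ili_datasync_seq because it is allowed to skip timestamp-only inode updates, while fsync(2) must force the log up to ili_commit_seq. A minimal usage sketch:

#include <stdbool.h>
#include <unistd.h>

/* Make a completed write durable; datasync may skip timestamp-only updates. */
static int make_durable(int fd, bool datasync)
{
	return datasync ? fdatasync(fd) : fsync(fd);
}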
*/ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && - mp->m_logdev_targp == mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); + mp->m_logdev_targp == mp->m_ddev_targp) { + err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + if (err2 && !error) + error = err2; + } return error; } +static int +xfs_ilock_iocb( + struct kiocb *iocb, + unsigned int lock_mode) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, lock_mode)) + return -EAGAIN; + } else { + xfs_ilock(ip, lock_mode); + } + + return 0; +} + +static int +xfs_ilock_iocb_for_write( + struct kiocb *iocb, + unsigned int *lock_mode) +{ + ssize_t ret; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + + ret = xfs_ilock_iocb(iocb, *lock_mode); + if (ret) + return ret; + + /* + * If a reflink remap is in progress we always need to take the iolock + * exclusively to wait for it to finish. + */ + if (*lock_mode == XFS_IOLOCK_SHARED && + xfs_iflags_test(ip, XFS_IREMAPPING)) { + xfs_iunlock(ip, *lock_mode); + *lock_mode = XFS_IOLOCK_EXCL; + return xfs_ilock_iocb(iocb, *lock_mode); + } + + return 0; +} + STATIC ssize_t -xfs_file_dio_aio_read( +xfs_file_dio_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); - size_t count = iov_iter_count(to); ssize_t ret; - trace_xfs_file_direct_read(ip, count, iocb->ki_pos); + trace_xfs_file_direct_read(iocb, to); - if (!count) + if (!iov_iter_count(to)) return 0; /* skip atime */ file_accessed(iocb->ki_filp); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -229,20 +254,17 @@ xfs_file_dax_read( struct iov_iter *to) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); - size_t count = iov_iter_count(to); ssize_t ret = 0; - trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + trace_xfs_file_dax_read(iocb, to); - if (!count) + if (!iov_iter_count(to)) return 0; /* skip atime */ - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; + ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); @@ -250,16 +272,18 @@ xfs_file_dax_read( } STATIC ssize_t -xfs_file_buffered_aio_read( +xfs_file_buffered_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); ssize_t ret; - trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + trace_xfs_file_buffered_read(iocb, to); - xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; ret = generic_file_read_iter(iocb, to); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -277,72 +301,155 @@ xfs_file_read_iter( XFS_STATS_INC(mp, xs_read_calls); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (IS_DAX(inode)) ret = xfs_file_dax_read(iocb, to); else if (iocb->ki_flags & IOCB_DIRECT) - ret = xfs_file_dio_aio_read(iocb, to); + ret = xfs_file_dio_read(iocb, to); else - ret = xfs_file_buffered_aio_read(iocb, to); + ret = xfs_file_buffered_read(iocb, to); if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, 
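xfs_ilock_iocb() above wraps a pattern worth noting on its own: when the caller requested non-blocking I/O (RWF_NOWAIT from userspace, surfacing as IOCB_NOWAIT), take the lock with a trylock and fail with -EAGAIN instead of sleeping. A hedged sketch of the same shape with a plain rwsem, outside XFS:

#include <linux/fs.h>
#include <linux/rwsem.h>

/* Illustrative only: honour IOCB_NOWAIT with a trylock on a shared lock. */
static int lock_shared_for_iocb(struct kiocb *iocb, struct rw_semaphore *sem)
{
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!down_read_trylock(sem))
			return -EAGAIN;	/* caller retries from a blocking context */
		return 0;
	}
	down_read(sem);
	return 0;
}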
ret); return ret; } +STATIC ssize_t +xfs_file_splice_read( + struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (xfs_is_shutdown(mp)) + return -EIO; + + trace_xfs_file_splice_read(ip, *ppos, len); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = filemap_splice_read(in, ppos, pipe, len, flags); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (ret > 0) + XFS_STATS_ADD(mp, xs_read_bytes, ret); + return ret; +} + /* - * Zero any on disk space between the current EOF and the new, larger EOF. - * - * This handles the normal case of zeroing the remainder of the last block in - * the file and the unusual case of zeroing blocks out beyond the size of the - * file. This second case only happens with fixed size extents and when the - * system crashes before the inode size was updated but after blocks were - * allocated. + * Take care of zeroing post-EOF blocks when they might exist. * - * Expects the iolock to be held exclusive, and will take the ilock internally. + * Returns 0 if successfully, a negative error for a failure, or 1 if this + * function dropped the iolock and reacquired it exclusively and the caller + * needs to restart the write sanity checks. */ -int /* error (positive) */ -xfs_zero_eof( - struct xfs_inode *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize, /* current inode size */ - bool *did_zeroing) +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio, + struct xfs_zone_alloc_ctx *ac) { - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(offset > isize); + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + int error; + + /* + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + */ + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; - trace_xfs_zero_eof(ip, isize, offset - isize); - return xfs_zero_range(ip, isize, offset - isize, did_zeroing); + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. 
+ */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + + return error; } /* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to + * Called with the iolock held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t -xfs_file_aio_write_checks( +xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - int *iolock) + unsigned int *iolock, + struct xfs_zone_alloc_ctx *ac) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - ssize_t error = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); bool drained_dio = false; + ssize_t error; restart: error = generic_write_checks(iocb, from); if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + if (iocb->ki_flags & IOCB_NOWAIT) { + error = break_layout(inode, false); + if (error == -EWOULDBLOCK) + error = -EAGAIN; + } else { + error = xfs_break_layouts(inode, iolock, BREAK_WRITE); + } + if (error) return error; @@ -351,97 +458,137 @@ restart: * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_iunlock(ip, *iolock); + xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); + error = xfs_ilock_iocb(iocb, *iolock); + if (error) { + *iolock = 0; + return error; + } goto restart; } + /* - * If the offset is beyond the size of the file, we need to zero any + * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which implies - * having to redo all checks before. + * write. * - * We need to serialise against EOF updates that occur in IO - * completions here. We want to make sure that nobody is changing the - * size while we do this check until we have placed an IO barrier (i.e. - * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. - * The spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value - * and hence be able to correctly determine if we need to run zeroing. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. Hence we only need to take + * the slow path when we are at or beyond the current EOF. */ - spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { - bool zero = false; - - spin_unlock(&ip->i_flags_lock); - if (!drained_dio) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. 
- */ - inode_dio_wait(inode); - drained_dio = true; + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio, ac); + if (error == 1) goto restart; - } - error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); if (error) return error; - } else - spin_unlock(&ip->i_flags_lock); + } + + return kiocb_modified(iocb); +} + +static ssize_t +xfs_zoned_write_space_reserve( + struct xfs_mount *mp, + struct kiocb *iocb, + struct iov_iter *from, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + loff_t count = iov_iter_count(from); + int error; + + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= XFS_ZR_NOWAIT; /* - * Updating the timestamps will grab the ilock again from - * xfs_fs_dirty_inode, so we have to call it after dropping the - * lock above. Eventually we should look into a way to avoid - * the pointless lock roundtrip. + * Check the rlimit and LFS boundary first so that we don't over-reserve + * by possibly a lot. + * + * The generic write path will redo this check later, and it might have + * changed by then. If it got expanded we'll stick to our earlier + * smaller limit, and if it is decreased the new smaller limit will be + * used and our extra space reservation will be returned after finishing + * the write. */ - if (likely(!(file->f_mode & FMODE_NOCMTIME))) { - error = file_update_time(file); - if (error) - return error; - } + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); + if (error) + return error; /* - * If we're writing the file then make sure to clear the setuid and - * setgid bits if the process is not being run by root. This keeps - * people from modifying setuid and setgid binaries. + * Sloppily round up count to file system blocks. + * + * This will often reserve an extra block, but that avoids having to look + * at the start offset, which isn't stable for O_APPEND until taking the + * iolock. Also we need to reserve a block each for zeroing the old + * EOF block and the new start block if they are unaligned. + * + * Any remaining block will be returned after the write. */ - if (!IS_NOSEC(inode)) - return file_remove_privs(file); - return 0; + return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2, + flags, ac); } static int xfs_dio_write_end_io( struct kiocb *iocb, ssize_t size, + int error, unsigned flags) { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; - bool update_size = false; - int error = 0; + unsigned int nofs_flag; + + ASSERT(!xfs_is_zoned_inode(ip) || + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); trace_xfs_end_io_direct_write(ip, offset, size); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; - if (size <= 0) - return size; + if (error) + return error; + if (!size) + return 0; + + /* + * Capture amount written on completion as we can't reliably account + * for it on submission. + */ + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + + /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. 
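The reservation sizing in xfs_zoned_write_space_reserve() above is deliberately generous: round the byte count up to whole blocks, then add one block of slack for the not-yet-stable start offset plus two blocks for zeroing the old EOF block and the new start block. As plain arithmetic, in a hypothetical helper:

#include <stdint.h>

/* Hypothetical helper mirroring the sizing logic described above. */
static uint64_t zoned_reserve_blocks(uint64_t count, uint32_t blocksize)
{
	uint64_t blocks = (count + blocksize - 1) / blocksize;	/* round up */

	return blocks + 1 + 2;	/* offset slack + old-EOF and new-start zeroing */
}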
+ */ + nofs_flag = memalloc_nofs_save(); + + if (flags & IOMAP_DIO_COW) { + if (iocb->ki_flags & IOCB_ATOMIC) + error = xfs_reflink_end_atomic_cow(ip, offset, size); + else + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + goto out; + } + + /* + * Unwritten conversion updates the in-core isize after extent + * conversion but before updating the on-disk size. Updating isize any + * earlier allows a racing dio read to find unwritten extents before + * they are converted. + */ + if (flags & IOMAP_DIO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, offset, size, true); + goto out; + } /* * We need to update the in-core inode size here so that we don't end up @@ -453,140 +600,317 @@ xfs_dio_write_end_io( * other IO completions here to update the EOF. Failing to serialise * here can result in EOF moving backwards and Bad Things Happen when * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. */ + if (offset + size <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size); - update_size = true; + spin_unlock(&ip->i_flags_lock); + error = xfs_setfilesize(ip, offset, size); + } else { + spin_unlock(&ip->i_flags_lock); } - spin_unlock(&ip->i_flags_lock); - if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); - if (error) - return error; +out: + memalloc_nofs_restore(nofs_flag); + return error; +} + +static const struct iomap_dio_ops xfs_dio_write_ops = { + .end_io = xfs_dio_write_end_io, +}; + +static void +xfs_dio_zoned_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; + struct xfs_zone_alloc_ctx *ac = iter->private; + xfs_filblks_t count_fsb; + struct iomap_ioend *ioend; + + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); + if (count_fsb > ac->reserved_blocks) { + xfs_err(mp, +"allocation (%lld) larger than reservation (%lld).", + count_fsb, ac->reserved_blocks); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + bio_io_error(bio); + return; } + ac->reserved_blocks -= count_fsb; - if (flags & IOMAP_DIO_UNWRITTEN) - error = xfs_iomap_write_unwritten(ip, offset, size); - else if (update_size) - error = xfs_setfilesize(ip, offset, size); + bio->bi_end_io = xfs_end_bio; + ioend = iomap_init_ioend(iter->inode, bio, file_offset, + IOMAP_IOEND_DIRECT); + xfs_zone_alloc_and_submit(ioend, &ac->open_zone); +} - return error; +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { + .bio_set = &iomap_ioend_bioset, + .submit_io = xfs_dio_zoned_submit_io, + .end_io = xfs_dio_write_end_io, +}; + +/* + * Handle block aligned direct I/O writes. 
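The i_size update in xfs_dio_write_end_io() above leans on the invariant that I/O completion only ever moves EOF forward, so an unlocked read can filter the common within-EOF case and only an actual extension needs the spinlock. The bare pattern, as a sketch (the real function also calls xfs_setfilesize() under the same decision):

#include <linux/fs.h>
#include <linux/spinlock.h>

/* Sketch: cheap unlocked filter, authoritative re-check under the lock. */
static void maybe_extend_isize(struct inode *inode, spinlock_t *lock,
			       loff_t new_end)
{
	if (new_end <= i_size_read(inode))
		return;				/* common case: no lock taken */

	spin_lock(lock);
	if (new_end > i_size_read(inode))	/* EOF may have moved meanwhile */
		i_size_write(inode, new_end);
	spin_unlock(lock);
}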
+ */ +static noinline ssize_t +xfs_file_dio_write_aligned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from, + const struct iomap_ops *ops, + const struct iomap_dio_ops *dops, + struct xfs_zone_alloc_ctx *ac) +{ + unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; + ssize_t ret; + + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + ret = xfs_file_write_checks(iocb, from, &iolock, ac); + if (ret) + goto out_unlock; + + /* + * We don't need to hold the IOLOCK exclusively across the IO, so demote + * the iolock back to shared if we had to take the exclusive lock in + * xfs_file_write_checks() for other reasons. + */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); +out_unlock: + xfs_iunlock(ip, iolock); + return ret; } /* - * xfs_file_dio_aio_write - handle direct IO writes - * - * Lock the inode appropriately to prepare for and issue a direct IO write. - * By separating it from the buffered write path we remove all the tricky to - * follow locking changes and looping. - * - * If there are cached pages or we're extending the file, we need IOLOCK_EXCL - * until we're sure the bytes at the new EOF have been zeroed and/or the cached - * pages are flushed out. + * Handle block aligned direct I/O writes to zoned devices. + */ +static noinline ssize_t +xfs_file_dio_write_zoned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac); + if (ret < 0) + return ret; + ret = xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_zoned_direct_write_iomap_ops, + &xfs_dio_zoned_write_ops, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); + return ret; +} + +/* + * Handle block atomic writes * - * In most cases the direct IO writes will be done holding IOLOCK_SHARED - * allowing them to be done in parallel with reads and other direct IO writes. - * However, if the IO is not aligned to filesystem blocks, the direct IO layer - * needs to do sub-block zeroing and that requires serialisation against other - * direct IOs to the same block. In this case we need to serialise the - * submission of the unaligned IOs so that we don't get racing block zeroing in - * the dio layer. To avoid the problem with aio, we also need to wait for - * outstanding IOs to complete so that unwritten extent conversion is completed - * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. inode_dio_wait()). + * Two methods of atomic writes are supported: + * - REQ_ATOMIC-based, which would typically use some form of HW offload in the + * disk + * - COW-based, which uses a COW fork as a staging extent for data updates + * before atomically updating extent mappings for the range being written * - * Returns with locks held indicated by @iolock and errors indicated by - * negative return values. 
*/ -STATIC ssize_t -xfs_file_dio_aio_write( +static noinline ssize_t +xfs_file_dio_write_atomic( + struct xfs_inode *ip, struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int unaligned_io = 0; - int iolock; - size_t count = iov_iter_count(from); - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) - return -EINVAL; + unsigned int iolock = XFS_IOLOCK_SHARED; + ssize_t ret, ocount = iov_iter_count(from); + const struct iomap_ops *dops; /* - * Don't take the exclusive iolock here unless the I/O is unaligned to - * the file system block size. We don't need to consider the EOF - * extension case here because xfs_file_aio_write_checks() will relock - * the inode as necessary for EOF zeroing cases and fill out the new - * inode size as appropriate. + * HW offload should be faster, so try that first if it is already + * known that the write length is not too large. */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) { - unaligned_io = 1; + if (ocount > xfs_inode_buftarg(ip)->bt_awu_max) + dops = &xfs_atomic_write_cow_iomap_ops; + else + dops = &xfs_direct_write_iomap_ops; - /* - * We can't properly handle unaligned direct I/O to reflink - * files yet, as we can't unshare a partial block. - */ - if (xfs_is_reflink_inode(ip)) { - trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); - return -EREMCHG; - } - iolock = XFS_IOLOCK_EXCL; - } else { +retry: + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out_unlock; + + /* Demote similar to xfs_file_dio_write_aligned() */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } - if (!xfs_ilock_nowait(ip, iolock)) { + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, + 0, NULL, 0); + + /* + * The retry mechanism is based on the ->iomap_begin method returning + * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not + * possible. The REQ_ATOMIC-based method typically not be possible if + * the write spans multiple extents or the disk blocks are misaligned. + */ + if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { + xfs_iunlock(ip, iolock); + dops = &xfs_atomic_write_cow_iomap_ops; + goto retry; + } + +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); + return ret; +} + +/* + * Handle block unaligned direct I/O writes + * + * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing + * them to be done in parallel with reads and other direct I/O writes. However, + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need + * to do sub-block zeroing and that requires serialisation against other direct + * I/O to the same block. In this case we need to serialise the submission of + * the unaligned I/O so that we don't get racing block zeroing in the dio layer. + * In the case where sub-block zeroing is not required, we can do concurrent + * sub-block dios to the same block successfully. 
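For reference, the writes that reach xfs_file_dio_write_atomic() above are issued from userspace with pwritev2(2) and RWF_ATOMIC on an O_DIRECT file descriptor, with the length constrained by the atomic write limits advertised through statx(2). A minimal sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>

/* One all-or-nothing write; len must respect the statx atomic write limits. */
static ssize_t atomic_pwrite(int fd, const void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}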
+ * + * Optimistically submit the I/O using the shared lock first, but use the + * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN + * if block allocation or partial block zeroing would be required. In that case + * we try again with the exclusive lock. + */ +static noinline ssize_t +xfs_file_dio_write_unaligned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + size_t isize = i_size_read(VFS_I(ip)); + size_t count = iov_iter_count(from); + unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; + ssize_t ret; + + /* + * Extending writes need exclusivity because of the sub-block zeroing + * that the DIO code always does for partial tail blocks beyond EOF, so + * don't even bother trying the fast path in this case. + */ + if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; - xfs_ilock(ip, iolock); +retry_exclusive: + iolock = XFS_IOLOCK_EXCL; + flags = IOMAP_DIO_FORCE_WAIT; } - ret = xfs_file_aio_write_checks(iocb, from, &iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) - goto out; - count = iov_iter_count(from); + return ret; /* - * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to take the exclusive lock - * for other reasons in xfs_file_aio_write_checks. + * We can't properly handle unaligned direct I/O to reflink files yet, + * as we can't unshare a partial block. */ - if (unaligned_io) { - /* If we are going to wait for other DIO to finish, bail */ - if (iocb->ki_flags & IOCB_NOWAIT) { - if (atomic_read(&inode->i_dio_count)) - return -EAGAIN; - } else { - inode_dio_wait(inode); - } - } else if (iolock == XFS_IOLOCK_EXCL) { - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + if (xfs_is_cow_inode(ip)) { + trace_xfs_reflink_bounce_dio_write(iocb, from); + ret = -ENOTBLK; + goto out_unlock; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); -out: - xfs_iunlock(ip, iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out_unlock; + + /* + * If we are doing exclusive unaligned I/O, this must be the only I/O + * in-flight. Otherwise we risk data corruption due to unwritten extent + * conversions from the AIO end_io handler. Wait for all other I/O to + * drain first. + */ + if (flags & IOMAP_DIO_FORCE_WAIT) + inode_dio_wait(VFS_I(ip)); + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, + &xfs_dio_write_ops, flags, NULL, 0); /* - * No fallback to buffered IO on errors for XFS, direct IO will either - * complete fully or fail. + * Retry unaligned I/O with exclusive blocking semantics if the DIO + * layer rejected it for mapping or locking reasons. If we are doing + * nonblocking user I/O, propagate the error. 
*/ - ASSERT(ret < 0 || ret == count); + if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) { + ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY); + xfs_iunlock(ip, iolock); + goto retry_exclusive; + } + +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); return ret; } +static ssize_t +xfs_file_dio_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + size_t count = iov_iter_count(from); + + /* direct I/O must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) + return -EINVAL; + + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + return xfs_file_dio_write_unaligned(ip, iocb, from); + if (xfs_is_zoned_inode(ip)) + return xfs_file_dio_write_zoned(ip, iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) + return xfs_file_dio_write_atomic(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); +} + static noinline ssize_t xfs_file_dax_write( struct kiocb *iocb, @@ -594,63 +918,65 @@ xfs_file_dax_write( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - int iolock = XFS_IOLOCK_EXCL; + unsigned int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; - size_t count; loff_t pos; - if (!xfs_ilock_nowait(ip, iolock)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - xfs_ilock(ip, iolock); - } - - ret = xfs_file_aio_write_checks(iocb, from, &iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; pos = iocb->ki_pos; - count = iov_iter_count(from); - trace_xfs_file_dax_write(ip, count, pos); - ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); + trace_xfs_file_dax_write(iocb, from); + ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } out: - xfs_iunlock(ip, iolock); - return error ? 
error : ret; + if (iolock) + xfs_iunlock(ip, iolock); + if (error) + return error; + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } + return ret; } STATIC ssize_t -xfs_file_buffered_aio_write( +xfs_file_buffered_write( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int enospc = 0; - int iolock; + bool cleared_space = false; + unsigned int iolock; write_retry: iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; - ret = xfs_file_aio_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - - trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); - ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); - if (likely(ret >= 0)) - iocb->ki_pos += ret; + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + NULL); /* * If we hit a space limit, try to free up some lingering preallocated @@ -659,34 +985,97 @@ write_retry: * metadata space. This reduces the chances that the eofblocks scan * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this * also behaves as a filter to prevent too many eofblocks scans from - * running at the same time. + * running at the same time. Use a synchronous scan to increase the + * effectiveness of the scan. 
*/ - if (ret == -EDQUOT && !enospc) { + if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - enospc = xfs_inode_free_quota_eofblocks(ip); - if (enospc) - goto write_retry; - enospc = xfs_inode_free_quota_cowblocks(ip); - if (enospc) - goto write_retry; - iolock = 0; - } else if (ret == -ENOSPC && !enospc) { - struct xfs_eofblocks eofb = {0}; - - enospc = 1; + xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC); + cleared_space = true; + goto write_retry; + } else if (ret == -ENOSPC && !cleared_space) { + struct xfs_icwalk icw = {0}; + + cleared_space = true; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); - eofb.eof_flags = XFS_EOF_FLAGS_SYNC; - xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); + icw.icw_flags = XFS_ICWALK_FLAG_SYNC; + xfs_blockgc_free_space(ip->i_mount, &icw); goto write_retry; } - current->backing_dev_info = NULL; out: if (iolock) xfs_iunlock(ip, iolock); + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } + return ret; +} + +STATIC ssize_t +xfs_file_buffered_write_zoned( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + unsigned int iolock = XFS_IOLOCK_EXCL; + bool cleared_space = false; + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac); + if (ret < 0) + return ret; + + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + goto out_unreserve; + + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); + if (ret) + goto out_unlock; + + /* + * Truncate the iter to the length that we were actually able to + * allocate blocks for. This needs to happen after + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND + * writes. + */ + iov_iter_truncate(from, + XFS_FSB_TO_B(mp, ac.reserved_blocks) - + (iocb->ki_pos & mp->m_blockmask)); + if (!iov_iter_count(from)) + goto out_unlock; + +retry: + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + &ac); + if (ret == -ENOSPC && !cleared_space) { + /* + * Kick off writeback to convert delalloc space and release the + * usually too pessimistic indirect block reservations. 
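The iov_iter_truncate() call in xfs_file_buffered_write_zoned() above caps the write at what the space reservation can actually back: the reserved blocks converted to bytes, minus the unaligned head of the first block. As a hypothetical helper, assuming a power-of-two block size as i_blocksize() guarantees:

#include <stdint.h>

/* Hypothetical: bytes writable from pos given a block reservation. */
static uint64_t writable_bytes(uint64_t reserved_blocks, uint32_t blocksize,
			       uint64_t pos)
{
	return reserved_blocks * blocksize - (pos & (blocksize - 1));
}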
+ */ + xfs_flush_inodes(mp); + cleared_space = true; + goto retry; + } + +out_unlock: + xfs_iunlock(ip, iolock); +out_unreserve: + xfs_zoned_space_unreserve(ip->i_mount, &ac); + if (ret > 0) { + XFS_STATS_ADD(mp, xs_write_bytes, ret); + ret = generic_write_sync(iocb, ret); + } return ret; } @@ -695,9 +1084,7 @@ xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); @@ -707,200 +1094,471 @@ xfs_file_write_iter( if (ocount == 0) return 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; + if (iocb->ki_flags & IOCB_ATOMIC) { + if (ocount < xfs_get_atomic_write_min(ip)) + return -EINVAL; + + if (ocount > xfs_get_atomic_write_max(ip)) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (IS_DAX(inode)) - ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) { + return xfs_file_dax_write(iocb, from); + + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink * CoW. In all other directio scenarios we do not * allow an operation to fall back to buffered mode. */ - ret = xfs_file_dio_aio_write(iocb, from); - if (ret == -EREMCHG) - goto buffered; - } else { -buffered: - ret = xfs_file_buffered_aio_write(iocb, from); + ret = xfs_file_dio_write(iocb, from); + if (ret != -ENOTBLK) + return ret; } - if (ret > 0) { - XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); - - /* Handle various SYNC-type writes */ - ret = generic_write_sync(iocb, ret); - } - return ret; + if (xfs_is_zoned_inode(ip)) + return xfs_file_buffered_write_zoned(iocb, from); + return xfs_file_buffered_write(iocb, from); } -#define XFS_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) +/* Does this file, inode, or mount want synchronous writes? 
*/ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); -STATIC long -xfs_file_fallocate( + if (xfs_has_wsync(ip->i_mount)) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + +static int +xfs_falloc_newsize( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + loff_t *new_size) { struct inode *inode = file_inode(file); - struct xfs_inode *ip = XFS_I(inode); - long error; - enum xfs_prealloc_flags flags = 0; - uint iolock = XFS_IOLOCK_EXCL; - loff_t new_size = 0; - bool do_file_insert = 0; - if (!S_ISREG(inode->i_mode)) + if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) + return 0; + *new_size = offset + len; + return inode_newsize_ok(inode, *new_size); +} + +static int +xfs_falloc_setsize( + struct file *file, + loff_t new_size) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = new_size, + }; + + if (!new_size) + return 0; + return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), + &iattr); +} + +static int +xfs_falloc_collapse_range( + struct file *file, + loff_t offset, + loff_t len, + struct xfs_zone_alloc_ctx *ac) +{ + struct inode *inode = file_inode(file); + loff_t new_size = i_size_read(inode) - len; + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) return -EINVAL; - if (mode & ~XFS_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + /* + * There is no need to overlap collapse range with EOF, in which case it + * is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); if (error) - goto out_unlock; + return error; + return xfs_falloc_setsize(file, new_size); +} - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; +static int +xfs_falloc_insert_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t isize = i_size_read(inode); + int error; - if (mode & FALLOC_FL_PUNCH_HOLE) { - error = xfs_free_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; - if (offset & blksize_mask || len & blksize_mask) { - error = -EINVAL; - goto out_unlock; - } + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. 
+ */ + if (inode->i_sb->s_maxbytes - isize < len) + return -EFBIG; - /* - * There is no need to overlap collapse range with EOF, - * in which case it is effectively a truncate operation - */ - if (offset + len >= i_size_read(inode)) { - error = -EINVAL; - goto out_unlock; - } + /* Offset should be less than i_size */ + if (offset >= isize) + return -EINVAL; - new_size = i_size_read(inode) - len; + error = xfs_falloc_setsize(file, isize + len); + if (error) + return error; - error = xfs_collapse_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; - - new_size = i_size_read(inode) + len; - if (offset & blksize_mask || len & blksize_mask) { - error = -EINVAL; - goto out_unlock; - } + /* + * Perform hole insertion now that the file size has been updated so + * that if we crash during the operation we don't leave shifted extents + * past EOF and hence losing access to the data that is contained within + * them. + */ + return xfs_insert_file_space(XFS_I(inode), offset, len); +} - /* check the new inode size does not wrap through zero */ - if (new_size > inode->i_sb->s_maxbytes) { - error = -EFBIG; - goto out_unlock; - } +/* + * Punch a hole and prealloc the range. We use a hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by + * virtue of the hole punch. + */ +static int +xfs_falloc_zero_range( + struct file *file, + int mode, + loff_t offset, + loff_t len, + struct xfs_zone_alloc_ctx *ac) +{ + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + unsigned int blksize = i_blocksize(inode); + loff_t new_size = 0; + int error; - /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { - error = -EINVAL; - goto out_unlock; - } - do_file_insert = 1; - } else { - flags |= XFS_PREALLOC_SET; - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } + trace_xfs_zero_file_space(ip); - if (mode & FALLOC_FL_ZERO_RANGE) - error = xfs_zero_file_space(ip, offset, len); - else { - if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; - } - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); - } + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + /* + * Zero range implements a full zeroing mechanism but is only used in + * limited situations. It is more efficient to allocate unwritten + * extents than to perform zeroing here, so use an errortag to randomly + * force zeroing on DEBUG kernels for added test coverage. 
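Note the shape of the -EFBIG check in xfs_falloc_insert_range() above: it is written as a subtraction so the sum isize + len is never formed and therefore can never overflow before the comparison. In isolation:

#include <stdbool.h>
#include <stdint.h>

/* Overflow-proof form of "isize + len > maxbytes" (illustrative helper). */
static bool insert_would_exceed(int64_t isize, int64_t len, int64_t maxbytes)
{
	/* isize <= maxbytes is an invariant, so this subtraction cannot wrap */
	return maxbytes - isize < len;
}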
+ */ + if (XFS_TEST_ERROR(ip->i_mount, + XFS_ERRTAG_FORCE_ZERO_RANGE)) { + error = xfs_zero_range(ip, offset, len, ac, NULL); + } else { + error = xfs_free_file_space(ip, offset, len, ac); if (error) - goto out_unlock; + return error; + + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len); } + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_unshare_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; - if (file->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; - error = xfs_update_prealloc_flags(ip, flags); + error = xfs_reflink_unshare(XFS_I(inode), offset, len); if (error) - goto out_unlock; + return error; - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = xfs_vn_setattr_size(file_dentry(file), &iattr); - if (error) - goto out_unlock; - } +static int +xfs_falloc_allocate_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; /* - * Perform hole insertion now that the file size has been - * updated so that if we crash during the operation we don't - * leave shifted extents past EOF and hence losing access to - * the data that is contained within them. + * If always_cow mode we can't use preallocations and thus should not + * create them. */ - if (do_file_insert) - error = xfs_insert_file_space(ip, offset, len); + if (xfs_is_always_cow_inode(XFS_I(inode))) + return -EOPNOTSUPP; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ + FALLOC_FL_UNSHARE_RANGE) + +STATIC long +__xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len, + struct xfs_zone_alloc_ctx *ac) +{ + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + long error; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); + if (error) + goto out_unlock; + + /* + * Must wait for all AIO to complete before we continue as AIO can + * change the file size on completion without holding any locks we + * currently hold. We must do this first because AIO can update both + * the on disk and in memory inode sizes, and the operations that follow + * require the in-memory size to be fully up-to-date. 
+ */ + inode_dio_wait(inode); + + error = file_modified(file); + if (error) + goto out_unlock; + + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: + error = xfs_free_file_space(ip, offset, len, ac); + break; + case FALLOC_FL_COLLAPSE_RANGE: + error = xfs_falloc_collapse_range(file, offset, len, ac); + break; + case FALLOC_FL_INSERT_RANGE: + error = xfs_falloc_insert_range(file, offset, len); + break; + case FALLOC_FL_ZERO_RANGE: + error = xfs_falloc_zero_range(file, mode, offset, len, ac); + break; + case FALLOC_FL_UNSHARE_RANGE: + error = xfs_falloc_unshare_range(file, mode, offset, len); + break; + case FALLOC_FL_ALLOCATE_RANGE: + error = xfs_falloc_allocate_range(file, mode, offset, len); + break; + default: + error = -EOPNOTSUPP; + break; + } + + if (!error && xfs_file_sync_writes(file)) + error = xfs_log_force_inode(ip); out_unlock: xfs_iunlock(ip, iolock); return error; } +static long +xfs_file_zoned_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct xfs_zone_alloc_ctx ac = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + int error; + + error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac); + if (error) + return error; + error = __xfs_file_fallocate(file, mode, offset, len, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); + return error; +} + +static long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* + * For zoned file systems, zeroing the first and last block of a hole + * punch requires allocating a new block to rewrite the remaining data + * and new zeroes out of place. Get a reservations for those before + * taking the iolock. Dip into the reserved pool because we are + * expected to be able to punch a hole even on a completely full + * file system. 
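Each case in the switch above corresponds to one fallocate(2) mode from userspace. For example, the hole punch that lands in xfs_free_file_space():

#define _GNU_SOURCE
#include <fcntl.h>

/* FALLOC_FL_PUNCH_HOLE must be paired with KEEP_SIZE, per fallocate(2). */
static int punch_hole(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, len);
}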
+ */ + if (xfs_is_zoned_inode(XFS_I(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE))) + return xfs_file_zoned_fallocate(file, mode, offset, len); + return __xfs_file_fallocate(file, mode, offset, len, NULL); +} + STATIC int -xfs_file_clone_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) +xfs_file_fadvise( + struct file *file, + loff_t start, + loff_t end, + int advice) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); + struct xfs_inode *ip = XFS_I(file_inode(file)); + int ret; + int lockflags = 0; + + /* + * Operations creating pages in page cache need protection from hole + * punching and similar ops + */ + if (advice == POSIX_FADV_WILLNEED) { + lockflags = XFS_IOLOCK_SHARED; + xfs_ilock(ip, lockflags); + } + ret = generic_fadvise(file, start, end, advice); + if (lockflags) + xfs_iunlock(ip, lockflags); + return ret; } -STATIC ssize_t -xfs_file_dedupe_range( - struct file *src_file, - u64 loff, - u64 len, - struct file *dst_file, - u64 dst_loff) +STATIC loff_t +xfs_file_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + loff_t len, + unsigned int remap_flags) { - int error; + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + loff_t remapped = 0; + xfs_extlen_t cowextsize; + int ret; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, - len, true); - if (error) - return error; - return len; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (!xfs_has_reflink(mp)) + return -EOPNOTSUPP; + + if (xfs_is_shutdown(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret || len == 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + + ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, + &remapped); + if (ret) + goto out_unlock; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (pos_in == 0 && len == i_size_read(inode_in) && + (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + pos_out == 0 && len >= i_size_read(inode_out) && + !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_cowextsize; + + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + remap_flags); + if (ret) + goto out_unlock; + + if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) + xfs_log_force_inode(dest); +out_unlock: + xfs_iunlock2_remapping(src, dest); + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + /* + * If the caller did not set CAN_SHORTEN, then it is not prepared to + * handle partial results -- either the whole remap succeeds, or we + * must say why it did not. In this case, any error should be returned + * to the caller. + */ + if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return ret; + return remapped > 0 ? 
 
 STATIC int
@@ -908,12 +1566,12 @@ xfs_file_open(
 struct inode *inode,
 struct file *file)
 {
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
- if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ if (xfs_is_shutdown(XFS_M(inode->i_sb)))
 return -EIO;
- file->f_mode |= FMODE_AIO_NOWAIT;
- return 0;
+ file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ return generic_file_open(inode, file);
 }
 
 STATIC int
@@ -922,10 +1580,12 @@ xfs_dir_open(
 struct file *file)
 {
 struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
 int error;
 
- error = xfs_file_open(inode, file);
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+ error = generic_file_open(inode, file);
 if (error)
 return error;
 
@@ -934,18 +1594,91 @@ xfs_dir_open(
 * certain to have the next operation be a read there.
 */
 mode = xfs_ilock_data_map_shared(ip);
- if (ip->i_d.di_nextents > 0)
- error = xfs_dir3_data_readahead(ip, 0, -1);
+ if (ip->i_df.if_nextents > 0)
+ error = xfs_dir3_data_readahead(ip, 0, 0);
 xfs_iunlock(ip, mode);
 return error;
 }
 
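xfs_file_open() above advertises FMODE_CAN_ATOMIC_WRITE when untorn writes are available. A hedged sketch of the userspace side using pwritev2(2) with RWF_ATOMIC (Linux 6.11+; the fallback RWF_ATOMIC definition is an assumption for older headers, atomic writes currently require direct I/O, and the write must respect the size/alignment limits reported by statx(2)):

/* Hypothetical demo: an all-or-nothing 4 KiB write. Either every
 * byte reaches the media or none does, even across a crash. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040 /* assumed value; see linux/fs.h */
#endif

int main(void)
{
	static char buf[4096] __attribute__((aligned(4096)));
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("data.bin", O_RDWR | O_CREAT | O_DIRECT, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 0xab, sizeof(buf));
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
		perror("pwritev2");
	close(fd);
	return 0;
}
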
+/*
+ * Don't bother propagating errors. We're just doing cleanup, and the caller
+ * ignores the return value anyway.
+ */
 STATIC int
 xfs_file_release(
- struct inode *inode,
- struct file *filp)
+ struct inode *inode,
+ struct file *file)
 {
- return xfs_release(XFS_I(inode));
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If this is a read-only mount or the file system has been shut down,
+ * don't generate I/O.
+ */
+ if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
+ return 0;
+
+ /*
+ * If we previously truncated this file and removed old data in the
+ * process, we want to initiate "early" writeout on the last close.
+ * This is an attempt to combat the notorious NULL files problem which
+ * is particularly noticeable from a truncate down, buffered (re-)write
+ * (delalloc), followed by a crash. What we are effectively doing here
+ * is significantly reducing the time window where we'd otherwise be
+ * exposed to that problem.
+ */
+ if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
+ xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
+ if (ip->i_delayed_blks > 0)
+ filemap_flush(inode->i_mapping);
+ }
+
+ /*
+ * XFS aggressively preallocates post-EOF space to generate contiguous
+ * allocations for writers that append to the end of the file.
+ *
+ * To support workloads that close and reopen the file frequently, these
+ * preallocations usually persist after a close unless it is the first
+ * close for the inode. This is a tradeoff to generate tightly packed
+ * data layouts for unpacking tarballs or similar archives that write
+ * one file after another without going back to it while keeping the
+ * preallocation for files that have recurring open/write/close cycles.
+ *
+ * This heuristic is skipped for inodes with the append-only flag as
+ * that flag is rather pointless for inodes written only once.
+ *
+ * There is no point in freeing blocks here for open but unlinked files
+ * as they will be taken care of by the inactivation path soon.
+ *
+ * When releasing a read-only context, don't flush data or trim post-EOF
+ * blocks. This prevents open/read/close workloads from removing EOF
+ * blocks that other writers depend upon to reduce fragmentation.
+ *
+ * Inodes on the zoned RT device never have preallocations, so skip
+ * taking the locks below.
+ */
+ if (!inode->i_nlink ||
+ !(file->f_mode & FMODE_WRITE) ||
+ (ip->i_diflags & XFS_DIFLAG_APPEND) ||
+ xfs_is_zoned_inode(ip))
+ return 0;
+
+ /*
+ * If we can't get the iolock just skip truncating the blocks past EOF
+ * because we could deadlock with the mmap_lock otherwise. We'll get
+ * another chance to drop them once the last reference to the inode is
+ * dropped, so we'll never leak blocks permanently.
+ */
+ if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (xfs_can_free_eofblocks(ip) &&
+ !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
+ xfs_free_eofblocks(ip);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+
+ return 0;
 }
 
 STATIC int
@@ -969,7 +1702,7 @@ xfs_file_readdir(
 * point we can change the ->readdir prototype to include the
 * buffer size. For now we use the current glibc buffer size.
 */
- bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
 
 return xfs_readdir(NULL, ip, ctx, bufsize);
 }
 
@@ -982,17 +1715,17 @@ xfs_file_llseek(
 {
 struct inode *inode = file->f_mapping->host;
 
- if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
+ if (xfs_is_shutdown(XFS_I(inode)->i_mount))
 return -EIO;
 
 switch (whence) {
 default:
 return generic_file_llseek(file, offset, whence);
 case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
 break;
 case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
 break;
 }
 
@@ -1001,140 +1734,186 @@ xfs_file_llseek(
 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 }
 
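The SEEK_HOLE/SEEK_DATA support wired up in xfs_file_llseek() above can be used from userspace to enumerate the data extents of a sparse file; a minimal sketch (file name arbitrary):

/* Hypothetical demo: alternate SEEK_DATA/SEEK_HOLE to print every
 * data extent; lseek() fails with ENXIO once past the last one. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "sparse.dat", O_RDONLY);
	off_t data = 0, hole;

	if (fd < 0)
		return 1;
	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		data = hole;
	}
	close(fd);
	return 0;
}
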
+static inline vm_fault_t
+xfs_dax_fault_locked(
+ struct vm_fault *vmf,
+ unsigned int order,
+ bool write_fault)
+{
+ vm_fault_t ret;
+ unsigned long pfn;
+
+ if (!IS_ENABLED(CONFIG_FS_DAX)) {
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
+ }
+ ret = dax_iomap_fault(vmf, order, &pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, order, pfn);
+ return ret;
+}
+
+static vm_fault_t
+xfs_dax_read_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ vm_fault_t ret;
+
+ trace_xfs_read_fault(ip, order);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ ret = xfs_dax_fault_locked(vmf, order, false);
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+ return ret;
+}
+
 /*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
- * mmap_sem (MM)
+ * mmap_lock (MM)
 * sb_start_pagefault(vfs, freeze)
- * i_mmaplock (XFS - truncate serialisation)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 * page_lock (MM)
 * i_lock (XFS - extent map serialisation)
 */
-
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
-STATIC int
-xfs_filemap_page_mkwrite(
- struct vm_fault *vmf)
+static vm_fault_t
+__xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order,
+ struct xfs_zone_alloc_ctx *ac)
 {
 struct inode *inode = file_inode(vmf->vma->vm_file);
- int ret;
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
+ vm_fault_t ret;
 
- trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+ trace_xfs_write_fault(ip, order);
 
 sb_start_pagefault(inode->i_sb);
 file_update_time(vmf->vma->vm_file);
 
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- if (IS_DAX(inode)) {
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
- } else {
- ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
- ret = block_page_mkwrite_return(ret);
+ /*
+ * Normally we only need the shared mmaplock, but if a reflink remap is
+ * in progress we take the exclusive lock to wait for the remap to
+ * finish before taking a write fault.
+ */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ lock_mode = XFS_MMAPLOCK_EXCL;
 }
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ if (IS_DAX(inode))
+ ret = xfs_dax_fault_locked(vmf, order, true);
+ else
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ ac);
+ xfs_iunlock(ip, lock_mode);
+ sb_end_pagefault(inode->i_sb);
 
 return ret;
+}
 
+static vm_fault_t
+xfs_write_fault_zoned(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ unsigned int len = folio_size(page_folio(vmf->page));
+ struct xfs_zone_alloc_ctx ac = { };
+ int error;
+ vm_fault_t ret;
+
+ /*
+ * This could over-allocate as it doesn't check for truncation.
+ *
+ * But as the overallocation is limited to less than a folio and will be
+ * released instantly that's just fine.
+ */
+ error = xfs_zoned_space_reserve(ip->i_mount,
+ XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
+ if (error < 0)
+ return vmf_fs_error(error);
+ ret = __xfs_write_fault(vmf, order, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
 return ret;
 }
 
-STATIC int
+static vm_fault_t
+xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
+ return xfs_write_fault_zoned(vmf, order);
+ return __xfs_write_fault(vmf, order, NULL);
+}
+
+static inline bool
+xfs_is_write_fault(
+ struct vm_fault *vmf)
+{
+ return (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
+}
+
+static vm_fault_t
 xfs_filemap_fault(
 struct vm_fault *vmf)
 {
 struct inode *inode = file_inode(vmf->vma->vm_file);
- int ret;
-
- trace_xfs_filemap_fault(XFS_I(inode));
 
 /* DAX can shortcut the normal fault path on write faults! */
- if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
- return xfs_filemap_page_mkwrite(vmf);
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- if (IS_DAX(inode))
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
- else
- ret = filemap_fault(vmf);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ if (IS_DAX(inode)) {
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, 0);
+ return xfs_dax_read_fault(vmf, 0);
+ }
 
- return ret;
+ trace_xfs_read_fault(XFS_I(inode), 0);
+ return filemap_fault(vmf);
 }
 
-/*
- * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
- * both read and write faults. Hence we need to handle both cases. There is no
- * ->huge_mkwrite callout for huge pages, so we have a single function here to
- * handle both cases here. @flags carries the information on the type of fault
- * occuring.
- */
-STATIC int
+static vm_fault_t
 xfs_filemap_huge_fault(
 struct vm_fault *vmf,
- enum page_entry_size pe_size)
+ unsigned int order)
 {
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct xfs_inode *ip = XFS_I(inode);
- int ret;
-
- if (!IS_DAX(inode))
+ if (!IS_DAX(file_inode(vmf->vma->vm_file)))
 return VM_FAULT_FALLBACK;
 
- trace_xfs_filemap_huge_fault(ip);
-
- if (vmf->flags & FAULT_FLAG_WRITE) {
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- }
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-
- if (vmf->flags & FAULT_FLAG_WRITE)
- sb_end_pagefault(inode->i_sb);
+ /* DAX can shortcut the normal fault path on write faults! */
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, order);
+ return xfs_dax_read_fault(vmf, order);
+}
 
- return ret;
+static vm_fault_t
+xfs_filemap_page_mkwrite(
+ struct vm_fault *vmf)
+{
+ return xfs_write_fault(vmf, 0);
 }
 
 /*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing, so handle it as a standard write fault.
 */
-static int
+static vm_fault_t
 xfs_filemap_pfn_mkwrite(
 struct vm_fault *vmf)
 {
-
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct xfs_inode *ip = XFS_I(inode);
- int ret = VM_FAULT_NOPAGE;
- loff_t size;
-
- trace_xfs_filemap_pfn_mkwrite(ip);
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
-
- /* check if the faulting page hasn't raced with truncate */
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else if (IS_DAX(inode))
- ret = dax_pfn_mkwrite(vmf);
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
- return ret;
-
+ return xfs_write_fault(vmf, 0);
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1146,14 +1925,25 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
 };
 
 STATIC int
-xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
+xfs_file_mmap_prepare(
+ struct vm_area_desc *desc)
 {
- file_accessed(filp);
- vma->vm_ops = &xfs_file_vm_ops;
- if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ struct file *file = desc->file;
+ struct inode *inode = file_inode(file);
+ struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
+
+ /*
+ * We don't support synchronous mappings for non-DAX files, or for DAX
+ * files when the underlying dax_device is not synchronous.
+ */
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
+ target->bt_daxdev))
+ return -EOPNOTSUPP;
+
+ file_accessed(file);
+ desc->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(inode))
+ desc->vm_flags |= VM_HUGEPAGE;
 return 0;
 }
 
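daxdev_mapping_supported() above is what decides whether an mmap(MAP_SYNC) request can be honoured. A sketch of the userspace side (the pmem mount point is an assumed example; MAP_SYNC and MAP_SHARED_VALIDATE need glibc 2.28+, and the mapping fails with EOPNOTSUPP unless the file is DAX on a synchronous dax_device):

/* Hypothetical demo: with MAP_SYNC, stores become persistent with CPU
 * cache flushes alone, no fsync() needed for the mapped data. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/pmem/log", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	/* MAP_SYNC is only valid together with MAP_SHARED_VALIDATE. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap"); /* EOPNOTSUPP on non-DAX setups */
		return 1;
	}
	memcpy(p, "hello", 5);
	munmap(p, 4096);
	close(fd);
	return 0;
}
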
@@ -1161,20 +1951,24 @@ const struct file_operations xfs_file_operations = {
 .llseek = xfs_file_llseek,
 .read_iter = xfs_file_read_iter,
 .write_iter = xfs_file_write_iter,
- .splice_read = generic_file_splice_read,
+ .splice_read = xfs_file_splice_read,
 .splice_write = iter_file_splice_write,
+ .iopoll = iocb_bio_iopoll,
 .unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
 .compat_ioctl = xfs_file_compat_ioctl,
#endif
- .mmap = xfs_file_mmap,
+ .mmap_prepare = xfs_file_mmap_prepare,
 .open = xfs_file_open,
 .release = xfs_file_release,
 .fsync = xfs_file_fsync,
 .get_unmapped_area = thp_get_unmapped_area,
 .fallocate = xfs_file_fallocate,
- .clone_file_range = xfs_file_clone_range,
- .dedupe_file_range = xfs_file_dedupe_range,
+ .fadvise = xfs_file_fadvise,
+ .remap_file_range = xfs_file_remap_range,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
 };
 
 const struct file_operations xfs_dir_file_operations = {