Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--  fs/xfs/xfs_file.c | 2628
1 file changed, 1584 insertions, 1044 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index de3dc98f4e8f..6108612182e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1,129 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_vnodeops.h" -#include "xfs_da_btree.h" -#include "xfs_dir2_format.h" +#include "xfs_bmap_util.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" +#include "xfs_iomap.h" +#include "xfs_reflink.h" +#include "xfs_file.h" +#include "xfs_aops.h" +#include "xfs_zone_alloc.h" +#include "xfs_error.h" +#include "xfs_errortag.h" -#include <linux/aio.h> -#include <linux/dcache.h> +#include <linux/dax.h> #include <linux/falloc.h> -#include <linux/pagevec.h> +#include <linux/backing-dev.h> +#include <linux/mman.h> +#include <linux/fadvise.h> +#include <linux/mount.h> static const struct vm_operations_struct xfs_file_vm_ops; /* - * Locking primitives for read and write IO paths to ensure we consistently use - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + * Decide if the given file range is aligned to the size of the fundamental + * allocation unit for the file. */ -static inline void -xfs_rw_ilock( - struct xfs_inode *ip, - int type) -{ - if (type & XFS_IOLOCK_EXCL) - mutex_lock(&VFS_I(ip)->i_mutex); - xfs_ilock(ip, type); -} - -static inline void -xfs_rw_iunlock( +bool +xfs_is_falloc_aligned( struct xfs_inode *ip, - int type) + loff_t pos, + long long int len) { - xfs_iunlock(ip, type); - if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); -} + unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip); -static inline void -xfs_rw_ilock_demote( - struct xfs_inode *ip, - int type) -{ - xfs_ilock_demote(ip, type); - if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); -} + if (!is_power_of_2(alloc_unit)) + return isaligned_64(pos, alloc_unit) && + isaligned_64(len, alloc_unit); -/* - * xfs_iozero - * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. 
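A minimal userspace sketch of the alignment test that the new xfs_is_falloc_aligned() above performs: for a power-of-two allocation unit, x % unit equals x & (unit - 1), and OR-ing pos and len lets a single mask test cover both values. is_aligned_div() is a hypothetical stand-in for the kernel's isaligned_64(); the 4096-byte unit is an assumption.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_aligned_div(uint64_t x, unsigned int unit)
{
	return x % unit == 0;	/* generic path, needs a division */
}

static bool is_falloc_aligned(uint64_t pos, uint64_t len, unsigned int unit)
{
	if (unit & (unit - 1))	/* not a power of two, e.g. RT extent sizes */
		return is_aligned_div(pos, unit) && is_aligned_div(len, unit);
	return !((pos | len) & (unit - 1));	/* one mask test for both */
}

int main(void)
{
	printf("%d\n", is_falloc_aligned(8192, 4096, 4096));	/* 1 */
	printf("%d\n", is_falloc_aligned(8192, 4095, 4096));	/* 0 */
	return 0;
}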
- */ -int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ -{ - struct page *page; - struct address_space *mapping; - int status; - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - pos += bytes; - count -= bytes; - status = 0; - } while (count); - - return (-status); + return !((pos | len) & (alloc_unit - 1)); } /* @@ -140,19 +72,52 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); + return xfs_log_force_inode(ip); +} - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); +/* + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number that holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. + * + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. 
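The split between the datasync and full commit sequences described above is what can make fdatasync() cheaper than fsync(). A hedged userspace sketch of the case it optimises, an in-place overwrite that dirties no metadata beyond timestamps:

#include <unistd.h>

/* Overwrite in place, then make the data durable. fdatasync() only needs
 * the datasync sequence described above; fsync() would use the full commit
 * sequence and may force more of the journal. Error handling elided. */
int overwrite_durably(int fd, const void *buf, size_t len, off_t off)
{
	if (pwrite(fd, buf, len, off) != (ssize_t)len)
		return -1;
	return fdatasync(fd);
}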
+ */ +static int +xfs_fsync_flush_log( + struct xfs_inode *ip, + bool datasync, + int *log_flushed) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - if (!lsn) + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + + if (!seq) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -162,52 +127,42 @@ xfs_file_fsync( loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(file->f_mapping->host); struct xfs_mount *mp = ip->i_mount; - int error = 0; + int error, err2; int log_flushed = 0; - xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); - error = filemap_write_and_wait_range(inode->i_mapping, start, end); + error = file_write_and_wait_range(file, start, end); if (error) return error; - if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + if (xfs_is_shutdown(mp)) + return -EIO; xfs_iflags_clear(ip, XFS_ITRUNCATED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { - /* - * If we have an RT and/or log subvolume we need to make sure - * to flush the write cache the device used for file data - * first. This is to ensure newly written file data make - * it to disk before logging the new inode size in case of - * an extending write. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); - else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); - } + /* + * If we have an RT and/or log subvolume we need to make sure to flush + * the write cache of the device used for file data first. This is to + * ensure newly written file data make it to disk before logging the new + * inode size in case of an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) + error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * All metadata updates are logged, which means that we just have - * to flush the log up to the latest LSN that touched the inode. + * If the inode has an inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking. */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - if (!datasync || - (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = ip->i_itemp->ili_last_lsn; + if (ip->i_itemp) { + err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); + if (err2 && !error) + error = err2; } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (lsn) - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); /* * If we only have a single device, and the log force above was @@ -216,660 +171,1407 @@ xfs_file_fsync( * an already allocated file and thus do not have any metadata to * commit. 
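A standalone sketch of the pre-flush decision the new fsync code above makes, with booleans standing in for the XFS_IS_REALTIME_INODE() and buftarg comparisons; all names here are illustrative only:

#include <stdbool.h>

enum preflush_target { PREFLUSH_NONE, PREFLUSH_RTDEV, PREFLUSH_DATADEV };

/* Flush the cache of whichever device holds file data, but only when it
 * is a different device from the one the log flush will hit anyway. */
static enum preflush_target
pick_preflush(bool rt_inode, bool rtdev_is_datadev, bool logdev_is_datadev)
{
	if (rt_inode && !rtdev_is_datadev)
		return PREFLUSH_RTDEV;
	if (!logdev_is_datadev)
		return PREFLUSH_DATADEV;
	return PREFLUSH_NONE;
}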
*/ - if ((mp->m_flags & XFS_MOUNT_BARRIER) && - mp->m_logdev_targp == mp->m_ddev_targp && - !XFS_IS_REALTIME_INODE(ip) && - !log_flushed) - xfs_blkdev_issue_flush(mp->m_ddev_targp); + if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && + mp->m_logdev_targp == mp->m_ddev_targp) { + err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + if (err2 && !error) + error = err2; + } - return -error; + return error; } -STATIC ssize_t -xfs_file_aio_read( +static int +xfs_ilock_iocb( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + unsigned int lock_mode) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = 0; - ssize_t ret = 0; - int ioflags = 0; - xfs_fsize_t n; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); - XFS_STATS_INC(xs_read_calls); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, lock_mode)) + return -EAGAIN; + } else { + xfs_ilock(ip, lock_mode); + } - BUG_ON(iocb->ki_pos != pos); + return 0; +} - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; +static int +xfs_ilock_iocb_for_write( + struct kiocb *iocb, + unsigned int *lock_mode) +{ + ssize_t ret; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); - ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); - if (ret < 0) + ret = xfs_ilock_iocb(iocb, *lock_mode); + if (ret) return ret; - if (unlikely(ioflags & IO_ISDIRECT)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (size & target->bt_smask)) { - if (pos == i_size_read(inode)) - return 0; - return -XFS_ERROR(EINVAL); - } + /* + * If a reflink remap is in progress we always need to take the iolock + * exclusively to wait for it to finish. + */ + if (*lock_mode == XFS_IOLOCK_SHARED && + xfs_iflags_test(ip, XFS_IREMAPPING)) { + xfs_iunlock(ip, *lock_mode); + *lock_mode = XFS_IOLOCK_EXCL; + return xfs_ilock_iocb(iocb, *lock_mode); } - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; + return 0; +} - if (n < size) - size = n; +STATIC ssize_t +xfs_file_dio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; + trace_xfs_file_direct_read(iocb, to); - /* - * Locking is a bit tricky here. If we take an exclusive lock - * for direct IO, we effectively serialise all new concurrent - * read IO to this file and block it behind IO that is currently in - * progress because IO in progress holds the IO lock shared. We only - * need to hold the lock exclusive to blow away the page cache, so - * only take lock exclusively if the page cache needs invalidation. - * This allows the normal direct IO case of no page cache pages to - * proceeed concurrently without serialisation. 
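The IOCB_NOWAIT branch in xfs_ilock_iocb() above is what backs RWF_NOWAIT from userspace: if taking the shared iolock (or anything further down the stack) would block, the request fails with EAGAIN instead of sleeping. A minimal example using the real preadv2() interface (Linux 4.14+, glibc 2.26+):

#define _GNU_SOURCE
#include <errno.h>
#include <sys/uio.h>

ssize_t read_nowait(int fd, void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	ssize_t ret = preadv2(fd, &iov, 1, off, RWF_NOWAIT);

	if (ret < 0 && errno == EAGAIN) {
		/* lock or I/O would have blocked; fall back to a blocking
		 * read, typically issued from a worker thread */
		ret = preadv2(fd, &iov, 1, off, 0);
	}
	return ret;
}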
- */ - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); - - if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - pos, -1); - if (ret) { - xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; - } - truncate_pagecache_range(VFS_I(ip), pos, -1); - } - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - } + if (!iov_iter_count(to)) + return 0; /* skip atime */ - trace_xfs_file_read(ip, size, pos, ioflags); + file_accessed(iocb->ki_filp); - ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +static noinline ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + ssize_t ret = 0; + + trace_xfs_file_dax_read(iocb, to); + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; + ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); return ret; } STATIC ssize_t -xfs_file_splice_read( - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, - unsigned int flags) +xfs_file_buffered_read( + struct kiocb *iocb, + struct iov_iter *to) { - struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); ssize_t ret; - XFS_STATS_INC(xs_read_calls); + trace_xfs_file_buffered_read(iocb, to); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; + ret = generic_file_read_iter(iocb, to); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; + ssize_t ret = 0; - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + XFS_STATS_INC(mp, xs_read_calls); - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + if (xfs_is_shutdown(mp)) + return -EIO; - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) + ret = xfs_file_dio_read(iocb, to); + else + ret = xfs_file_buffered_read(iocb, to); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + if (ret > 0) + XFS_STATS_ADD(mp, xs_read_bytes, ret); return ret; } -/* - * xfs_file_splice_write() does not use xfs_rw_ilock() because - * generic_file_splice_write() takes the i_mutex itself. This, in theory, - * couuld cause lock inversions between the aio_write path and the splice path - * if someone is doing concurrent splice(2) based writes and write(2) based - * writes to the same inode. The only real way to fix this is to re-implement - * the generic code here with correct locking orders. 
- */ STATIC ssize_t -xfs_file_splice_write( - struct pipe_inode_info *pipe, - struct file *outfilp, +xfs_file_splice_read( + struct file *in, loff_t *ppos, - size_t count, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { - struct inode *inode = outfilp->f_mapping->host; + struct inode *inode = file_inode(in); struct xfs_inode *ip = XFS_I(inode); - int ioflags = 0; - ssize_t ret; - - XFS_STATS_INC(xs_write_calls); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; - if (outfilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + XFS_STATS_INC(mp, xs_read_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(mp)) return -EIO; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + trace_xfs_file_splice_read(ip, *ppos, len); - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = filemap_splice_read(in, ppos, pipe, len, flags); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + XFS_STATS_ADD(mp, xs_read_bytes, ret); return ret; } /* - * This routine is called to handle zeroing any space in the last block of the - * file that is beyond the EOF. We do this since the size is being increased - * without writing anything to that block and we don't want to read the - * garbage on the disk. + * Take care of zeroing post-EOF blocks when they might exist. + * + * Returns 0 if successfully, a negative error for a failure, or 1 if this + * function dropped the iolock and reacquired it exclusively and the caller + * needs to restart the write sanity checks. */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct xfs_inode *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio, + struct xfs_zone_alloc_ctx *ac) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); - int zero_offset = XFS_B_FSB_OFFSET(mp, isize); - int zero_len; - int nimaps = 1; - int error = 0; - struct xfs_bmbt_irec imap; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + int error; /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. */ - if (imap.br_startblock == HOLESTARTBLOCK) + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. 
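Because a shared lock cannot be upgraded atomically, xfs_file_write_zero_eof() above drops the iolock, retakes it exclusive, and returns 1 so the caller restarts its checks from scratch. The same pattern reduced to a generic pthreads sketch (illustrative names, not kernel code):

#include <pthread.h>
#include <stdbool.h>

/* Returns with the write lock held; *restart tells the caller to redo
 * every check it made under the shared lock, because another writer may
 * have run in the window between unlock and wrlock. */
static void upgrade_to_exclusive(pthread_rwlock_t *lock, bool *restart)
{
	pthread_rwlock_unlock(lock);
	pthread_rwlock_wrlock(lock);
	*restart = true;
}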
+ */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. + */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - return xfs_iozero(ip, isize, zero_len); + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + + return error; } /* - * Zero any on disk space between the current EOF and the new, larger EOF. - * - * This handles the normal case of zeroing the remainder of the last block in - * the file and the unusual case of zeroing blocks out beyond the size of the - * file. This second case only happens with fixed size extents and when the - * system crashes before the inode size was updated but after blocks were - * allocated. + * Common pre-write limit and setup checks. * - * Expects the iolock to be held exclusive, and will take the ilock internally. + * Called with the iolock held either shared and exclusive according to + * @iolock, and returns with it held. Might upgrade the iolock to exclusive + * if called for a direct write beyond i_size. */ -int /* error (positive) */ -xfs_zero_eof( - struct xfs_inode *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ +STATIC ssize_t +xfs_file_write_checks( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + struct xfs_zone_alloc_ctx *ac) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - struct xfs_bmbt_irec imap; - - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(offset > isize); + struct inode *inode = iocb->ki_filp->f_mapping->host; + size_t count = iov_iter_count(from); + bool drained_dio = false; + ssize_t error; + +restart: + error = generic_write_checks(iocb, from); + if (error <= 0) + return error; + + if (iocb->ki_flags & IOCB_NOWAIT) { + error = break_layout(inode, false); + if (error == -EWOULDBLOCK) + error = -EAGAIN; + } else { + error = xfs_break_layouts(inode, iolock, BREAK_WRITE); + } + + if (error) + return error; /* - * First handle zeroing the block on which isize resides. + * For changing security info in file_remove_privs() we need i_rwsem + * exclusively. + */ + if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + xfs_iunlock(XFS_I(inode), *iolock); + *iolock = XFS_IOLOCK_EXCL; + error = xfs_ilock_iocb(iocb, *iolock); + if (error) { + *iolock = 0; + return error; + } + goto restart; + } + + /* + * If the offset is beyond the size of the file, we need to zero all + * blocks that fall between the existing EOF and the start of this + * write. * - * We only zero a part of that block so it is handled specially. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. 
Hence we only need to take + * the slow path when we are at or beyond the current EOF. */ - if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize); + if (iocb->ki_pos > i_size_read(inode)) { + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio, ac); + if (error == 1) + goto restart; if (error) return error; } + return kiocb_modified(iocb); +} + +static ssize_t +xfs_zoned_write_space_reserve( + struct xfs_mount *mp, + struct kiocb *iocb, + struct iov_iter *from, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + loff_t count = iov_iter_count(from); + int error; + + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= XFS_ZR_NOWAIT; + /* - * Calculate the range between the new size and the old where blocks - * needing to be zeroed may exist. + * Check the rlimit and LFS boundary first so that we don't over-reserve + * by possibly a lot. * - * To get the block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back to a block - * boundary. We subtract 1 in case the size is exactly on a block - * boundary. + * The generic write path will redo this check later, and it might have + * changed by then. If it got expanded we'll stick to our earlier + * smaller limit, and if it is decreased the new smaller limit will be + * used and our extra space reservation will be returned after finishing + * the write. */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); + if (error) + return error; + + /* + * Sloppily round up count to file system blocks. + * + * This will often reserve an extra block, but that avoids having to look + * at the start offset, which isn't stable for O_APPEND until taking the + * iolock. Also we need to reserve a block each for zeroing the old + * EOF block and the new start block if they are unaligned. + * + * Any remaining block will be returned after the write. + */ + return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2, + flags, ac); +} + +static int +xfs_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + int error, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + loff_t offset = iocb->ki_pos; + unsigned int nofs_flag; + + ASSERT(!xfs_is_zoned_inode(ip) || + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + + trace_xfs_end_io_direct_write(ip, offset, size); + + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; + + if (error) + return error; + if (!size) return 0; - } - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + /* + * Capture amount written on completion as we can't reliably account + * for it on submission. + */ + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + + /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. 
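A worked example of the reservation sizing in xfs_zoned_write_space_reserve() earlier in this hunk, assuming 4k filesystem blocks: a 10000-byte write covers 3 blocks of data, and the code adds the three slack blocks the comment describes.

#include <stdio.h>

#define FSB_SHIFT 12	/* assume 4k filesystem blocks */

static unsigned long long b_to_fsb(unsigned long long bytes)
{
	return (bytes + (1ULL << FSB_SHIFT) - 1) >> FSB_SHIFT;	/* round up */
}

int main(void)
{
	/* +1 for the offset straddle the sloppy rounding ignores, +2 for
	 * zeroing the old EOF block and an unaligned new start block */
	printf("reserve %llu blocks\n", b_to_fsb(10000) + 1 + 2);	/* 6 */
	return 0;
}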
+ */ + nofs_flag = memalloc_nofs_save(); - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, - &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (flags & IOMAP_DIO_COW) { + if (iocb->ki_flags & IOCB_ATOMIC) + error = xfs_reflink_end_atomic_cow(ip, offset, size); + else + error = xfs_reflink_end_cow(ip, offset, size); if (error) - return error; + goto out; + } - ASSERT(nimaps > 0); + /* + * Unwritten conversion updates the in-core isize after extent + * conversion but before updating the on-disk size. Updating isize any + * earlier allows a racing dio read to find unwritten extents before + * they are converted. + */ + if (flags & IOMAP_DIO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, offset, size, true); + goto out; + } - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. + */ + if (offset + size <= i_size_read(inode)) + goto out; - /* - * There are blocks we need to zero. 
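The EOF update just above is a classic double-checked pattern: an unlocked i_size comparison filters the common case, and the re-check under the spinlock prevents EOF from ever moving backwards. A generic pthreads sketch, ignoring the atomic-load details that i_size_read() handles in the kernel:

#include <pthread.h>
#include <stdint.h>

struct eof { pthread_spinlock_t lock; uint64_t size; };

static void extend_eof(struct eof *e, uint64_t end)
{
	if (end <= e->size)		/* unlocked check: EOF only grows,
					 * so a stale read is harmless */
		return;
	pthread_spin_lock(&e->lock);
	if (end > e->size)		/* re-check; EOF may have moved */
		e->size = end;
	pthread_spin_unlock(&e->lock);
}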
- */ - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) { + i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); + error = xfs_setfilesize(ip, offset, size); + } else { + spin_unlock(&ip->i_flags_lock); + } - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; +out: + memalloc_nofs_restore(nofs_flag); + return error; +} - error = xfs_iozero(ip, zero_off, zero_len); - if (error) - return error; +static const struct iomap_dio_ops xfs_dio_write_ops = { + .end_io = xfs_dio_write_end_io, +}; - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); +static void +xfs_dio_zoned_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; + struct xfs_zone_alloc_ctx *ac = iter->private; + xfs_filblks_t count_fsb; + struct iomap_ioend *ioend; + + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); + if (count_fsb > ac->reserved_blocks) { + xfs_err(mp, +"allocation (%lld) larger than reservation (%lld).", + count_fsb, ac->reserved_blocks); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + bio_io_error(bio); + return; } + ac->reserved_blocks -= count_fsb; - return 0; + bio->bi_end_io = xfs_end_bio; + ioend = iomap_init_ioend(iter->inode, bio, file_offset, + IOMAP_IOEND_DIRECT); + xfs_zone_alloc_and_submit(ioend, &ac->open_zone); } +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { + .bio_set = &iomap_ioend_bioset, + .submit_io = xfs_dio_zoned_submit_io, + .end_io = xfs_dio_write_end_io, +}; + /* - * Common pre-write limit and setup checks. - * - * Called with the iolocked held either shared and exclusive according to - * @iolock, and returns with it held. Might upgrade the iolock to exclusive - * if called for a direct write beyond i_size. + * Handle block aligned direct I/O writes. */ -STATIC ssize_t -xfs_file_aio_write_checks( - struct file *file, - loff_t *pos, - size_t *count, - int *iolock) +static noinline ssize_t +xfs_file_dio_write_aligned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from, + const struct iomap_ops *ops, + const struct iomap_dio_ops *dops, + struct xfs_zone_alloc_ctx *ac) { - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - int error = 0; + unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; + ssize_t ret; -restart: - error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); - if (error) - return error; + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + ret = xfs_file_write_checks(iocb, from, &iolock, ac); + if (ret) + goto out_unlock; /* - * If the offset is beyond the size of the file, we need to zero any - * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which implies - * having to redo all checks before. 
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote + * the iolock back to shared if we had to take the exclusive lock in + * xfs_file_write_checks() for other reasons. */ - if (*pos > i_size_read(inode)) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); - goto restart; - } - error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); - if (error) - return error; + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; } + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); +out_unlock: + xfs_iunlock(ip, iolock); + return ret; +} + +/* + * Handle block aligned direct I/O writes to zoned devices. + */ +static noinline ssize_t +xfs_file_dio_write_zoned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac); + if (ret < 0) + return ret; + ret = xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_zoned_direct_write_iomap_ops, + &xfs_dio_zoned_write_ops, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); + return ret; +} + +/* + * Handle block atomic writes + * + * Two methods of atomic writes are supported: + * - REQ_ATOMIC-based, which would typically use some form of HW offload in the + * disk + * - COW-based, which uses a COW fork as a staging extent for data updates + * before atomically updating extent mappings for the range being written + * + */ +static noinline ssize_t +xfs_file_dio_write_atomic( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + unsigned int iolock = XFS_IOLOCK_SHARED; + ssize_t ret, ocount = iov_iter_count(from); + const struct iomap_ops *dops; /* - * Updating the timestamps will grab the ilock again from - * xfs_fs_dirty_inode, so we have to call it after dropping the - * lock above. Eventually we should look into a way to avoid - * the pointless lock roundtrip. + * HW offload should be faster, so try that first if it is already + * known that the write length is not too large. */ - if (likely(!(file->f_mode & FMODE_NOCMTIME))) { - error = file_update_time(file); - if (error) - return error; + if (ocount > xfs_inode_buftarg(ip)->bt_awu_max) + dops = &xfs_atomic_write_cow_iomap_ops; + else + dops = &xfs_direct_write_iomap_ops; + +retry: + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out_unlock; + + /* Demote similar to xfs_file_dio_write_aligned() */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; } + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, + 0, NULL, 0); + /* - * If we're writing the file then make sure to clear the setuid and - * setgid bits if the process is not being run by root. This keeps - * people from modifying setuid and setgid binaries. + * The retry mechanism is based on the ->iomap_begin method returning + * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not + * possible. The REQ_ATOMIC-based method typically not be possible if + * the write spans multiple extents or the disk blocks are misaligned. 
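From userspace, the atomic write path above is reached with pwritev2() and RWF_ATOMIC (Linux 6.11+ headers required): the I/O must be a single segment whose size and alignment fall within the stx_atomic_write_unit_min/max limits reported by statx(). A minimal sketch:

#define _GNU_SOURCE
#include <sys/uio.h>

ssize_t write_atomic(int fd, void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	/* all-or-nothing: either every byte reaches the file or none do;
	 * len and off must respect the statx() atomic write unit limits */
	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}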
*/ - return file_remove_suid(file); + if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { + xfs_iunlock(ip, iolock); + dops = &xfs_atomic_write_cow_iomap_ops; + goto retry; + } + +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); + return ret; } /* - * xfs_file_dio_aio_write - handle direct IO writes + * Handle block unaligned direct I/O writes * - * Lock the inode appropriately to prepare for and issue a direct IO write. - * By separating it from the buffered write path we remove all the tricky to - * follow locking changes and looping. + * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing + * them to be done in parallel with reads and other direct I/O writes. However, + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need + * to do sub-block zeroing and that requires serialisation against other direct + * I/O to the same block. In this case we need to serialise the submission of + * the unaligned I/O so that we don't get racing block zeroing in the dio layer. + * In the case where sub-block zeroing is not required, we can do concurrent + * sub-block dios to the same block successfully. * - * If there are cached pages or we're extending the file, we need IOLOCK_EXCL - * until we're sure the bytes at the new EOF have been zeroed and/or the cached - * pages are flushed out. - * - * In most cases the direct IO writes will be done holding IOLOCK_SHARED - * allowing them to be done in parallel with reads and other direct IO writes. - * However, if the IO is not aligned to filesystem blocks, the direct IO layer - * needs to do sub-block zeroing and that requires serialisation against other - * direct IOs to the same block. In this case we need to serialise the - * submission of the unaligned IOs so that we don't get racing block zeroing in - * the dio layer. To avoid the problem with aio, we also need to wait for - * outstanding IOs to complete so that unwritten extent conversion is completed - * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. inode_dio_wait()). - * - * Returns with locks held indicated by @iolock and errors indicated by - * negative return values. + * Optimistically submit the I/O using the shared lock first, but use the + * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN + * if block allocation or partial block zeroing would be required. In that case + * we try again with the exclusive lock. */ -STATIC ssize_t -xfs_file_dio_aio_write( +static noinline ssize_t +xfs_file_dio_write_unaligned( + struct xfs_inode *ip, struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t ocount) + struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - size_t count = ocount; - int unaligned_io = 0; - int iolock; - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) - return -XFS_ERROR(EINVAL); - - if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) - unaligned_io = 1; + size_t isize = i_size_read(VFS_I(ip)); + size_t count = iov_iter_count(from); + unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; + ssize_t ret; /* - * We don't need to take an exclusive lock unless there page cache needs - * to be invalidated or unaligned IO is being executed. We don't need to - * consider the EOF extension case here because - * xfs_file_aio_write_checks() will relock the inode as necessary for - * EOF zeroing cases and fill out the new inode size as appropriate. + * Extending writes need exclusivity because of the sub-block zeroing + * that the DIO code always does for partial tail blocks beyond EOF, so + * don't even bother trying the fast path in this case. */ - if (unaligned_io || mapping->nrpages) + if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; +retry_exclusive: iolock = XFS_IOLOCK_EXCL; - else - iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, iolock); + flags = IOMAP_DIO_FORCE_WAIT; + } + + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; /* - * Recheck if there are cached pages that need invalidate after we got - * the iolock to protect against other threads adding new pages while - * we were waiting for the iolock. + * We can't properly handle unaligned direct I/O to reflink files yet, + * as we can't unshare a partial block. */ - if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, iolock); - iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + if (xfs_is_cow_inode(ip)) { + trace_xfs_reflink_bounce_dio_write(iocb, from); + ret = -ENOTBLK; + goto out_unlock; } - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) - goto out; + goto out_unlock; - if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, -1); - if (ret) - goto out; - truncate_pagecache_range(VFS_I(ip), pos, -1); - } + /* + * If we are doing exclusive unaligned I/O, this must be the only I/O + * in-flight. Otherwise we risk data corruption due to unwritten extent + * conversions from the AIO end_io handler. Wait for all other I/O to + * drain first. + */ + if (flags & IOMAP_DIO_FORCE_WAIT) + inode_dio_wait(VFS_I(ip)); + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, + &xfs_dio_write_ops, flags, NULL, 0); /* - * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to flush cached pages + * Retry unaligned I/O with exclusive blocking semantics if the DIO + * layer rejected it for mapping or locking reasons. If we are doing + * nonblocking user I/O, propagate the error. 
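The bt_logical_sectormask check in xfs_file_dio_write() above is the kernel side of the usual O_DIRECT contract: file offset, length, and buffer address must all be aligned to the device logical sector size. A userspace sketch assuming a 4k sector size, with error unwinding kept minimal:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define SECTOR 4096	/* assumed logical sector size */

int dio_write_one_block(const char *path)
{
	void *buf;
	ssize_t ret;
	int fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0)
		return -1;
	if (posix_memalign(&buf, SECTOR, SECTOR)) {	/* aligned buffer */
		close(fd);
		return -1;
	}
	memset(buf, 0xab, SECTOR);
	/* offset 0, length SECTOR, and buf are all sector aligned */
	ret = pwrite(fd, buf, SECTOR, 0);
	free(buf);
	close(fd);
	return ret == SECTOR ? 0 : -1;
}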
*/ - if (unaligned_io) - inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) { + ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY); + xfs_iunlock(ip, iolock); + goto retry_exclusive; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); + return ret; +} +static ssize_t +xfs_file_dio_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + size_t count = iov_iter_count(from); + + /* direct I/O must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) + return -EINVAL; + + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + return xfs_file_dio_write_unaligned(ip, iocb, from); + if (xfs_is_zoned_inode(ip)) + return xfs_file_dio_write_zoned(ip, iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) + return xfs_file_dio_write_atomic(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); +} + +static noinline ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + unsigned int iolock = XFS_IOLOCK_EXCL; + ssize_t ret, error = 0; + loff_t pos; + + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out; + + pos = iocb->ki_pos; + + trace_xfs_file_dax_write(iocb, from); + ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + error = xfs_setfilesize(ip, pos, ret); + } out: - xfs_rw_iunlock(ip, iolock); + if (iolock) + xfs_iunlock(ip, iolock); + if (error) + return error; + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); - /* No fallback to buffered IO on errors for XFS. 
*/ - ASSERT(ret < 0 || ret == count); + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } return ret; } STATIC ssize_t -xfs_file_buffered_aio_write( +xfs_file_buffered_write( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t ocount) + struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; - size_t count = ocount; + bool cleared_space = false; + unsigned int iolock; - xfs_rw_ilock(ip, iolock); +write_retry: + iolock = XFS_IOLOCK_EXCL; + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; - -write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, 0); + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + NULL); /* - * If we just got an ENOSPC, try to write back all dirty inodes to - * convert delalloc space to free up some of the excess reserved - * metadata space. + * If we hit a space limit, try to free up some lingering preallocated + * space before returning an error. In the case of ENOSPC, first try to + * write back all dirty inodes to free up some of the excess reserved + * metadata space. This reduces the chances that the eofblocks scan + * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this + * also behaves as a filter to prevent too many eofblocks scans from + * running at the same time. Use a synchronous scan to increase the + * effectiveness of the scan. 
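The EDQUOT/ENOSPC handling described above reduces to a retry-exactly-once loop around the locked write. A skeletal sketch with hypothetical helpers standing in for the blockgc and inode-flush calls:

#include <errno.h>
#include <stdbool.h>

int do_locked_write(void);		/* assumed helpers, not real APIs */
void reclaim_preallocated_space(void);

int write_with_reclaim_retry(void)
{
	bool cleared_space = false;
	int ret;

retry:
	ret = do_locked_write();
	if ((ret == -ENOSPC || ret == -EDQUOT) && !cleared_space) {
		cleared_space = true;		/* retry exactly once */
		reclaim_preallocated_space();	/* free lingering prealloc */
		goto retry;
	}
	return ret;
}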
*/ - if (ret == -ENOSPC && !enospc) { - enospc = 1; + if (ret == -EDQUOT && !cleared_space) { + xfs_iunlock(ip, iolock); + xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC); + cleared_space = true; + goto write_retry; + } else if (ret == -ENOSPC && !cleared_space) { + struct xfs_icwalk icw = {0}; + + cleared_space = true; xfs_flush_inodes(ip->i_mount); + + xfs_iunlock(ip, iolock); + icw.icw_flags = XFS_ICWALK_FLAG_SYNC; + xfs_blockgc_free_space(ip->i_mount, &icw); goto write_retry; } - current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + if (iolock) + xfs_iunlock(ip, iolock); + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } return ret; } STATIC ssize_t -xfs_file_aio_write( +xfs_file_buffered_write_zoned( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + unsigned int iolock = XFS_IOLOCK_EXCL; + bool cleared_space = false; + struct xfs_zone_alloc_ctx ac = { }; ssize_t ret; - size_t ocount = 0; - XFS_STATS_INC(xs_write_calls); + ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac); + if (ret < 0) + return ret; - BUG_ON(iocb->ki_pos != pos); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + goto out_unreserve; - ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); if (ret) - return ret; + goto out_unlock; + + /* + * Truncate the iter to the length that we were actually able to + * allocate blocks for. This needs to happen after + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND + * writes. + */ + iov_iter_truncate(from, + XFS_FSB_TO_B(mp, ac.reserved_blocks) - + (iocb->ki_pos & mp->m_blockmask)); + if (!iov_iter_count(from)) + goto out_unlock; + +retry: + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + &ac); + if (ret == -ENOSPC && !cleared_space) { + /* + * Kick off writeback to convert delalloc space and release the + * usually too pessimistic indirect block reservations. 
+ */ + xfs_flush_inodes(mp); + cleared_space = true; + goto retry; + } + +out_unlock: + xfs_iunlock(ip, iolock); +out_unreserve: + xfs_zoned_space_unreserve(ip->i_mount, &ac); + if (ret > 0) { + XFS_STATS_ADD(mp, xs_write_bytes, ret); + ret = generic_write_sync(iocb, ret); + } + return ret; +} + +STATIC ssize_t +xfs_file_write_iter( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + size_t ocount = iov_iter_count(from); + + XFS_STATS_INC(ip->i_mount, xs_write_calls); if (ocount == 0) return 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ret = -EIO; - goto out; - } + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; - if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); - else - ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount); + if (iocb->ki_flags & IOCB_ATOMIC) { + if (ocount < xfs_get_atomic_write_min(ip)) + return -EINVAL; - if (ret > 0) { - ssize_t err; + if (ocount > xfs_get_atomic_write_max(ip)) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } - XFS_STATS_ADD(xs_write_bytes, ret); + if (IS_DAX(inode)) + return xfs_file_dax_write(iocb, from); - /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); - if (err < 0) - ret = err; + if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Allow a directio write to fall back to a buffered + * write *only* in the case that we're doing a reflink + * CoW. In all other directio scenarios we do not + * allow an operation to fall back to buffered mode. + */ + ret = xfs_file_dio_write(iocb, from); + if (ret != -ENOTBLK) + return ret; } -out: - return ret; + if (xfs_is_zoned_inode(ip)) + return xfs_file_buffered_write_zoned(iocb, from); + return xfs_file_buffered_write(iocb, from); } -STATIC long -xfs_file_fallocate( - struct file *file, - int mode, - loff_t offset, - loff_t len) +/* Does this file, inode, or mount want synchronous writes? 
*/ +static inline bool xfs_file_sync_writes(struct file *filp) { - struct inode *inode = file_inode(file); - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - int attr_flags = XFS_ATTR_NOLOCK; + struct xfs_inode *ip = XFS_I(file_inode(filp)); - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; + if (xfs_has_wsync(ip->i_mount)) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + +static int +xfs_falloc_newsize( + struct file *file, + int mode, + loff_t offset, + loff_t len, + loff_t *new_size) +{ + struct inode *inode = file_inode(file); + + if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) + return 0; + *new_size = offset + len; + return inode_newsize_ok(inode, *new_size); +} + +static int +xfs_falloc_setsize( + struct file *file, + loff_t new_size) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = new_size, + }; + + if (!new_size) + return 0; + return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), + &iattr); +} + +static int +xfs_falloc_collapse_range( + struct file *file, + loff_t offset, + loff_t len, + struct xfs_zone_alloc_ctx *ac) +{ + struct inode *inode = file_inode(file); + loff_t new_size = i_size_read(inode) - len; + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; + + /* + * There is no need to overlap collapse range with EOF, in which case it + * is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) + return -EINVAL; - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_insert_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t isize = i_size_read(inode); + int error; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. + */ + if (inode->i_sb->s_maxbytes - isize < len) + return -EFBIG; - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); + /* Offset should be less than i_size */ + if (offset >= isize) + return -EINVAL; + + error = xfs_falloc_setsize(file, isize + len); + if (error) + return error; + + /* + * Perform hole insertion now that the file size has been updated so + * that if we crash during the operation we don't leave shifted extents + * past EOF and hence losing access to the data that is contained within + * them. + */ + return xfs_insert_file_space(XFS_I(inode), offset, len); +} + +/* + * Punch a hole and prealloc the range. We use a hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by + * virtue of the hole punch. 
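The hole-punch-then-preallocate strategy described above corresponds to the userspace FALLOC_FL_PUNCH_HOLE mode, which the kernel requires to be paired with FALLOC_FL_KEEP_SIZE; the punched range reads back as zeroes whether or not a later preallocation succeeds, which is exactly the property point 2 relies on.

#define _GNU_SOURCE
#include <fcntl.h>

int punch_hole(int fd, off_t off, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 off, len);
}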
+ */ +static int +xfs_falloc_zero_range( + struct file *file, + int mode, + loff_t offset, + loff_t len, + struct xfs_zone_alloc_ctx *ac) +{ + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + unsigned int blksize = i_blocksize(inode); + loff_t new_size = 0; + int error; + + trace_xfs_zero_file_space(ip); + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + /* + * Zero range implements a full zeroing mechanism but is only used in + * limited situations. It is more efficient to allocate unwritten + * extents than to perform zeroing here, so use an errortag to randomly + * force zeroing on DEBUG kernels for added test coverage. + */ + if (XFS_TEST_ERROR(ip->i_mount, + XFS_ERRTAG_FORCE_ZERO_RANGE)) { + error = xfs_zero_range(ip, offset, len, ac, NULL); + } else { + error = xfs_free_file_space(ip, offset, len, ac); if (error) - goto out_unlock; + return error; + + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len); } + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_unshare_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; - if (file->f_flags & O_DSYNC) - attr_flags |= XFS_ATTR_SYNC; + error = xfs_reflink_unshare(XFS_I(inode), offset, len); + if (error) + return error; - error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_allocate_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + /* + * If always_cow mode we can't use preallocations and thus should not + * create them. + */ + if (xfs_is_always_cow_inode(XFS_I(inode))) + return -EOPNOTSUPP; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ + FALLOC_FL_UNSHARE_RANGE) + +STATIC long +__xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len, + struct xfs_zone_alloc_ctx *ac) +{ + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + long error; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; + /* + * Must wait for all AIO to complete before we continue as AIO can + * change the file size on completion without holding any locks we + * currently hold. We must do this first because AIO can update both + * the on disk and in memory inode sizes, and the operations that follow + * require the in-memory size to be fully up-to-date. 
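One of the modes dispatched just below, collapse range, carries the constraints enforced by xfs_falloc_collapse_range() earlier in this hunk: offset and length must be multiples of the file allocation unit and the range must end before EOF, otherwise the kernel returns EINVAL. Userspace sketch:

#define _GNU_SOURCE
#include <fcntl.h>

int collapse_range(int fd, off_t off, off_t len)
{
	/* removes [off, off+len) and shifts the tail of the file down */
	return fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, off, len);
}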
+ */ + inode_dio_wait(inode); - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK); + error = file_modified(file); + if (error) + goto out_unlock; + + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: + error = xfs_free_file_space(ip, offset, len, ac); + break; + case FALLOC_FL_COLLAPSE_RANGE: + error = xfs_falloc_collapse_range(file, offset, len, ac); + break; + case FALLOC_FL_INSERT_RANGE: + error = xfs_falloc_insert_range(file, offset, len); + break; + case FALLOC_FL_ZERO_RANGE: + error = xfs_falloc_zero_range(file, mode, offset, len, ac); + break; + case FALLOC_FL_UNSHARE_RANGE: + error = xfs_falloc_unshare_range(file, mode, offset, len); + break; + case FALLOC_FL_ALLOCATE_RANGE: + error = xfs_falloc_allocate_range(file, mode, offset, len); + break; + default: + error = -EOPNOTSUPP; + break; } + if (!error && xfs_file_sync_writes(file)) + error = xfs_log_force_inode(ip); + out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, iolock); + return error; +} + +static long +xfs_file_zoned_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct xfs_zone_alloc_ctx ac = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + int error; + + error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac); + if (error) + return error; + error = __xfs_file_fallocate(file, mode, offset, len, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); return error; } +static long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* + * For zoned file systems, zeroing the first and last block of a hole + * punch requires allocating a new block to rewrite the remaining data + * and new zeroes out of place. Get a reservations for those before + * taking the iolock. Dip into the reserved pool because we are + * expected to be able to punch a hole even on a completely full + * file system. 
+
+STATIC int
+xfs_file_fadvise(
+	struct file	*file,
+	loff_t		start,
+	loff_t		end,
+	int		advice)
+{
+	struct xfs_inode *ip = XFS_I(file_inode(file));
+	int		ret;
+	int		lockflags = 0;
+
+	/*
+	 * Operations creating pages in page cache need protection from hole
+	 * punching and similar ops.
+	 */
+	if (advice == POSIX_FADV_WILLNEED) {
+		lockflags = XFS_IOLOCK_SHARED;
+		xfs_ilock(ip, lockflags);
+	}
+	ret = generic_fadvise(file, start, end, advice);
+	if (lockflags)
+		xfs_iunlock(ip, lockflags);
+	return ret;
+}
+
+STATIC loff_t
+xfs_file_remap_range(
+	struct file		*file_in,
+	loff_t			pos_in,
+	struct file		*file_out,
+	loff_t			pos_out,
+	loff_t			len,
+	unsigned int		remap_flags)
+{
+	struct inode		*inode_in = file_inode(file_in);
+	struct xfs_inode	*src = XFS_I(inode_in);
+	struct inode		*inode_out = file_inode(file_out);
+	struct xfs_inode	*dest = XFS_I(inode_out);
+	struct xfs_mount	*mp = src->i_mount;
+	loff_t			remapped = 0;
+	xfs_extlen_t		cowextsize;
+	int			ret;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (!xfs_has_reflink(mp))
+		return -EOPNOTSUPP;
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	/* Prepare and then clone file data. */
+	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+			&len, remap_flags);
+	if (ret || len == 0)
+		return ret;
+
+	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
+			&remapped);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Carry the cowextsize hint from src to dest if we're sharing the
+	 * entire source file with the entire destination file, the source
+	 * file has a cowextsize hint, and the destination file does not.
+	 */
+	cowextsize = 0;
+	if (pos_in == 0 && len == i_size_read(inode_in) &&
+	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    pos_out == 0 && len >= i_size_read(inode_out) &&
+	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
+		cowextsize = src->i_cowextsize;
+
+	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+			remap_flags);
+	if (ret)
+		goto out_unlock;
+
+	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
+		xfs_log_force_inode(dest);
+out_unlock:
+	xfs_iunlock2_remapping(src, dest);
+	if (ret)
+		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+	/*
+	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
+	 * handle partial results -- either the whole remap succeeds, or we
+	 * must say why it did not.  In this case, any error should be
+	 * returned to the caller.
+	 */
+	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+		return ret;
+	return remapped > 0 ? remapped : ret;
+}
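
xfs_file_remap_range() is what backs the FICLONE/FICLONERANGE ioctls (and reflink-based copy_file_range()) on XFS. An illustrative userspace sketch, assuming a reflink-capable XFS mount; paths are hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int src = open("/mnt/xfs/src", O_RDONLY);
	int dst = open("/mnt/xfs/dst", O_RDWR | O_CREAT, 0644);
	struct file_clone_range fcr = {
		.src_fd = src,
		.src_offset = 0,
		.src_length = 0,	/* 0 means "clone to EOF" */
		.dest_offset = 0,
	};

	if (src < 0 || dst < 0)
		return 1;
	/* Share src's blocks with dst instead of copying them. */
	if (ioctl(dst, FICLONERANGE, &fcr))
		perror("FICLONERANGE");
	close(src);
	close(dst);
	return 0;
}
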
 
 STATIC int
 xfs_file_open(
 	struct inode	*inode,
 	struct file	*file)
 {
-	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
-		return -EFBIG;
-	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
 		return -EIO;
-	return 0;
+	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
+		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+	return generic_file_open(inode, file);
 }
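
Because xfs_file_open() now advertises FMODE_CAN_ATOMIC_WRITE when the inode supports untorn writes, userspace can issue RWF_ATOMIC writes. A hedged sketch, assuming a kernel and libc that expose RWF_ATOMIC and a file whose advertised atomic write limits (see statx() with STATX_WRITE_ATOMIC) cover a 4096-byte unit; the path and unit size are hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040	/* assumption: uapi value on kernels >= 6.11 */
#endif

int main(void)
{
	/* Hypothetical database block file; O_DIRECT needs aligned I/O. */
	int fd = open("/mnt/xfs/db-block", O_RDWR | O_DIRECT);
	static char buf[4096] __attribute__((aligned(4096)));
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };

	if (fd < 0)
		return 1;
	memset(buf, 0xab, sizeof(buf));
	/* All-or-nothing: the block is never torn across a power failure. */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
		perror("pwritev2(RWF_ATOMIC)");
	close(fd);
	return 0;
}
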
 
 STATIC int
@@ -878,10 +1580,12 @@ xfs_dir_open(
 	struct file	*file)
 {
 	struct xfs_inode *ip = XFS_I(inode);
-	int		mode;
+	unsigned int	mode;
 	int		error;
 
-	error = xfs_file_open(inode, file);
+	if (xfs_is_shutdown(ip->i_mount))
+		return -EIO;
+	error = generic_file_open(inode, file);
 	if (error)
 		return error;
 
@@ -889,19 +1593,92 @@ xfs_dir_open(
 	 * If there are any blocks, read-ahead block 0 as we're almost
 	 * certain to have the next operation be a read there.
 	 */
-	mode = xfs_ilock_map_shared(ip);
-	if (ip->i_d.di_nextents > 0)
-		xfs_dir3_data_readahead(NULL, ip, 0, -1);
+	mode = xfs_ilock_data_map_shared(ip);
+	if (ip->i_df.if_nextents > 0)
+		error = xfs_dir3_data_readahead(ip, 0, 0);
 	xfs_iunlock(ip, mode);
-	return 0;
+	return error;
 }
 
+/*
+ * Don't bother propagating errors.  We're just doing cleanup, and the caller
+ * ignores the return value anyway.
+ */
 STATIC int
 xfs_file_release(
-	struct inode	*inode,
-	struct file	*filp)
+	struct inode		*inode,
+	struct file		*file)
 {
-	return -xfs_release(XFS_I(inode));
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/*
+	 * If this is a read-only mount or the file system has been shut
+	 * down, don't generate I/O.
+	 */
+	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
+		return 0;
+
+	/*
+	 * If we previously truncated this file and removed old data in the
+	 * process, we want to initiate "early" writeout on the last close.
+	 * This is an attempt to combat the notorious NULL files problem
+	 * which is particularly noticeable from a truncate down, buffered
+	 * (re-)write (delalloc), followed by a crash.  What we are
+	 * effectively doing here is significantly reducing the time window
+	 * where we'd otherwise be exposed to that problem.
+	 */
+	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
+		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
+		if (ip->i_delayed_blks > 0)
+			filemap_flush(inode->i_mapping);
+	}
+
+	/*
+	 * XFS aggressively preallocates post-EOF space to generate contiguous
+	 * allocations for writers that append to the end of the file.
+	 *
+	 * To support workloads that close and reopen the file frequently,
+	 * these preallocations usually persist after a close unless it is
+	 * the first close for the inode.  This is a tradeoff to generate
+	 * tightly packed data layouts for unpacking tarballs or similar
+	 * archives that write one file after another without going back to
+	 * it while keeping the preallocation for files that have recurring
+	 * open/write/close cycles.
+	 *
+	 * This heuristic is skipped for inodes with the append-only flag as
+	 * that flag is rather pointless for inodes written only once.
+	 *
+	 * There is no point in freeing blocks here for open but unlinked
+	 * files as they will be taken care of by the inactivation path soon.
+	 *
+	 * When releasing a read-only context, don't flush data or trim
+	 * post-EOF blocks.  This prevents open/read/close workloads from
+	 * removing EOF blocks that other writers depend upon to reduce
+	 * fragmentation.
+	 *
+	 * Inodes on the zoned RT device never have preallocations, so skip
+	 * taking the locks below.
+	 */
+	if (!inode->i_nlink ||
+	    !(file->f_mode & FMODE_WRITE) ||
+	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
+	    xfs_is_zoned_inode(ip))
+		return 0;
+
+	/*
+	 * If we can't get the iolock just skip truncating the blocks past
+	 * EOF because we could deadlock with the mmap_lock otherwise.  We'll
+	 * get another chance to drop them once the last reference to the
+	 * inode is dropped, so we'll never leak blocks permanently.
+	 */
+	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+		if (xfs_can_free_eofblocks(ip) &&
+		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
+			xfs_free_eofblocks(ip);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	}
+
+	return 0;
 }
 
 STATIC int
@@ -911,7 +1688,6 @@ xfs_file_readdir(
 {
 	struct inode	*inode = file_inode(file);
 	xfs_inode_t	*ip = XFS_I(inode);
-	int		error;
 	size_t		bufsize;
 
 	/*
@@ -926,509 +1702,279 @@ xfs_file_readdir(
 	 * point we can change the ->readdir prototype to include the
 	 * buffer size.  For now we use the current glibc buffer size.
 	 */
-	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
 
-	error = xfs_readdir(ip, ctx, bufsize);
-	if (error)
-		return -error;
-	return 0;
+	return xfs_readdir(NULL, ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-	struct file	*filp,
-	struct vm_area_struct *vma)
+STATIC loff_t
+xfs_file_llseek(
+	struct file	*file,
+	loff_t		offset,
+	int		whence)
 {
-	vma->vm_ops = &xfs_file_vm_ops;
+	struct inode		*inode = file->f_mapping->host;
 
-	file_accessed(filp);
-	return 0;
+	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
+		return -EIO;
+
+	switch (whence) {
+	default:
+		return generic_file_llseek(file, offset, whence);
+	case SEEK_HOLE:
+		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
+		break;
+	case SEEK_DATA:
+		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
+		break;
+	}
+
+	if (offset < 0)
+		return offset;
+	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 }
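
With SEEK_DATA/SEEK_HOLE now answered through iomap_seek_data()/iomap_seek_hole(), a sparse file's layout can be walked from userspace with plain lseek(2). An illustrative sketch (path hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/xfs/sparse", O_RDONLY);
	off_t data, hole = 0;

	if (fd < 0)
		return 1;
	/* Alternate between the next data extent and the hole after it;
	 * lseek() fails with ENXIO once we run past EOF. */
	while ((data = lseek(fd, hole, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)data,
		       (long long)hole);
	}
	close(fd);
	return 0;
}
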
 
 /*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
-	struct vm_area_struct	*vma,
-	struct vm_fault		*vmf)
+static inline vm_fault_t
+xfs_dax_fault_locked(
+	struct vm_fault		*vmf,
+	unsigned int		order,
+	bool			write_fault)
 {
-	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
+	vm_fault_t		ret;
+	unsigned long		pfn;
 
-/*
- * This type is designed to indicate the type of offset we would like
- * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
- */
-enum {
-	HOLE_OFF = 0,
-	DATA_OFF,
-};
+	if (!IS_ENABLED(CONFIG_FS_DAX)) {
+		ASSERT(0);
+		return VM_FAULT_SIGBUS;
+	}
+	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
+			(write_fault && !vmf->cow_page) ?
+				&xfs_dax_write_iomap_ops :
+				&xfs_read_iomap_ops);
+	if (ret & VM_FAULT_NEEDDSYNC)
+		ret = dax_finish_sync_fault(vmf, order, pfn);
+	return ret;
+}
 
-/*
- * Lookup the desired type of offset from the given page.
- *
- * On success, return true and the offset argument will point to the
- * start of the region that was found.  Otherwise this function will
- * return false and keep the offset argument unchanged.
- */
-STATIC bool
-xfs_lookup_buffer_offset(
-	struct page		*page,
-	loff_t			*offset,
-	unsigned int		type)
+static vm_fault_t
+xfs_dax_read_fault(
+	struct vm_fault		*vmf,
+	unsigned int		order)
 {
-	loff_t			lastoff = page_offset(page);
-	bool			found = false;
-	struct buffer_head	*bh, *head;
+	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
+	vm_fault_t		ret;
 
-	bh = head = page_buffers(page);
-	do {
-		/*
-		 * Unwritten extents that have data in the page
-		 * cache covering them can be identified by the
-		 * BH_Unwritten state flag.  Pages with multiple
-		 * buffers might have a mix of holes, data and
-		 * unwritten extents - any buffer with valid
-		 * data in it should have BH_Uptodate flag set
-		 * on it.
-		 */
-		if (buffer_unwritten(bh) ||
-		    buffer_uptodate(bh)) {
-			if (type == DATA_OFF)
-				found = true;
-		} else {
-			if (type == HOLE_OFF)
-				found = true;
-		}
+	trace_xfs_read_fault(ip, order);
 
-		if (found) {
-			*offset = lastoff;
-			break;
-		}
-		lastoff += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	ret = xfs_dax_fault_locked(vmf, order, false);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 
-	return found;
+	return ret;
 }
 
 /*
- * This routine is called to find out and return a data or hole offset
- * from the page cache for unwritten extents according to the desired
- * type for xfs_seek_data() or xfs_seek_hole().
- *
- * The argument offset is used to tell where we start to search from the
- * page cache.  Map is used to figure out the end points of the range to
- * lookup pages.
- *
- * Return true if the desired type of offset was found, and the argument
- * offset is filled with that address.  Otherwise, return false and keep
- * offset unchanged.
+ * Locking for serialisation of IO during page faults.  This results in a
+ * lock ordering of:
+ *
+ * mmap_lock (MM)
+ *   sb_start_pagefault(vfs, freeze)
+ *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
 */
-STATIC bool
-xfs_find_get_desired_pgoff(
-	struct inode		*inode,
-	struct xfs_bmbt_irec	*map,
-	unsigned int		type,
-	loff_t			*offset)
+static vm_fault_t
+__xfs_write_fault(
+	struct vm_fault		*vmf,
+	unsigned int		order,
+	struct xfs_zone_alloc_ctx *ac)
 {
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	struct pagevec		pvec;
-	pgoff_t			index;
-	pgoff_t			end;
-	loff_t			endoff;
-	loff_t			startoff = *offset;
-	loff_t			lastoff = startoff;
-	bool			found = false;
-
-	pagevec_init(&pvec, 0);
-
-	index = startoff >> PAGE_CACHE_SHIFT;
-	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
-	end = endoff >> PAGE_CACHE_SHIFT;
-	do {
-		int		want;
-		unsigned	nr_pages;
-		unsigned int	i;
-
-		want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
-		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
-					  want);
-		/*
-		 * No page mapped into given range.  If we are searching holes
-		 * and if this is the first time we got into the loop, it means
-		 * that the given offset is landed in a hole, return it.
-		 *
-		 * If we have already stepped through some block buffers to find
-		 * holes but they all contains data.  In this case, the last
-		 * offset is already updated and pointed to the end of the last
-		 * mapped page, if it does not reach the endpoint to search,
-		 * that means there should be a hole between them.
-		 */
-		if (nr_pages == 0) {
-			/* Data search found nothing */
-			if (type == DATA_OFF)
-				break;
-
-			ASSERT(type == HOLE_OFF);
-			if (lastoff == startoff || lastoff < endoff) {
-				found = true;
-				*offset = lastoff;
-			}
-			break;
-		}
+	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
+	vm_fault_t		ret;
 
-		/*
-		 * At lease we found one page.  If this is the first time we
-		 * step into the loop, and if the first page index offset is
-		 * greater than the given search offset, a hole was found.
-		 */
-		if (type == HOLE_OFF && lastoff == startoff &&
-		    lastoff < page_offset(pvec.pages[0])) {
-			found = true;
-			break;
-		}
+	trace_xfs_write_fault(ip, order);
 
-		for (i = 0; i < nr_pages; i++) {
-			struct page	*page = pvec.pages[i];
-			loff_t		b_offset;
-
-			/*
-			 * At this point, the page may be truncated or
-			 * invalidated (changing page->mapping to NULL),
-			 * or even swizzled back from swapper_space to tmpfs
-			 * file mapping. However, page->index will not change
-			 * because we have a reference on the page.
-			 *
-			 * Searching done if the page index is out of range.
-			 * If the current offset is not reaches the end of
-			 * the specified search range, there should be a hole
-			 * between them.
-			 */
-			if (page->index > end) {
-				if (type == HOLE_OFF && lastoff < endoff) {
-					*offset = lastoff;
-					found = true;
-				}
-				goto out;
-			}
-
-			lock_page(page);
-			/*
-			 * Page truncated or invalidated(page->mapping == NULL).
-			 * We can freely skip it and proceed to check the next
-			 * page.
-			 */
-			if (unlikely(page->mapping != inode->i_mapping)) {
-				unlock_page(page);
-				continue;
-			}
-
-			if (!page_has_buffers(page)) {
-				unlock_page(page);
-				continue;
-			}
-
-			found = xfs_lookup_buffer_offset(page, &b_offset, type);
-			if (found) {
-				/*
-				 * The found offset may be less than the start
-				 * point to search if this is the first time to
-				 * come here.
-				 */
-				*offset = max_t(loff_t, startoff, b_offset);
-				unlock_page(page);
-				goto out;
-			}
-
-			/*
-			 * We either searching data but nothing was found, or
-			 * searching hole but found a data buffer.  In either
-			 * case, probably the next page contains the desired
-			 * things, update the last offset to it so.
-			 */
-			lastoff = page_offset(page) + PAGE_SIZE;
-			unlock_page(page);
-		}
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vmf->vma->vm_file);
 
-		/*
-		 * The number of returned pages less than our desired, search
-		 * done.  In this case, nothing was found for searching data,
-		 * but we found a hole behind the last offset.
-		 */
-		if (nr_pages < want) {
-			if (type == HOLE_OFF) {
-				*offset = lastoff;
-				found = true;
-			}
-			break;
-		}
+	/*
+	 * Normally we only need the shared mmaplock, but if a reflink remap
+	 * is in progress we take the exclusive lock to wait for the remap to
+	 * finish before taking a write fault.
+	 */
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+		lock_mode = XFS_MMAPLOCK_EXCL;
+	}
 
-		index = pvec.pages[i - 1]->index + 1;
-		pagevec_release(&pvec);
-	} while (index <= end);
+	if (IS_DAX(inode))
+		ret = xfs_dax_fault_locked(vmf, order, true);
+	else
+		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+				ac);
+	xfs_iunlock(ip, lock_mode);
 
-out:
-	pagevec_release(&pvec);
-	return found;
+	sb_end_pagefault(inode->i_sb);
+	return ret;
 }
 
-STATIC loff_t
-xfs_seek_data(
-	struct file		*file,
-	loff_t			start)
+static vm_fault_t
+xfs_write_fault_zoned(
+	struct vm_fault		*vmf,
+	unsigned int		order)
 {
-	struct inode		*inode = file->f_mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	loff_t			uninitialized_var(offset);
-	xfs_fsize_t		isize;
-	xfs_fileoff_t		fsbno;
-	xfs_filblks_t		end;
-	uint			lock;
+	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
+	unsigned int		len = folio_size(page_folio(vmf->page));
+	struct xfs_zone_alloc_ctx ac = { };
 	int			error;
-
-	lock = xfs_ilock_map_shared(ip);
-
-	isize = i_size_read(inode);
-	if (start >= isize) {
-		error = ENXIO;
-		goto out_unlock;
-	}
+	vm_fault_t		ret;
 
 	/*
-	 * Try to read extents from the first block indicated
-	 * by fsbno to the end block of the file.
+	 * This could over-allocate as it doesn't check for truncation.
+	 *
+	 * But as the overallocation is limited to less than a folio and will
+	 * be released instantly, that's just fine.
 	 */
-	fsbno = XFS_B_TO_FSBT(mp, start);
-	end = XFS_B_TO_FSB(mp, isize);
-	for (;;) {
-		struct xfs_bmbt_irec	map[2];
-		int			nmap = 2;
-		unsigned int		i;
-
-		error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
-				       XFS_BMAPI_ENTIRE);
-		if (error)
-			goto out_unlock;
-
-		/* No extents at given offset, must be beyond EOF */
-		if (nmap == 0) {
-			error = ENXIO;
-			goto out_unlock;
-		}
-
-		for (i = 0; i < nmap; i++) {
-			offset = max_t(loff_t, start,
-				       XFS_FSB_TO_B(mp, map[i].br_startoff));
-
-			/* Landed in a data extent */
-			if (map[i].br_startblock == DELAYSTARTBLOCK ||
-			    (map[i].br_state == XFS_EXT_NORM &&
-			     !isnullstartblock(map[i].br_startblock)))
-				goto out;
-
-			/*
-			 * Landed in an unwritten extent, try to search data
-			 * from page cache.
-			 */
-			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
-				if (xfs_find_get_desired_pgoff(inode, &map[i],
-							DATA_OFF, &offset))
-					goto out;
-			}
-		}
-
-		/*
-		 * map[0] is hole or its an unwritten extent but
-		 * without data in page cache.  Probably means that
-		 * we are reading after EOF if nothing in map[1].
-		 */
-		if (nmap == 1) {
-			error = ENXIO;
-			goto out_unlock;
-		}
-
-		ASSERT(i > 1);
-
-		/*
-		 * Nothing was found, proceed to the next round of search
-		 * if reading offset not beyond or hit EOF.
-		 */
-		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
-		start = XFS_FSB_TO_B(mp, fsbno);
-		if (start >= isize) {
-			error = ENXIO;
-			goto out_unlock;
-		}
-	}
-
-out:
-	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
-	xfs_iunlock_map_shared(ip, lock);
-
-	if (error)
-		return -error;
-	return offset;
+	error = xfs_zoned_space_reserve(ip->i_mount,
+			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
+	if (error < 0)
+		return vmf_fs_error(error);
+	ret = __xfs_write_fault(vmf, order, &ac);
+	xfs_zoned_space_unreserve(ip->i_mount, &ac);
+	return ret;
 }
 
-STATIC loff_t
-xfs_seek_hole(
-	struct file		*file,
-	loff_t			start)
+static vm_fault_t
+xfs_write_fault(
+	struct vm_fault		*vmf,
+	unsigned int		order)
 {
-	struct inode		*inode = file->f_mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	loff_t			uninitialized_var(offset);
-	xfs_fsize_t		isize;
-	xfs_fileoff_t		fsbno;
-	xfs_filblks_t		end;
-	uint			lock;
-	int			error;
+	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
+		return xfs_write_fault_zoned(vmf, order);
+	return __xfs_write_fault(vmf, order, NULL);
+}
 
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+static inline bool
+xfs_is_write_fault(
+	struct vm_fault		*vmf)
+{
+	return (vmf->flags & FAULT_FLAG_WRITE) &&
+	       (vmf->vma->vm_flags & VM_SHARED);
+}
 
-	lock = xfs_ilock_map_shared(ip);
+static vm_fault_t
+xfs_filemap_fault(
+	struct vm_fault		*vmf)
+{
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 
-	isize = i_size_read(inode);
-	if (start >= isize) {
-		error = ENXIO;
-		goto out_unlock;
+	/* DAX can shortcut the normal fault path on write faults! */
+	if (IS_DAX(inode)) {
+		if (xfs_is_write_fault(vmf))
+			return xfs_write_fault(vmf, 0);
+		return xfs_dax_read_fault(vmf, 0);
 	}
 
-	fsbno = XFS_B_TO_FSBT(mp, start);
-	end = XFS_B_TO_FSB(mp, isize);
-
-	for (;;) {
-		struct xfs_bmbt_irec	map[2];
-		int			nmap = 2;
-		unsigned int		i;
+	trace_xfs_read_fault(XFS_I(inode), 0);
+	return filemap_fault(vmf);
+}
 
-		error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
-				       XFS_BMAPI_ENTIRE);
-		if (error)
-			goto out_unlock;
+static vm_fault_t
+xfs_filemap_huge_fault(
+	struct vm_fault		*vmf,
+	unsigned int		order)
+{
+	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
+		return VM_FAULT_FALLBACK;
 
-		/* No extents at given offset, must be beyond EOF */
-		if (nmap == 0) {
-			error = ENXIO;
-			goto out_unlock;
-		}
+	/* DAX can shortcut the normal fault path on write faults! */
+	if (xfs_is_write_fault(vmf))
+		return xfs_write_fault(vmf, order);
+	return xfs_dax_read_fault(vmf, order);
+}
 
-		for (i = 0; i < nmap; i++) {
-			offset = max_t(loff_t, start,
-				       XFS_FSB_TO_B(mp, map[i].br_startoff));
-
-			/* Landed in a hole */
-			if (map[i].br_startblock == HOLESTARTBLOCK)
-				goto out;
-
-			/*
-			 * Landed in an unwritten extent, try to search hole
-			 * from page cache.
-			 */
-			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
-				if (xfs_find_get_desired_pgoff(inode, &map[i],
-							HOLE_OFF, &offset))
-					goto out;
-			}
-		}
+static vm_fault_t
+xfs_filemap_page_mkwrite(
+	struct vm_fault		*vmf)
+{
+	return xfs_write_fault(vmf, 0);
+}
 
-		/*
-		 * map[0] contains data or its unwritten but contains
-		 * data in page cache, probably means that we are
-		 * reading after EOF.  We should fix offset to point
-		 * to the end of the file(i.e., there is an implicit
-		 * hole at the end of any file).
-		 */
-		if (nmap == 1) {
-			offset = isize;
-			break;
-		}
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults.
+ * In reality, it needs to serialise against truncate and prepare memory
+ * for writing, so handle it as a standard write fault.
+ */
+static vm_fault_t
+xfs_filemap_pfn_mkwrite(
+	struct vm_fault		*vmf)
+{
+	return xfs_write_fault(vmf, 0);
+}
 
-		ASSERT(i > 1);
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.huge_fault	= xfs_filemap_huge_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
+};
 
-		/*
-		 * Both mappings contains data, proceed to the next round of
-		 * search if the current reading offset not beyond or hit EOF.
-		 */
-		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
-		start = XFS_FSB_TO_B(mp, fsbno);
-		if (start >= isize) {
-			offset = isize;
-			break;
-		}
-	}
+STATIC int
+xfs_file_mmap_prepare(
+	struct vm_area_desc	*desc)
+{
+	struct file		*file = desc->file;
+	struct inode		*inode = file_inode(file);
+	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
 
-out:
 	/*
-	 * At this point, we must have found a hole.  However, the returned
-	 * offset may be bigger than the file size as it may be aligned to
-	 * page boundary for unwritten extents, we need to deal with this
-	 * situation in particular.
+	 * We don't support synchronous mappings for non-DAX files, nor for
+	 * DAX files if the underlying dax_device is not synchronous.
 	 */
-	offset = min_t(loff_t, offset, isize);
-	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
-	xfs_iunlock_map_shared(ip, lock);
-
-	if (error)
-		return -error;
-	return offset;
-}
+	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
+			target->bt_daxdev))
+		return -EOPNOTSUPP;
 
-STATIC loff_t
-xfs_file_llseek(
-	struct file	*file,
-	loff_t		offset,
-	int		origin)
-{
-	switch (origin) {
-	case SEEK_END:
-	case SEEK_CUR:
-	case SEEK_SET:
-		return generic_file_llseek(file, offset, origin);
-	case SEEK_DATA:
-		return xfs_seek_data(file, offset);
-	case SEEK_HOLE:
-		return xfs_seek_hole(file, offset);
-	default:
-		return -EINVAL;
-	}
+	file_accessed(file);
+	desc->vm_ops = &xfs_file_vm_ops;
+	if (IS_DAX(inode))
+		desc->vm_flags |= VM_HUGEPAGE;
+	return 0;
 }
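
The vm_ops installed above are driven by ordinary faults on a shared mapping: the first store to a clean page takes ->fault and then ->page_mkwrite, which ends up in xfs_write_fault(). An illustrative userspace sketch (path hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/xfs/mapped", O_RDWR);
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'x';			/* read fault, then write fault (page_mkwrite) */
	msync(p, 4096, MS_SYNC);	/* write back the dirtied page */
	munmap(p, 4096);
	close(fd);
	return 0;
}
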
 
 const struct file_operations xfs_file_operations = {
 	.llseek		= xfs_file_llseek,
-	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.aio_read	= xfs_file_aio_read,
-	.aio_write	= xfs_file_aio_write,
+	.read_iter	= xfs_file_read_iter,
+	.write_iter	= xfs_file_write_iter,
 	.splice_read	= xfs_file_splice_read,
-	.splice_write	= xfs_file_splice_write,
+	.splice_write	= iter_file_splice_write,
+	.iopoll		= iocb_bio_iopoll,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
-	.mmap		= xfs_file_mmap,
+	.mmap_prepare	= xfs_file_mmap_prepare,
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
+	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
+	.fadvise	= xfs_file_fadvise,
+	.remap_file_range = xfs_file_remap_range,
+	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+			  FOP_DONTCACHE,
 };
 
 const struct file_operations xfs_dir_file_operations = {
 	.open		= xfs_dir_open,
 	.read		= generic_read_dir,
-	.iterate	= xfs_file_readdir,
+	.iterate_shared	= xfs_file_readdir,
 	.llseek		= generic_file_llseek,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1436,9 +1982,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
 	.fsync		= xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= filemap_fault,
-	.page_mkwrite	= xfs_vm_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
-};
