Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r-- | fs/xfs/xfs_file.c | 828
1 files changed, 472 insertions, 356 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..9a435b1ff264 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -24,6 +24,7 @@
 #include "xfs_pnfs.h"
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
+#include "xfs_file.h"
 
 #include <linux/dax.h>
 #include <linux/falloc.h>
@@ -38,33 +39,19 @@ static const struct vm_operations_struct xfs_file_vm_ops;
  * Decide if the given file range is aligned to the size of the fundamental
  * allocation unit for the file.
  */
-static bool
+bool
 xfs_is_falloc_aligned(
 	struct xfs_inode	*ip,
 	loff_t			pos,
 	long long int		len)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	uint64_t		mask;
-
-	if (XFS_IS_REALTIME_INODE(ip)) {
-		if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
-			u64	rextbytes;
-			u32	mod;
-
-			rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
-			div_u64_rem(pos, rextbytes, &mod);
-			if (mod)
-				return false;
-			div_u64_rem(len, rextbytes, &mod);
-			return mod == 0;
-		}
-		mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
-	} else {
-		mask = mp->m_sb.sb_blocksize - 1;
-	}
+	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
+
+	if (!is_power_of_2(alloc_unit))
+		return isaligned_64(pos, alloc_unit) &&
+		       isaligned_64(len, alloc_unit);
 
-	return !((pos | len) & mask);
+	return !((pos | len) & (alloc_unit - 1));
 }
 
 /*
@@ -226,29 +213,18 @@ xfs_ilock_iocb_for_write(
 	if (ret)
 		return ret;
 
-	if (*lock_mode == XFS_IOLOCK_EXCL)
-		return 0;
-	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
-		return 0;
-
-	xfs_iunlock(ip, *lock_mode);
-	*lock_mode = XFS_IOLOCK_EXCL;
-	return xfs_ilock_iocb(iocb, *lock_mode);
-}
-
-static unsigned int
-xfs_ilock_for_write_fault(
-	struct xfs_inode	*ip)
-{
-	/* get a shared lock if no remapping in progress */
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
-		return XFS_MMAPLOCK_SHARED;
+	/*
+	 * If a reflink remap is in progress we always need to take the iolock
+	 * exclusively to wait for it to finish.
+	 */
+	if (*lock_mode == XFS_IOLOCK_SHARED &&
+	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
+		xfs_iunlock(ip, *lock_mode);
+		*lock_mode = XFS_IOLOCK_EXCL;
+		return xfs_ilock_iocb(iocb, *lock_mode);
+	}
 
-	/* wait for remapping to complete */
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	return XFS_MMAPLOCK_EXCL;
+	return 0;
 }
 
 STATIC ssize_t
@@ -372,9 +348,82 @@ xfs_file_splice_read(
 }
 
 /*
+ * Take care of zeroing post-EOF blocks when they might exist.
+ *
+ * Returns 0 if successfully, a negative error for a failure, or 1 if this
+ * function dropped the iolock and reacquired it exclusively and the caller
+ * needs to restart the write sanity checks.
+ */
+static ssize_t
+xfs_file_write_zero_eof(
+	struct kiocb		*iocb,
+	struct iov_iter		*from,
+	unsigned int		*iolock,
+	size_t			count,
+	bool			*drained_dio)
+{
+	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
+	loff_t			isize;
+	int			error;
+
+	/*
+	 * We need to serialise against EOF updates that occur in IO completions
+	 * here. We want to make sure that nobody is changing the size while
+	 * we do this check until we have placed an IO barrier (i.e. hold
+	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+	 * spinlock effectively forms a memory barrier once we have
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+	 * hence be able to correctly determine if we need to run zeroing.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	isize = i_size_read(VFS_I(ip));
+	if (iocb->ki_pos <= isize) {
+		spin_unlock(&ip->i_flags_lock);
+		return 0;
+	}
+	spin_unlock(&ip->i_flags_lock);
+
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EAGAIN;
+
+	if (!*drained_dio) {
+		/*
+		 * If zeroing is needed and we are currently holding the iolock
+		 * shared, we need to update it to exclusive which implies
+		 * having to redo all checks before.
+		 */
+		if (*iolock == XFS_IOLOCK_SHARED) {
+			xfs_iunlock(ip, *iolock);
+			*iolock = XFS_IOLOCK_EXCL;
+			xfs_ilock(ip, *iolock);
+			iov_iter_reexpand(from, count);
+		}
+
+		/*
+		 * We now have an IO submission barrier in place, but AIO can do
+		 * EOF updates during IO completion and hence we now need to
+		 * wait for all of them to drain. Non-AIO DIO will have drained
+		 * before we are given the XFS_IOLOCK_EXCL, and so for most
+		 * cases this wait is a no-op.
+		 */
+		inode_dio_wait(VFS_I(ip));
+		*drained_dio = true;
+		return 1;
+	}
+
+	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+
+	return error;
+}
+
+/*
  * Common pre-write limit and setup checks.
  *
- * Called with the iolocked held either shared and exclusive according to
+ * Called with the iolock held either shared and exclusive according to
  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
  * if called for a direct write beyond i_size.
  */
@@ -384,13 +433,10 @@ xfs_file_write_checks(
 	struct iov_iter		*from,
 	unsigned int		*iolock)
 {
-	struct file		*file = iocb->ki_filp;
-	struct inode		*inode = file->f_mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	ssize_t			error = 0;
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	size_t			count = iov_iter_count(from);
 	bool			drained_dio = false;
-	loff_t			isize;
+	ssize_t			error;
 
 restart:
 	error = generic_write_checks(iocb, from);
@@ -413,7 +459,7 @@ restart:
 	 * exclusively.
 	 */
 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
-		xfs_iunlock(ip, *iolock);
+		xfs_iunlock(XFS_I(inode), *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
 		error = xfs_ilock_iocb(iocb, *iolock);
 		if (error) {
@@ -424,64 +470,24 @@ restart:
 	}
 
 	/*
-	 * If the offset is beyond the size of the file, we need to zero any
+	 * If the offset is beyond the size of the file, we need to zero all
 	 * blocks that fall between the existing EOF and the start of this
-	 * write.  If zeroing is needed and we are currently holding the iolock
-	 * shared, we need to update it to exclusive which implies having to
-	 * redo all checks before.
+	 * write.
 	 *
-	 * We need to serialise against EOF updates that occur in IO completions
-	 * here. We want to make sure that nobody is changing the size while we
-	 * do this check until we have placed an IO barrier (i.e. hold the
-	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
-	 * spinlock effectively forms a memory barrier once we have the
-	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
-	 * hence be able to correctly determine if we need to run zeroing.
-	 *
-	 * We can do an unlocked check here safely as IO completion can only
-	 * extend EOF.  Truncate is locked out at this point, so the EOF can
-	 * not move backwards, only forwards. Hence we only need to take
-	 * the slow path and spin locks when we are at or beyond the current EOF.
+	 * We can do an unlocked check for i_size here safely as I/O completion
+	 * can only extend EOF.  Truncate is locked out at this point, so the
+	 * EOF can not move backwards, only forwards. Hence we only need to take
+	 * the slow path when we are at or beyond the current EOF.
 	 */
-	if (iocb->ki_pos <= i_size_read(inode))
-		goto out;
-
-	spin_lock(&ip->i_flags_lock);
-	isize = i_size_read(inode);
-	if (iocb->ki_pos > isize) {
-		spin_unlock(&ip->i_flags_lock);
-
-		if (iocb->ki_flags & IOCB_NOWAIT)
-			return -EAGAIN;
-
-		if (!drained_dio) {
-			if (*iolock == XFS_IOLOCK_SHARED) {
-				xfs_iunlock(ip, *iolock);
-				*iolock = XFS_IOLOCK_EXCL;
-				xfs_ilock(ip, *iolock);
-				iov_iter_reexpand(from, count);
-			}
-			/*
-			 * We now have an IO submission barrier in place, but
-			 * AIO can do EOF updates during IO completion and hence
-			 * we now need to wait for all of them to drain. Non-AIO
-			 * DIO will have drained before we are given the
-			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
-			 * no-op.
-			 */
-			inode_dio_wait(inode);
-			drained_dio = true;
+	if (iocb->ki_pos > i_size_read(inode)) {
+		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
+				&drained_dio);
+		if (error == 1)
 			goto restart;
-		}
-
-		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
-		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
 		if (error)
 			return error;
-	} else
-		spin_unlock(&ip->i_flags_lock);
+	}
 
-out:
 	return kiocb_modified(iocb);
 }
 
@@ -784,7 +790,7 @@ write_retry:
 	trace_xfs_file_buffered_write(iocb, from);
 	ret = iomap_file_buffered_write(iocb, from,
-			&xfs_buffered_write_iomap_ops);
+			&xfs_buffered_write_iomap_ops, NULL);
 
 	/*
 	 * If we hit a space limit, try to free up some lingering preallocated
@@ -846,6 +852,20 @@ xfs_file_write_iter(
 	if (IS_DAX(inode))
 		return xfs_file_dax_write(iocb, from);
 
+	if (iocb->ki_flags & IOCB_ATOMIC) {
+		/*
+		 * Currently only atomic writing of a single FS block is
+		 * supported. It would be possible to atomic write smaller than
+		 * a FS block, but there is no requirement to support this.
+		 * Note that iomap also does not support this yet.
+		 */
+		if (ocount != ip->i_mount->m_sb.sb_blocksize)
+			return -EINVAL;
+		ret = generic_atomic_write_valid(iocb, from);
+		if (ret)
+			return ret;
+	}
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		/*
 		 * Allow a directio write to fall back to a buffered
@@ -861,80 +881,205 @@ xfs_file_write_iter(
 	return xfs_file_buffered_write(iocb, from);
 }
 
-static void
-xfs_wait_dax_page(
-	struct inode		*inode)
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
 {
-	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_inode	*ip = XFS_I(file_inode(filp));
 
-	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
-	schedule();
-	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	if (xfs_has_wsync(ip->i_mount))
+		return true;
+	if (filp->f_flags & (__O_SYNC | O_DSYNC))
+		return true;
+	if (IS_SYNC(file_inode(filp)))
+		return true;
+
+	return false;
 }
 
-int
-xfs_break_dax_layouts(
-	struct inode		*inode,
-	bool			*retry)
+static int
+xfs_falloc_newsize(
	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len,
+	loff_t			*new_size)
 {
-	struct page		*page;
+	struct inode		*inode = file_inode(file);
 
-	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
+		return 0;
+	*new_size = offset + len;
+	return inode_newsize_ok(inode, *new_size);
+}
+
+static int
+xfs_falloc_setsize(
+	struct file		*file,
+	loff_t			new_size)
+{
+	struct iattr iattr = {
+		.ia_valid	= ATTR_SIZE,
+		.ia_size	= new_size,
+	};
 
-	page = dax_layout_busy_page(inode->i_mapping);
-	if (!page)
+	if (!new_size)
 		return 0;
+	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
+			&iattr);
+}
+
+static int
+xfs_falloc_collapse_range(
+	struct file		*file,
+	loff_t			offset,
+	loff_t			len)
+{
+	struct inode		*inode = file_inode(file);
+	loff_t			new_size = i_size_read(inode) - len;
+	int			error;
+
+	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+		return -EINVAL;
+
+	/*
+	 * There is no need to overlap collapse range with EOF, in which case it
+	 * is effectively a truncate operation
+	 */
+	if (offset + len >= i_size_read(inode))
+		return -EINVAL;
 
-	*retry = true;
-	return ___wait_var_event(&page->_refcount,
-			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-			0, 0, xfs_wait_dax_page(inode));
+	error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+	if (error)
+		return error;
+	return xfs_falloc_setsize(file, new_size);
 }
 
-int
-xfs_break_layouts(
-	struct inode		*inode,
-	uint			*iolock,
-	enum layout_break_reason reason)
+static int
+xfs_falloc_insert_range(
+	struct file		*file,
+	loff_t			offset,
+	loff_t			len)
 {
-	bool			retry;
+	struct inode		*inode = file_inode(file);
+	loff_t			isize = i_size_read(inode);
 	int			error;
 
-	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
-
-	do {
-		retry = false;
-		switch (reason) {
-		case BREAK_UNMAP:
-			error = xfs_break_dax_layouts(inode, &retry);
-			if (error || retry)
-				break;
-			fallthrough;
-		case BREAK_WRITE:
-			error = xfs_break_leased_layouts(inode, iolock, &retry);
-			break;
-		default:
-			WARN_ON_ONCE(1);
-			error = -EINVAL;
-		}
-	} while (error == 0 && retry);
+	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+		return -EINVAL;
 
-	return error;
+	/*
+	 * New inode size must not exceed ->s_maxbytes, accounting for
+	 * possible signed overflow.
+	 */
+	if (inode->i_sb->s_maxbytes - isize < len)
+		return -EFBIG;
+
+	/* Offset should be less than i_size */
+	if (offset >= isize)
+		return -EINVAL;
+
+	error = xfs_falloc_setsize(file, isize + len);
+	if (error)
+		return error;
+
+	/*
+	 * Perform hole insertion now that the file size has been updated so
+	 * that if we crash during the operation we don't leave shifted extents
+	 * past EOF and hence losing access to the data that is contained within
+	 * them.
+	 */
+	return xfs_insert_file_space(XFS_I(inode), offset, len);
 }
 
-/* Does this file, inode, or mount want synchronous writes? */
-static inline bool xfs_file_sync_writes(struct file *filp)
+/*
+ * Punch a hole and prealloc the range.  We use a hole punch rather than
+ * unwritten extent conversion for two reasons:
+ *
+ *   1.) Hole punch handles partial block zeroing for us.
+ *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
+ *       virtue of the hole punch.
+ */
+static int
+xfs_falloc_zero_range(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len)
 {
-	struct xfs_inode	*ip = XFS_I(file_inode(filp));
+	struct inode		*inode = file_inode(file);
+	unsigned int		blksize = i_blocksize(inode);
+	loff_t			new_size = 0;
+	int			error;
 
-	if (xfs_has_wsync(ip->i_mount))
-		return true;
-	if (filp->f_flags & (__O_SYNC | O_DSYNC))
-		return true;
-	if (IS_SYNC(file_inode(filp)))
-		return true;
+	trace_xfs_zero_file_space(XFS_I(inode));
 
-	return false;
+	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+	if (error)
+		return error;
+
+	error = xfs_free_file_space(XFS_I(inode), offset, len);
+	if (error)
+		return error;
+
+	len = round_up(offset + len, blksize) - round_down(offset, blksize);
+	offset = round_down(offset, blksize);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (error)
+		return error;
+	return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_unshare_range(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len)
+{
+	struct inode		*inode = file_inode(file);
+	loff_t			new_size = 0;
+	int			error;
+
+	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+	if (error)
+		return error;
+
+	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
+	if (error)
+		return error;
+
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (error)
+		return error;
+	return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_allocate_range(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len)
+{
+	struct inode		*inode = file_inode(file);
+	loff_t			new_size = 0;
+	int			error;
+
+	/*
+	 * If always_cow mode we can't use preallocations and thus should not
+	 * create them.
+	 */
+	if (xfs_is_always_cow_inode(XFS_I(inode)))
+		return -EOPNOTSUPP;
+
+	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+	if (error)
+		return error;
+
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (error)
+		return error;
+	return xfs_falloc_setsize(file, new_size);
 }
 
 #define	XFS_FALLOC_FL_SUPPORTED						\
@@ -953,8 +1098,6 @@ xfs_file_fallocate(
 	struct xfs_inode	*ip = XFS_I(inode);
 	long			error;
 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	loff_t			new_size = 0;
-	bool			do_file_insert = false;
 
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
@@ -975,156 +1118,35 @@ xfs_file_fallocate(
 	 */
 	inode_dio_wait(inode);
 
-	/*
-	 * Now AIO and DIO has drained we flush and (if necessary) invalidate
-	 * the cached range over the first operation we are about to run.
-	 *
-	 * We care about zero and collapse here because they both run a hole
-	 * punch over the range first. Because that can zero data, and the range
-	 * of invalidation for the shift operations is much larger, we still do
-	 * the required flush for collapse in xfs_prepare_shift().
-	 *
-	 * Insert has the same range requirements as collapse, and we extend the
-	 * file first which can zero data. Hence insert has the same
-	 * flush/invalidate requirements as collapse and so they are both
-	 * handled at the right time by xfs_prepare_shift().
-	 */
-	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
-		    FALLOC_FL_COLLAPSE_RANGE)) {
-		error = xfs_flush_unmap_range(ip, offset, len);
-		if (error)
-			goto out_unlock;
-	}
-
 	error = file_modified(file);
 	if (error)
 		goto out_unlock;
 
-	if (mode & FALLOC_FL_PUNCH_HOLE) {
+	switch (mode & FALLOC_FL_MODE_MASK) {
+	case FALLOC_FL_PUNCH_HOLE:
 		error = xfs_free_file_space(ip, offset, len);
-		if (error)
-			goto out_unlock;
-	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
-		if (!xfs_is_falloc_aligned(ip, offset, len)) {
-			error = -EINVAL;
-			goto out_unlock;
-		}
-
-		/*
-		 * There is no need to overlap collapse range with EOF,
-		 * in which case it is effectively a truncate operation
-		 */
-		if (offset + len >= i_size_read(inode)) {
-			error = -EINVAL;
-			goto out_unlock;
-		}
-
-		new_size = i_size_read(inode) - len;
-
-		error = xfs_collapse_file_space(ip, offset, len);
-		if (error)
-			goto out_unlock;
-	} else if (mode & FALLOC_FL_INSERT_RANGE) {
-		loff_t		isize = i_size_read(inode);
-
-		if (!xfs_is_falloc_aligned(ip, offset, len)) {
-			error = -EINVAL;
-			goto out_unlock;
-		}
-
-		/*
-		 * New inode size must not exceed ->s_maxbytes, accounting for
-		 * possible signed overflow.
-		 */
-		if (inode->i_sb->s_maxbytes - isize < len) {
-			error = -EFBIG;
-			goto out_unlock;
-		}
-		new_size = isize + len;
-
-		/* Offset should be less than i_size */
-		if (offset >= isize) {
-			error = -EINVAL;
-			goto out_unlock;
-		}
-		do_file_insert = true;
-	} else {
-		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-		    offset + len > i_size_read(inode)) {
-			new_size = offset + len;
-			error = inode_newsize_ok(inode, new_size);
-			if (error)
-				goto out_unlock;
-		}
-
-		if (mode & FALLOC_FL_ZERO_RANGE) {
-			/*
-			 * Punch a hole and prealloc the range. We use a hole
-			 * punch rather than unwritten extent conversion for two
-			 * reasons:
-			 *
-			 *   1.) Hole punch handles partial block zeroing for us.
-			 *   2.) If prealloc returns ENOSPC, the file range is
-			 *       still zero-valued by virtue of the hole punch.
-			 */
-			unsigned int blksize = i_blocksize(inode);
-
-			trace_xfs_zero_file_space(ip);
-
-			error = xfs_free_file_space(ip, offset, len);
-			if (error)
-				goto out_unlock;
-
-			len = round_up(offset + len, blksize) -
-			      round_down(offset, blksize);
-			offset = round_down(offset, blksize);
-		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
-			error = xfs_reflink_unshare(ip, offset, len);
-			if (error)
-				goto out_unlock;
-		} else {
-			/*
-			 * If always_cow mode we can't use preallocations and
-			 * thus should not create them.
-			 */
-			if (xfs_is_always_cow_inode(ip)) {
-				error = -EOPNOTSUPP;
-				goto out_unlock;
-			}
-		}
-
-		if (!xfs_is_always_cow_inode(ip)) {
-			error = xfs_alloc_file_space(ip, offset, len);
-			if (error)
-				goto out_unlock;
-		}
-	}
-
-	/* Change file size if needed */
-	if (new_size) {
-		struct iattr iattr;
-
-		iattr.ia_valid = ATTR_SIZE;
-		iattr.ia_size = new_size;
-		error = xfs_vn_setattr_size(file_mnt_idmap(file),
-					    file_dentry(file), &iattr);
-		if (error)
-			goto out_unlock;
-	}
-
-	/*
-	 * Perform hole insertion now that the file size has been
-	 * updated so that if we crash during the operation we don't
-	 * leave shifted extents past EOF and hence losing access to
-	 * the data that is contained within them.
-	 */
-	if (do_file_insert) {
-		error = xfs_insert_file_space(ip, offset, len);
-		if (error)
-			goto out_unlock;
+		break;
+	case FALLOC_FL_COLLAPSE_RANGE:
+		error = xfs_falloc_collapse_range(file, offset, len);
+		break;
+	case FALLOC_FL_INSERT_RANGE:
+		error = xfs_falloc_insert_range(file, offset, len);
+		break;
+	case FALLOC_FL_ZERO_RANGE:
+		error = xfs_falloc_zero_range(file, mode, offset, len);
+		break;
+	case FALLOC_FL_UNSHARE_RANGE:
+		error = xfs_falloc_unshare_range(file, mode, offset, len);
+		break;
+	case FALLOC_FL_ALLOCATE_RANGE:
+		error = xfs_falloc_allocate_range(file, mode, offset, len);
+		break;
+	default:
+		error = -EOPNOTSUPP;
+		break;
 	}
 
-	if (xfs_file_sync_writes(file))
+	if (!error && xfs_file_sync_writes(file))
 		error = xfs_log_force_inode(ip);
 
 out_unlock:
@@ -1220,6 +1242,14 @@ out_unlock:
 	xfs_iunlock2_remapping(src, dest);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+	/*
+	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
+	 * handle partial results -- either the whole remap succeeds, or we
+	 * must say why it did not. In this case, any error should be returned
+	 * to the caller.
+	 */
+	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+		return ret;
 	return remapped > 0 ? remapped : ret;
 }
 
@@ -1230,8 +1260,9 @@ xfs_file_open(
 {
 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
 		return -EIO;
-	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
-		FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
+	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+	if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
 	return generic_file_open(inode, file);
 }
 
@@ -1244,7 +1275,9 @@ xfs_dir_open(
 	unsigned int	mode;
 	int		error;
 
-	error = xfs_file_open(inode, file);
+	if (xfs_is_shutdown(ip->i_mount))
+		return -EIO;
+	error = generic_file_open(inode, file);
 	if (error)
 		return error;
 
@@ -1259,12 +1292,78 @@ xfs_dir_open(
 	return error;
 }
 
+/*
+ * Don't bother propagating errors.  We're just doing cleanup, and the caller
+ * ignores the return value anyway.
+ */
 STATIC int
 xfs_file_release(
-	struct inode	*inode,
-	struct file	*filp)
+	struct inode		*inode,
+	struct file		*file)
 {
-	return xfs_release(XFS_I(inode));
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/*
+	 * If this is a read-only mount or the file system has been shut down,
+	 * don't generate I/O.
+	 */
+	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
+		return 0;
+
+	/*
+	 * If we previously truncated this file and removed old data in the
+	 * process, we want to initiate "early" writeout on the last close.
+	 * This is an attempt to combat the notorious NULL files problem which
+	 * is particularly noticeable from a truncate down, buffered (re-)write
+	 * (delalloc), followed by a crash.  What we are effectively doing here
+	 * is significantly reducing the time window where we'd otherwise be
+	 * exposed to that problem.
+	 */
+	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
+		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
+		if (ip->i_delayed_blks > 0)
+			filemap_flush(inode->i_mapping);
+	}
+
+	/*
+	 * XFS aggressively preallocates post-EOF space to generate contiguous
+	 * allocations for writers that append to the end of the file.
+	 *
+	 * To support workloads that close and reopen the file frequently, these
+	 * preallocations usually persist after a close unless it is the first
+	 * close for the inode.  This is a tradeoff to generate tightly packed
+	 * data layouts for unpacking tarballs or similar archives that write
+	 * one file after another without going back to it while keeping the
+	 * preallocation for files that have recurring open/write/close cycles.
+	 *
+	 * This heuristic is skipped for inodes with the append-only flag as
+	 * that flag is rather pointless for inodes written only once.
+	 *
+	 * There is no point in freeing blocks here for open but unlinked files
+	 * as they will be taken care of by the inactivation path soon.
+	 *
+	 * When releasing a read-only context, don't flush data or trim post-EOF
+	 * blocks.  This avoids open/read/close workloads from removing EOF
+	 * blocks that other writers depend upon to reduce fragmentation.
+	 *
+	 * If we can't get the iolock just skip truncating the blocks past EOF
+	 * because we could deadlock with the mmap_lock otherwise.  We'll get
+	 * another chance to drop them once the last reference to the inode is
+	 * dropped, so we'll never leak blocks permanently.
+	 */
+	if (inode->i_nlink &&
+	    (file->f_mode & FMODE_WRITE) &&
+	    !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
+	    !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+		if (xfs_can_free_eofblocks(ip) &&
+		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
+			xfs_free_eofblocks(ip);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	}
+
+	return 0;
 }
 
 STATIC int
@@ -1320,31 +1419,44 @@ xfs_file_llseek(
 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 }
 
-#ifdef CONFIG_FS_DAX
 static inline vm_fault_t
-xfs_dax_fault(
+xfs_dax_fault_locked(
 	struct vm_fault		*vmf,
 	unsigned int		order,
-	bool			write_fault,
-	pfn_t			*pfn)
+	bool			write_fault)
 {
-	return dax_iomap_fault(vmf, order, pfn, NULL,
+	vm_fault_t		ret;
+	pfn_t			pfn;
+
+	if (!IS_ENABLED(CONFIG_FS_DAX)) {
+		ASSERT(0);
+		return VM_FAULT_SIGBUS;
+	}
+	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
 			(write_fault && !vmf->cow_page) ?
 				&xfs_dax_write_iomap_ops :
 				&xfs_read_iomap_ops);
+	if (ret & VM_FAULT_NEEDDSYNC)
+		ret = dax_finish_sync_fault(vmf, order, pfn);
+	return ret;
 }
-#else
-static inline vm_fault_t
-xfs_dax_fault(
+
+static vm_fault_t
+xfs_dax_read_fault(
 	struct vm_fault		*vmf,
-	unsigned int		order,
-	bool			write_fault,
-	pfn_t			*pfn)
+	unsigned int		order)
 {
-	ASSERT(0);
-	return VM_FAULT_SIGBUS;
+	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
+	vm_fault_t		ret;
+
+	trace_xfs_read_fault(ip, order);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	ret = xfs_dax_fault_locked(vmf, order, false);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return ret;
 }
-#endif
 
 /*
  * Locking for serialisation of IO during page faults. This results in a lock
@@ -1357,43 +1469,39 @@ xfs_dax_fault(
  * i_lock (XFS - extent map serialisation)
  */
 static vm_fault_t
-__xfs_filemap_fault(
+xfs_write_fault(
 	struct vm_fault		*vmf,
-	unsigned int		order,
-	bool			write_fault)
+	unsigned int		order)
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
+	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
 	vm_fault_t		ret;
-	unsigned int		lock_mode = 0;
-
-	trace_xfs_filemap_fault(ip, order, write_fault);
-
-	if (write_fault) {
-		sb_start_pagefault(inode->i_sb);
-		file_update_time(vmf->vma->vm_file);
-	}
 
-	if (IS_DAX(inode) || write_fault)
-		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+	trace_xfs_write_fault(ip, order);
 
-	if (IS_DAX(inode)) {
-		pfn_t pfn;
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vmf->vma->vm_file);
 
-		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
-		if (ret & VM_FAULT_NEEDDSYNC)
-			ret = dax_finish_sync_fault(vmf, order, pfn);
-	} else if (write_fault) {
-		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
-	} else {
-		ret = filemap_fault(vmf);
+	/*
+	 * Normally we only need the shared mmaplock, but if a reflink remap is
+	 * in progress we take the exclusive lock to wait for the remap to
+	 * finish before taking a write fault.
+	 */
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+		lock_mode = XFS_MMAPLOCK_EXCL;
 	}
 
-	if (lock_mode)
-		xfs_iunlock(XFS_I(inode), lock_mode);
+	if (IS_DAX(inode))
+		ret = xfs_dax_fault_locked(vmf, order, true);
+	else
+		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+	xfs_iunlock(ip, lock_mode);
 
-	if (write_fault)
-		sb_end_pagefault(inode->i_sb);
+	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
 
@@ -1409,10 +1517,17 @@ static vm_fault_t
 xfs_filemap_fault(
 	struct vm_fault		*vmf)
 {
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
+
 	/* DAX can shortcut the normal fault path on write faults! */
-	return __xfs_filemap_fault(vmf, 0,
-			IS_DAX(file_inode(vmf->vma->vm_file)) &&
-			xfs_is_write_fault(vmf));
+	if (IS_DAX(inode)) {
+		if (xfs_is_write_fault(vmf))
+			return xfs_write_fault(vmf, 0);
+		return xfs_dax_read_fault(vmf, 0);
+	}
+
+	trace_xfs_read_fault(XFS_I(inode), 0);
+	return filemap_fault(vmf);
 }
 
 static vm_fault_t
@@ -1424,15 +1539,16 @@ xfs_filemap_huge_fault(
 		return VM_FAULT_FALLBACK;
 
 	/* DAX can shortcut the normal fault path on write faults! */
-	return __xfs_filemap_fault(vmf, order,
-			xfs_is_write_fault(vmf));
+	if (xfs_is_write_fault(vmf))
+		return xfs_write_fault(vmf, order);
+	return xfs_dax_read_fault(vmf, order);
 }
 
 static vm_fault_t
 xfs_filemap_page_mkwrite(
 	struct vm_fault		*vmf)
 {
-	return __xfs_filemap_fault(vmf, 0, true);
+	return xfs_write_fault(vmf, 0);
 }
 
 /*
@@ -1444,8 +1560,7 @@ static vm_fault_t
 xfs_filemap_pfn_mkwrite(
 	struct vm_fault		*vmf)
 {
-
-	return __xfs_filemap_fault(vmf, 0, true);
+	return xfs_write_fault(vmf, 0);
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1490,7 +1605,6 @@ const struct file_operations xfs_file_operations = {
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
 	.mmap		= xfs_file_mmap,
-	.mmap_supported_flags = MAP_SYNC,
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
@@ -1498,6 +1612,8 @@ const struct file_operations xfs_file_operations = {
 	.fallocate	= xfs_file_fallocate,
 	.fadvise	= xfs_file_fadvise,
 	.remap_file_range = xfs_file_remap_range,
+	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
 };
 
 const struct file_operations xfs_dir_file_operations = {
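For context, the IOCB_ATOMIC checks added to xfs_file_write_iter() are reached from userspace through pwritev2() with RWF_ATOMIC, which the kernel only permits once xfs_file_open() has set FMODE_CAN_ATOMIC_WRITE. The following is a minimal userspace sketch, not part of the patch above; it assumes 6.11+ uapi headers that provide RWF_ATOMIC, STATX_WRITE_ATOMIC and the stx_atomic_write_unit_* statx fields, and a device/filesystem that advertises atomic write support. Per the hunk above, XFS accepts exactly one filesystem block per atomic write, and only via direct I/O.

/*
 * Illustrative sketch: issue a single-block untorn write on XFS.
 * Assumptions (not guaranteed by the patch itself): glibc exposes
 * statx()/pwritev2(), and the headers define the atomic-write uapi.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* uapi value from <linux/fs.h> */
#endif

int main(int argc, char **argv)
{
	struct statx	stx;
	struct iovec	iov;
	unsigned int	unit;
	void		*buf;
	int		fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-xfs>\n", argv[0]);
		return 1;
	}

	/* The XFS atomic write path is direct I/O only. */
	fd = open(argv[1], O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ask statx() whether, and at what granularity, atomic writes work. */
	if (statx(fd, "", AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &stx) < 0 ||
	    !(stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)) {
		fprintf(stderr, "no atomic write support here\n");
		return 1;
	}

	/*
	 * The hunk in xfs_file_write_iter() rejects anything that is not
	 * exactly sb_blocksize bytes; the minimum unit reported here
	 * corresponds to that block size on XFS.
	 */
	unit = stx.stx_atomic_write_unit_min;
	if (posix_memalign(&buf, unit, unit))
		return 1;
	memset(buf, 0xab, unit);

	iov.iov_base = buf;
	iov.iov_len = unit;

	/* All-or-nothing: the block is never torn across a power failure. */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
		perror("pwritev2(RWF_ATOMIC)");

	free(buf);
	close(fd);
	return 0;
}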