Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--  fs/xfs/xfs_file.c  828
1 file changed, 472 insertions, 356 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..9a435b1ff264 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -24,6 +24,7 @@
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_file.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -38,33 +39,19 @@ static const struct vm_operations_struct xfs_file_vm_ops;
* Decide if the given file range is aligned to the size of the fundamental
* allocation unit for the file.
*/
-static bool
+bool
xfs_is_falloc_aligned(
struct xfs_inode *ip,
loff_t pos,
long long int len)
{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t mask;
-
- if (XFS_IS_REALTIME_INODE(ip)) {
- if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
- u64 rextbytes;
- u32 mod;
-
- rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
- div_u64_rem(pos, rextbytes, &mod);
- if (mod)
- return false;
- div_u64_rem(len, rextbytes, &mod);
- return mod == 0;
- }
- mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
- } else {
- mask = mp->m_sb.sb_blocksize - 1;
- }
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
+
+ if (!is_power_of_2(alloc_unit))
+ return isaligned_64(pos, alloc_unit) &&
+ isaligned_64(len, alloc_unit);
- return !((pos | len) & mask);
+ return !((pos | len) & (alloc_unit - 1));
}
/*
@@ -226,29 +213,18 @@ xfs_ilock_iocb_for_write(
if (ret)
return ret;
- if (*lock_mode == XFS_IOLOCK_EXCL)
- return 0;
- if (!xfs_iflags_test(ip, XFS_IREMAPPING))
- return 0;
-
- xfs_iunlock(ip, *lock_mode);
- *lock_mode = XFS_IOLOCK_EXCL;
- return xfs_ilock_iocb(iocb, *lock_mode);
-}
-
-static unsigned int
-xfs_ilock_for_write_fault(
- struct xfs_inode *ip)
-{
- /* get a shared lock if no remapping in progress */
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- if (!xfs_iflags_test(ip, XFS_IREMAPPING))
- return XFS_MMAPLOCK_SHARED;
+ /*
+ * If a reflink remap is in progress we always need to take the iolock
+ * exclusively to wait for it to finish.
+ */
+ if (*lock_mode == XFS_IOLOCK_SHARED &&
+ xfs_iflags_test(ip, XFS_IREMAPPING)) {
+ xfs_iunlock(ip, *lock_mode);
+ *lock_mode = XFS_IOLOCK_EXCL;
+ return xfs_ilock_iocb(iocb, *lock_mode);
+ }
- /* wait for remapping to complete */
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- return XFS_MMAPLOCK_EXCL;
+ return 0;
}
STATIC ssize_t
@@ -372,9 +348,82 @@ xfs_file_splice_read(
}
/*
+ * Take care of zeroing post-EOF blocks when they might exist.
+ *
+ * Returns 0 on success, a negative error for a failure, or 1 if this
+ * function dropped the iolock and reacquired it exclusively and the caller
+ * needs to restart the write sanity checks.
+ */
+static ssize_t
+xfs_file_write_zero_eof(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock,
+ size_t count,
+ bool *drained_dio)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ loff_t isize;
+ int error;
+
+ /*
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while
+ * we do this check until we have placed an IO barrier (i.e. hold
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
+ */
+ spin_lock(&ip->i_flags_lock);
+ isize = i_size_read(VFS_I(ip));
+ if (iocb->ki_pos <= isize) {
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ if (!*drained_dio) {
+ /*
+ * If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
+
+ /*
+ * We now have an IO submission barrier in place, but AIO can do
+ * EOF updates during IO completion and hence we now need to
+ * wait for all of them to drain. Non-AIO DIO will have drained
+ * before we are given the XFS_IOLOCK_EXCL, and so for most
+ * cases this wait is a no-op.
+ */
+ inode_dio_wait(VFS_I(ip));
+ *drained_dio = true;
+ return 1;
+ }
+
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+
+ return error;
+}
+
+/*
* Common pre-write limit and setup checks.
*
- * Called with the iolocked held either shared and exclusive according to
+ * Called with the iolock held either shared or exclusive according to
* @iolock, and returns with it held. Might upgrade the iolock to exclusive
* if called for a direct write beyond i_size.
*/
@@ -384,13 +433,10 @@ xfs_file_write_checks(
struct iov_iter *from,
unsigned int *iolock)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- ssize_t error = 0;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
bool drained_dio = false;
- loff_t isize;
+ ssize_t error;
restart:
error = generic_write_checks(iocb, from);
@@ -413,7 +459,7 @@ restart:
* exclusively.
*/
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_iunlock(ip, *iolock);
+ xfs_iunlock(XFS_I(inode), *iolock);
*iolock = XFS_IOLOCK_EXCL;
error = xfs_ilock_iocb(iocb, *iolock);
if (error) {
@@ -424,64 +470,24 @@ restart:
}
/*
- * If the offset is beyond the size of the file, we need to zero any
+ * If the offset is beyond the size of the file, we need to zero all
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the iolock
- * shared, we need to update it to exclusive which implies having to
- * redo all checks before.
+ * write.
*
- * We need to serialise against EOF updates that occur in IO completions
- * here. We want to make sure that nobody is changing the size while we
- * do this check until we have placed an IO barrier (i.e. hold the
- * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
- * spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
- * hence be able to correctly determine if we need to run zeroing.
- *
- * We can do an unlocked check here safely as IO completion can only
- * extend EOF. Truncate is locked out at this point, so the EOF can
- * not move backwards, only forwards. Hence we only need to take the
- * slow path and spin locks when we are at or beyond the current EOF.
+ * We can do an unlocked check for i_size here safely as I/O completion
+ * can only extend EOF. Truncate is locked out at this point, so the
+ * EOF can not move backwards, only forwards. Hence we only need to take
+ * the slow path when we are at or beyond the current EOF.
*/
- if (iocb->ki_pos <= i_size_read(inode))
- goto out;
-
- spin_lock(&ip->i_flags_lock);
- isize = i_size_read(inode);
- if (iocb->ki_pos > isize) {
- spin_unlock(&ip->i_flags_lock);
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
-
- if (!drained_dio) {
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
- }
- /*
- * We now have an IO submission barrier in place, but
- * AIO can do EOF updates during IO completion and hence
- * we now need to wait for all of them to drain. Non-AIO
- * DIO will have drained before we are given the
- * XFS_IOLOCK_EXCL, and so for most cases this wait is a
- * no-op.
- */
- inode_dio_wait(inode);
- drained_dio = true;
+ if (iocb->ki_pos > i_size_read(inode)) {
+ error = xfs_file_write_zero_eof(iocb, from, iolock, count,
+ &drained_dio);
+ if (error == 1)
goto restart;
- }
-
- trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
- } else
- spin_unlock(&ip->i_flags_lock);
+ }
-out:
return kiocb_modified(iocb);
}
@@ -784,7 +790,7 @@ write_retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -846,6 +852,20 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * Currently only atomic writing of a single FS block is
+ * supported. It would be possible to support atomic writes smaller
+ * than a FS block, but there is no requirement to do so.
+ * Note that iomap also does not support this yet.
+ */
+ if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ return -EINVAL;
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -861,80 +881,205 @@ xfs_file_write_iter(
return xfs_file_buffered_write(iocb, from);
}
-static void
-xfs_wait_dax_page(
- struct inode *inode)
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
- schedule();
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ if (xfs_has_wsync(ip->i_mount))
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
}
-int
-xfs_break_dax_layouts(
- struct inode *inode,
- bool *retry)
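+/*
+ * Check if this fallocate request extends the file and, if so, compute the
+ * new file size and validate it against the inode and VFS limits.
+ */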
+static int
+xfs_falloc_newsize(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ loff_t *new_size)
{
- struct page *page;
+ struct inode *inode = file_inode(file);
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+ if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
+ return 0;
+ *new_size = offset + len;
+ return inode_newsize_ok(inode, *new_size);
+}
+
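+/*
+ * Update the file size after an extending fallocate operation.  A new_size of
+ * zero means the size did not change and there is nothing to do.
+ */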
+static int
+xfs_falloc_setsize(
+ struct file *file,
+ loff_t new_size)
+{
+ struct iattr iattr = {
+ .ia_valid = ATTR_SIZE,
+ .ia_size = new_size,
+ };
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
+ if (!new_size)
return 0;
+ return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
+ &iattr);
+}
+
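+/*
+ * Remove the given range from the file and shift the data beyond it downwards
+ * to close the gap, shrinking the file by @len bytes.
+ */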
+static int
+xfs_falloc_collapse_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = i_size_read(inode) - len;
+ int error;
+
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
+
+ /*
+ * The collapse range must not reach or cross EOF, as that would
+ * effectively be a truncate operation.
+ */
+ if (offset + len >= i_size_read(inode))
+ return -EINVAL;
- *retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, xfs_wait_dax_page(inode));
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
}
-int
-xfs_break_layouts(
- struct inode *inode,
- uint *iolock,
- enum layout_break_reason reason)
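+/*
+ * Shift the data at @offset upwards by @len bytes to insert a hole there,
+ * growing the file by @len bytes.
+ */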
+static int
+xfs_falloc_insert_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len)
{
- bool retry;
+ struct inode *inode = file_inode(file);
+ loff_t isize = i_size_read(inode);
int error;
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
-
- do {
- retry = false;
- switch (reason) {
- case BREAK_UNMAP:
- error = xfs_break_dax_layouts(inode, &retry);
- if (error || retry)
- break;
- fallthrough;
- case BREAK_WRITE:
- error = xfs_break_leased_layouts(inode, iolock, &retry);
- break;
- default:
- WARN_ON_ONCE(1);
- error = -EINVAL;
- }
- } while (error == 0 && retry);
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
- return error;
+ /*
+ * New inode size must not exceed ->s_maxbytes, accounting for
+ * possible signed overflow.
+ */
+ if (inode->i_sb->s_maxbytes - isize < len)
+ return -EFBIG;
+
+ /* Offset should be less than i_size */
+ if (offset >= isize)
+ return -EINVAL;
+
+ error = xfs_falloc_setsize(file, isize + len);
+ if (error)
+ return error;
+
+ /*
+ * Perform hole insertion now that the file size has been updated so
+ * that if we crash during the operation we don't leave shifted extents
+ * past EOF and hence lose access to the data contained within them.
+ */
+ return xfs_insert_file_space(XFS_I(inode), offset, len);
}
-/* Does this file, inode, or mount want synchronous writes? */
-static inline bool xfs_file_sync_writes(struct file *filp)
+/*
+ * Punch a hole and prealloc the range. We use a hole punch rather than
+ * unwritten extent conversion for two reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
+ * virtue of the hole punch.
+ */
+static int
+xfs_falloc_zero_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
{
- struct xfs_inode *ip = XFS_I(file_inode(filp));
+ struct inode *inode = file_inode(file);
+ unsigned int blksize = i_blocksize(inode);
+ loff_t new_size = 0;
+ int error;
- if (xfs_has_wsync(ip->i_mount))
- return true;
- if (filp->f_flags & (__O_SYNC | O_DSYNC))
- return true;
- if (IS_SYNC(file_inode(filp)))
- return true;
+ trace_xfs_zero_file_space(XFS_I(inode));
- return false;
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_free_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+
+ len = round_up(offset + len, blksize) - round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
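+/*
+ * Unshare any reflinked blocks in the range and preallocate the remainder,
+ * extending the file size if required.
+ */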
+static int
+xfs_falloc_unshare_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_reflink_unshare(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
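+/*
+ * Preallocate blocks for the given range (plain fallocate without any mode
+ * flags), extending the file size unless FALLOC_FL_KEEP_SIZE is set.
+ */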
+static int
+xfs_falloc_allocate_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ /*
+ * In always_cow mode we can't use preallocations and thus should not
+ * create them.
+ */
+ if (xfs_is_always_cow_inode(XFS_I(inode)))
+ return -EOPNOTSUPP;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
}
#define XFS_FALLOC_FL_SUPPORTED \
@@ -953,8 +1098,6 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- loff_t new_size = 0;
- bool do_file_insert = false;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
@@ -975,156 +1118,35 @@ xfs_file_fallocate(
*/
inode_dio_wait(inode);
- /*
- * Now AIO and DIO has drained we flush and (if necessary) invalidate
- * the cached range over the first operation we are about to run.
- *
- * We care about zero and collapse here because they both run a hole
- * punch over the range first. Because that can zero data, and the range
- * of invalidation for the shift operations is much larger, we still do
- * the required flush for collapse in xfs_prepare_shift().
- *
- * Insert has the same range requirements as collapse, and we extend the
- * file first which can zero data. Hence insert has the same
- * flush/invalidate requirements as collapse and so they are both
- * handled at the right time by xfs_prepare_shift().
- */
- if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_COLLAPSE_RANGE)) {
- error = xfs_flush_unmap_range(ip, offset, len);
- if (error)
- goto out_unlock;
- }
-
error = file_modified(file);
if (error)
goto out_unlock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- if (!xfs_is_falloc_aligned(ip, offset, len)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- /*
- * There is no need to overlap collapse range with EOF,
- * in which case it is effectively a truncate operation
- */
- if (offset + len >= i_size_read(inode)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- new_size = i_size_read(inode) - len;
-
- error = xfs_collapse_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- } else if (mode & FALLOC_FL_INSERT_RANGE) {
- loff_t isize = i_size_read(inode);
-
- if (!xfs_is_falloc_aligned(ip, offset, len)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- /*
- * New inode size must not exceed ->s_maxbytes, accounting for
- * possible signed overflow.
- */
- if (inode->i_sb->s_maxbytes - isize < len) {
- error = -EFBIG;
- goto out_unlock;
- }
- new_size = isize + len;
-
- /* Offset should be less than i_size */
- if (offset >= isize) {
- error = -EINVAL;
- goto out_unlock;
- }
- do_file_insert = true;
- } else {
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
- if (error)
- goto out_unlock;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- /*
- * Punch a hole and prealloc the range. We use a hole
- * punch rather than unwritten extent conversion for two
- * reasons:
- *
- * 1.) Hole punch handles partial block zeroing for us.
- * 2.) If prealloc returns ENOSPC, the file range is
- * still zero-valued by virtue of the hole punch.
- */
- unsigned int blksize = i_blocksize(inode);
-
- trace_xfs_zero_file_space(ip);
-
- error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
-
- len = round_up(offset + len, blksize) -
- round_down(offset, blksize);
- offset = round_down(offset, blksize);
- } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
- } else {
- /*
- * If always_cow mode we can't use preallocations and
- * thus should not create them.
- */
- if (xfs_is_always_cow_inode(ip)) {
- error = -EOPNOTSUPP;
- goto out_unlock;
- }
- }
-
- if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- }
- }
-
- /* Change file size if needed */
- if (new_size) {
- struct iattr iattr;
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = new_size;
- error = xfs_vn_setattr_size(file_mnt_idmap(file),
- file_dentry(file), &iattr);
- if (error)
- goto out_unlock;
- }
-
- /*
- * Perform hole insertion now that the file size has been
- * updated so that if we crash during the operation we don't
- * leave shifted extents past EOF and hence losing access to
- * the data that is contained within them.
- */
- if (do_file_insert) {
- error = xfs_insert_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ error = xfs_falloc_collapse_range(file, offset, len);
+ break;
+ case FALLOC_FL_INSERT_RANGE:
+ error = xfs_falloc_insert_range(file, offset, len);
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ error = xfs_falloc_zero_range(file, mode, offset, len);
+ break;
+ case FALLOC_FL_UNSHARE_RANGE:
+ error = xfs_falloc_unshare_range(file, mode, offset, len);
+ break;
+ case FALLOC_FL_ALLOCATE_RANGE:
+ error = xfs_falloc_allocate_range(file, mode, offset, len);
+ break;
+ default:
+ error = -EOPNOTSUPP;
+ break;
}
- if (xfs_file_sync_writes(file))
+ if (!error && xfs_file_sync_writes(file))
error = xfs_log_force_inode(ip);
out_unlock:
@@ -1220,6 +1242,14 @@ out_unlock:
xfs_iunlock2_remapping(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ /*
+ * If the caller did not set CAN_SHORTEN, then it is not prepared to
+ * handle partial results -- either the whole remap succeeds, or we
+ * must say why it did not. In this case, any error should be returned
+ * to the caller.
+ */
+ if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+ return ret;
return remapped > 0 ? remapped : ret;
}
@@ -1230,8 +1260,9 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
- FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
+ file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
@@ -1244,7 +1275,9 @@ xfs_dir_open(
unsigned int mode;
int error;
- error = xfs_file_open(inode, file);
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+ error = generic_file_open(inode, file);
if (error)
return error;
@@ -1259,12 +1292,78 @@ xfs_dir_open(
return error;
}
+/*
+ * Don't bother propagating errors. We're just doing cleanup, and the caller
+ * ignores the return value anyway.
+ */
STATIC int
xfs_file_release(
- struct inode *inode,
- struct file *filp)
+ struct inode *inode,
+ struct file *file)
{
- return xfs_release(XFS_I(inode));
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If this is a read-only mount or the file system has been shut down,
+ * don't generate I/O.
+ */
+ if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
+ return 0;
+
+ /*
+ * If we previously truncated this file and removed old data in the
+ * process, we want to initiate "early" writeout on the last close.
+ * This is an attempt to combat the notorious NULL files problem which
+ * is particularly noticeable from a truncate down, buffered (re-)write
+ * (delalloc), followed by a crash. What we are effectively doing here
+ * is significantly reducing the time window where we'd otherwise be
+ * exposed to that problem.
+ */
+ if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
+ xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
+ if (ip->i_delayed_blks > 0)
+ filemap_flush(inode->i_mapping);
+ }
+
+ /*
+ * XFS aggressively preallocates post-EOF space to generate contiguous
+ * allocations for writers that append to the end of the file.
+ *
+ * To support workloads that close and reopen the file frequently, these
+ * preallocations usually persist after a close unless it is the first
+ * close for the inode. This is a tradeoff to generate tightly packed
+ * data layouts for unpacking tarballs or similar archives that write
+ * one file after another without going back to it, while keeping the
+ * preallocation for files that have recurring open/write/close cycles.
+ *
+ * This heuristic is skipped for inodes with the append-only flag as
+ * that flag is rather pointless for inodes written only once.
+ *
+ * There is no point in freeing blocks here for open but unlinked files
+ * as they will be taken care of by the inactivation path soon.
+ *
+ * When releasing a read-only context, don't flush data or trim post-EOF
+ * blocks. This prevents open/read/close workloads from removing EOF
+ * blocks that other writers depend upon to reduce fragmentation.
+ *
+ * If we can't get the iolock just skip truncating the blocks past EOF
+ * because we could deadlock with the mmap_lock otherwise. We'll get
+ * another chance to drop them once the last reference to the inode is
+ * dropped, so we'll never leak blocks permanently.
+ */
+ if (inode->i_nlink &&
+ (file->f_mode & FMODE_WRITE) &&
+ !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
+ !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (xfs_can_free_eofblocks(ip) &&
+ !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
+ xfs_free_eofblocks(ip);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+
+ return 0;
}
STATIC int
@@ -1320,31 +1419,44 @@ xfs_file_llseek(
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
-#ifdef CONFIG_FS_DAX
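+/*
+ * Handle a DAX fault with the mmap lock already held by the caller, including
+ * the post-fault sync required for MAP_SYNC mappings.
+ */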
static inline vm_fault_t
-xfs_dax_fault(
+xfs_dax_fault_locked(
struct vm_fault *vmf,
unsigned int order,
- bool write_fault,
- pfn_t *pfn)
+ bool write_fault)
{
- return dax_iomap_fault(vmf, order, pfn, NULL,
+ vm_fault_t ret;
+ pfn_t pfn;
+
+ if (!IS_ENABLED(CONFIG_FS_DAX)) {
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
+ }
+ ret = dax_iomap_fault(vmf, order, &pfn, NULL,
(write_fault && !vmf->cow_page) ?
&xfs_dax_write_iomap_ops :
&xfs_read_iomap_ops);
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, order, pfn);
+ return ret;
}
-#else
-static inline vm_fault_t
-xfs_dax_fault(
+
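+/* Handle a read fault on a DAX file with the shared mmap lock held. */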
+static vm_fault_t
+xfs_dax_read_fault(
struct vm_fault *vmf,
- unsigned int order,
- bool write_fault,
- pfn_t *pfn)
+ unsigned int order)
{
- ASSERT(0);
- return VM_FAULT_SIGBUS;
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ vm_fault_t ret;
+
+ trace_xfs_read_fault(ip, order);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ ret = xfs_dax_fault_locked(vmf, order, false);
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+ return ret;
}
-#endif
/*
* Locking for serialisation of IO during page faults. This results in a lock
@@ -1357,43 +1469,39 @@ xfs_dax_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
-__xfs_filemap_fault(
+xfs_write_fault(
struct vm_fault *vmf,
- unsigned int order,
- bool write_fault)
+ unsigned int order)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
+ unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
vm_fault_t ret;
- unsigned int lock_mode = 0;
-
- trace_xfs_filemap_fault(ip, order, write_fault);
- if (write_fault) {
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- }
-
- if (IS_DAX(inode) || write_fault)
- lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+ trace_xfs_write_fault(ip, order);
- if (IS_DAX(inode)) {
- pfn_t pfn;
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
- ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
- if (ret & VM_FAULT_NEEDDSYNC)
- ret = dax_finish_sync_fault(vmf, order, pfn);
- } else if (write_fault) {
- ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
- } else {
- ret = filemap_fault(vmf);
+ /*
+ * Normally we only need the shared mmaplock, but if a reflink remap is
+ * in progress we take the exclusive lock to wait for the remap to
+ * finish before taking a write fault.
+ */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ lock_mode = XFS_MMAPLOCK_EXCL;
}
- if (lock_mode)
- xfs_iunlock(XFS_I(inode), lock_mode);
+ if (IS_DAX(inode))
+ ret = xfs_dax_fault_locked(vmf, order, true);
+ else
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ xfs_iunlock(ip, lock_mode);
- if (write_fault)
- sb_end_pagefault(inode->i_sb);
+ sb_end_pagefault(inode->i_sb);
return ret;
}
@@ -1409,10 +1517,17 @@ static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, 0,
- IS_DAX(file_inode(vmf->vma->vm_file)) &&
- xfs_is_write_fault(vmf));
+ if (IS_DAX(inode)) {
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, 0);
+ return xfs_dax_read_fault(vmf, 0);
+ }
+
+ trace_xfs_read_fault(XFS_I(inode), 0);
+ return filemap_fault(vmf);
}
static vm_fault_t
@@ -1424,15 +1539,16 @@ xfs_filemap_huge_fault(
return VM_FAULT_FALLBACK;
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, order,
- xfs_is_write_fault(vmf));
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, order);
+ return xfs_dax_read_fault(vmf, order);
}
static vm_fault_t
xfs_filemap_page_mkwrite(
struct vm_fault *vmf)
{
- return __xfs_filemap_fault(vmf, 0, true);
+ return xfs_write_fault(vmf, 0);
}
/*
@@ -1444,8 +1560,7 @@ static vm_fault_t
xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
-
- return __xfs_filemap_fault(vmf, 0, true);
+ return xfs_write_fault(vmf, 0);
}
static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1490,7 +1605,6 @@ const struct file_operations xfs_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl,
#endif
.mmap = xfs_file_mmap,
- .mmap_supported_flags = MAP_SYNC,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
@@ -1498,6 +1612,8 @@ const struct file_operations xfs_file_operations = {
.fallocate = xfs_file_fallocate,
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
};
const struct file_operations xfs_dir_file_operations = {