Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r-- fs/xfs/xfs_file.c | 2628
1 file changed, 1584 insertions(+), 1044 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de3dc98f4e8f..6108612182e2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1,129 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_vnodeops.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
+#include "xfs_reflink.h"
+#include "xfs_file.h"
+#include "xfs_aops.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
-#include <linux/aio.h>
-#include <linux/dcache.h>
+#include <linux/dax.h>
#include <linux/falloc.h>
-#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
+#include <linux/mman.h>
+#include <linux/fadvise.h>
+#include <linux/mount.h>
static const struct vm_operations_struct xfs_file_vm_ops;
/*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ * Decide if the given file range is aligned to the size of the fundamental
+ * allocation unit for the file.
*/
-static inline void
-xfs_rw_ilock(
- struct xfs_inode *ip,
- int type)
-{
- if (type & XFS_IOLOCK_EXCL)
- mutex_lock(&VFS_I(ip)->i_mutex);
- xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
+bool
+xfs_is_falloc_aligned(
struct xfs_inode *ip,
- int type)
+ loff_t pos,
+ long long len)
{
- xfs_iunlock(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- mutex_unlock(&VFS_I(ip)->i_mutex);
-}
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
-static inline void
-xfs_rw_ilock_demote(
- struct xfs_inode *ip,
- int type)
-{
- xfs_ilock_demote(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- mutex_unlock(&VFS_I(ip)->i_mutex);
-}
+ if (!is_power_of_2(alloc_unit))
+ return isaligned_64(pos, alloc_unit) &&
+ isaligned_64(len, alloc_unit);
-/*
- * xfs_iozero
- *
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
- */
-int
-xfs_iozero(
- struct xfs_inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count) /* size of data to zero */
-{
- struct page *page;
- struct address_space *mapping;
- int status;
-
- mapping = VFS_I(ip)->i_mapping;
- do {
- unsigned offset, bytes;
- void *fsdata;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
-
- zero_user(page, offset, bytes);
-
- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
- page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
- pos += bytes;
- count -= bytes;
- status = 0;
- } while (count);
-
- return (-status);
+ return !((pos | len) & (alloc_unit - 1));
}
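
As a side note on the alignment test above: when the allocation unit is a power of two, ORing pos and len together and masking against (unit - 1) checks both values at once. A minimal standalone sketch (not part of the patch; the hypothetical falloc_aligned() mirrors the kernel helper, with isaligned_64() assumed to be a plain modulo check):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_power_of_2(uint32_t n)
{
	return n && !(n & (n - 1));
}

static bool falloc_aligned(uint64_t pos, uint64_t len, uint32_t unit)
{
	/* non-power-of-2 units (e.g. a 3-block realtime extent) need modulo */
	if (!is_power_of_2(unit))
		return pos % unit == 0 && len % unit == 0;
	/* any low bit set in either value means it is unaligned */
	return !((pos | len) & (unit - 1));
}

int main(void)
{
	printf("%d\n", falloc_aligned(8192, 4096, 4096));	/* 1: aligned */
	printf("%d\n", falloc_aligned(8192, 4100, 4096));	/* 0: len unaligned */
	return 0;
}
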
/*
@@ -140,19 +72,52 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
+ return xfs_log_force_inode(ip);
+}
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+/*
+ * All metadata updates are logged, which means that we just have to push the
+ * journal to the required sequence number that holds the updates. We track
+ * datasync commits separately from full sync commits, and hence only need to
+ * select the correct sequence number for the log force here.
+ *
+ * We don't have to serialise against concurrent modifications, as we do not
+ * have to wait for modifications that have not yet completed. We define a
+ * transaction commit as completing when the commit sequence number is updated,
+ * hence if the sequence number has not been updated, the sync operation was
+ * run before the commit completed and we don't have to wait for it.
+ *
+ * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
+ * set on the log item until - at least - the journal flush completes. In
+ * reality, they are only cleared when the inode is fully unpinned (i.e.
+ * persistent in the journal and not dirty in the CIL), and so we rely on
+ * xfs_log_force_seq() either skipping sequences that have been persisted or
+ * waiting on sequences that are still in flight to correctly order concurrent
+ * sync operations.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
- if (!lsn)
+ spin_lock(&iip->ili_lock);
+ if (datasync)
+ seq = iip->ili_datasync_seq;
+ else
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
+
+ if (!seq)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
}
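
From userspace, this split is what makes fdatasync() cheaper than fsync(): an overwrite of already-allocated blocks leaves the datasync sequence clear and can skip the journal flush entirely. A hedged illustration (hypothetical overwrite_record() helper, standard POSIX calls):

#include <fcntl.h>
#include <unistd.h>

/* Overwrite in place, then persist only what fdatasync() requires: the
 * data itself plus any size-changing metadata (the ili_datasync_seq
 * path above). fsync() would instead force up to ili_commit_seq. */
int overwrite_record(int fd, const void *buf, size_t len, off_t off)
{
	if (pwrite(fd, buf, len, off) != (ssize_t)len)
		return -1;
	return fdatasync(fd);
}
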
STATIC int
@@ -162,52 +127,42 @@ xfs_file_fsync(
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error = 0;
+ int error, err2;
int log_flushed = 0;
- xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ error = file_write_and_wait_range(file, start, end);
if (error)
return error;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ if (xfs_is_shutdown(mp))
+ return -EIO;
xfs_iflags_clear(ip, XFS_ITRUNCATED);
- if (mp->m_flags & XFS_MOUNT_BARRIER) {
- /*
- * If we have an RT and/or log subvolume we need to make sure
- * to flush the write cache the device used for file data
- * first. This is to ensure newly written file data make
- * it to disk before logging the new inode size in case of
- * an extending write.
- */
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(mp->m_rtdev_targp);
- else if (mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
- }
+ /*
+ * If we have an RT and/or log subvolume we need to make sure to flush
+ * the write cache of the device used for file data first. This is to
+ * ensure newly written file data make it to disk before logging the new
+ * inode size in case of an extending write.
+ */
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
+ error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
+ else if (mp->m_logdev_targp != mp->m_ddev_targp)
+ error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
- * All metadata updates are logged, which means that we just have
- * to flush the log up to the latest LSN that touched the inode.
+ * If the inode has an inode log item attached, it may need the journal
+ * flushed to persist any changes the log item might be tracking.
*/
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip)) {
- if (!datasync ||
- (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = ip->i_itemp->ili_last_lsn;
+ if (ip->i_itemp) {
+ err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (err2 && !error)
+ error = err2;
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (lsn)
- error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
/*
* If we only have a single device, and the log force above was
@@ -216,660 +171,1407 @@ xfs_file_fsync(
* an already allocated file and thus do not have any metadata to
* commit.
*/
- if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
- mp->m_logdev_targp == mp->m_ddev_targp &&
- !XFS_IS_REALTIME_INODE(ip) &&
- !log_flushed)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
+ mp->m_logdev_targp == mp->m_ddev_targp) {
+ err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ if (err2 && !error)
+ error = err2;
+ }
- return -error;
+ return error;
}
-STATIC ssize_t
-xfs_file_aio_read(
+static int
+xfs_ilock_iocb(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ unsigned int lock_mode)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- size_t size = 0;
- ssize_t ret = 0;
- int ioflags = 0;
- xfs_fsize_t n;
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- XFS_STATS_INC(xs_read_calls);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, lock_mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lock_mode);
+ }
- BUG_ON(iocb->ki_pos != pos);
+ return 0;
+}
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+static int
+xfs_ilock_iocb_for_write(
+ struct kiocb *iocb,
+ unsigned int *lock_mode)
+{
+ ssize_t ret;
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
- if (ret < 0)
+ ret = xfs_ilock_iocb(iocb, *lock_mode);
+ if (ret)
return ret;
- if (unlikely(ioflags & IO_ISDIRECT)) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((pos & target->bt_smask) || (size & target->bt_smask)) {
- if (pos == i_size_read(inode))
- return 0;
- return -XFS_ERROR(EINVAL);
- }
+ /*
+ * If a reflink remap is in progress we always need to take the iolock
+ * exclusively to wait for it to finish.
+ */
+ if (*lock_mode == XFS_IOLOCK_SHARED &&
+ xfs_iflags_test(ip, XFS_IREMAPPING)) {
+ xfs_iunlock(ip, *lock_mode);
+ *lock_mode = XFS_IOLOCK_EXCL;
+ return xfs_ilock_iocb(iocb, *lock_mode);
}
- n = mp->m_super->s_maxbytes - pos;
- if (n <= 0 || size == 0)
- return 0;
+ return 0;
+}
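
The IOCB_NOWAIT handling above is driven from userspace by RWF_NOWAIT: a contended iolock surfaces as -EAGAIN instead of sleeping. A sketch of the caller side (hypothetical try_read() helper):

#define _GNU_SOURCE
#include <errno.h>
#include <sys/uio.h>

ssize_t try_read(int fd, void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	ssize_t ret = preadv2(fd, &iov, 1, off, RWF_NOWAIT);

	/* EAGAIN here can mean "lock not immediately available", i.e.
	 * xfs_ilock_nowait() failed; fall back to a blocking context */
	if (ret < 0 && errno == EAGAIN)
		return -EAGAIN;
	return ret;
}
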
- if (n < size)
- size = n;
+STATIC ssize_t
+xfs_file_dio_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ ssize_t ret;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
+ trace_xfs_file_direct_read(iocb, to);
- /*
- * Locking is a bit tricky here. If we take an exclusive lock
- * for direct IO, we effectively serialise all new concurrent
- * read IO to this file and block it behind IO that is currently in
- * progress because IO in progress holds the IO lock shared. We only
- * need to hold the lock exclusive to blow away the page cache, so
- * only take lock exclusively if the page cache needs invalidation.
- * This allows the normal direct IO case of no page cache pages to
- * proceeed concurrently without serialisation.
- */
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
- xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
-
- if (inode->i_mapping->nrpages) {
- ret = -filemap_write_and_wait_range(
- VFS_I(ip)->i_mapping,
- pos, -1);
- if (ret) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
- }
- truncate_pagecache_range(VFS_I(ip), pos, -1);
- }
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- }
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
- trace_xfs_file_read(ip, size, pos, ioflags);
+ file_accessed(iocb->ki_filp);
- ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ ssize_t ret = 0;
+
+ trace_xfs_file_dax_read(iocb, to);
+
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ file_accessed(iocb->ki_filp);
return ret;
}
STATIC ssize_t
-xfs_file_splice_read(
- struct file *infilp,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t count,
- unsigned int flags)
+xfs_file_buffered_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
{
- struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
- int ioflags = 0;
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
ssize_t ret;
- XFS_STATS_INC(xs_read_calls);
+ trace_xfs_file_buffered_read(iocb, to);
- if (infilp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = generic_file_read_iter(iocb, to);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ ssize_t ret = 0;
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ XFS_STATS_INC(mp, xs_read_calls);
- trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+ if (xfs_is_shutdown(mp))
+ return -EIO;
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_read(iocb, to);
+ else if (iocb->ki_flags & IOCB_DIRECT)
+ ret = xfs_file_dio_read(iocb, to);
+ else
+ ret = xfs_file_buffered_read(iocb, to);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ if (ret > 0)
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
return ret;
}
-/*
- * xfs_file_splice_write() does not use xfs_rw_ilock() because
- * generic_file_splice_write() takes the i_mutex itself. This, in theory,
- * couuld cause lock inversions between the aio_write path and the splice path
- * if someone is doing concurrent splice(2) based writes and write(2) based
- * writes to the same inode. The only real way to fix this is to re-implement
- * the generic code here with correct locking orders.
- */
STATIC ssize_t
-xfs_file_splice_write(
- struct pipe_inode_info *pipe,
- struct file *outfilp,
+xfs_file_splice_read(
+ struct file *in,
loff_t *ppos,
- size_t count,
+ struct pipe_inode_info *pipe,
+ size_t len,
unsigned int flags)
{
- struct inode *inode = outfilp->f_mapping->host;
+ struct inode *inode = file_inode(in);
struct xfs_inode *ip = XFS_I(inode);
- int ioflags = 0;
- ssize_t ret;
-
- XFS_STATS_INC(xs_write_calls);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
- if (outfilp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ XFS_STATS_INC(mp, xs_read_calls);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(mp))
return -EIO;
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ trace_xfs_file_splice_read(ip, *ppos, len);
- trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
- ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = filemap_splice_read(in, ppos, pipe, len, flags);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
return ret;
}
/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF. We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
+ * Take care of zeroing post-EOF blocks when they might exist.
+ *
+ * Returns 0 on success, a negative error on failure, or 1 if this
+ * function dropped the iolock and reacquired it exclusively and the caller
+ * needs to restart the write sanity checks.
*/
-STATIC int /* error (positive) */
-xfs_zero_last_block(
- struct xfs_inode *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize)
+static ssize_t
+xfs_file_write_zero_eof(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock,
+ size_t count,
+ bool *drained_dio,
+ struct xfs_zone_alloc_ctx *ac)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
- int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- int zero_len;
- int nimaps = 1;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ loff_t isize;
+ int error;
/*
- * If the block underlying isize is just a hole, then there
- * is nothing to zero.
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while
+ * we do this check until we have placed an IO barrier (i.e. hold
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
*/
- if (imap.br_startblock == HOLESTARTBLOCK)
+ spin_lock(&ip->i_flags_lock);
+ isize = i_size_read(VFS_I(ip));
+ if (iocb->ki_pos <= isize) {
+ spin_unlock(&ip->i_flags_lock);
return 0;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ if (!*drained_dio) {
+ /*
+ * If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
+
+ /*
+ * We now have an IO submission barrier in place, but AIO can do
+ * EOF updates during IO completion and hence we now need to
+ * wait for all of them to drain. Non-AIO DIO will have drained
+ * before we are given the XFS_IOLOCK_EXCL, and so for most
+ * cases this wait is a no-op.
+ */
+ inode_dio_wait(VFS_I(ip));
+ *drained_dio = true;
+ return 1;
+ }
- zero_len = mp->m_sb.sb_blocksize - zero_offset;
- if (isize + zero_len > offset)
- zero_len = offset - isize;
- return xfs_iozero(ip, isize, zero_len);
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+
+ return error;
}
/*
- * Zero any on disk space between the current EOF and the new, larger EOF.
- *
- * This handles the normal case of zeroing the remainder of the last block in
- * the file and the unusual case of zeroing blocks out beyond the size of the
- * file. This second case only happens with fixed size extents and when the
- * system crashes before the inode size was updated but after blocks were
- * allocated.
+ * Common pre-write limit and setup checks.
*
- * Expects the iolock to be held exclusive, and will take the ilock internally.
+ * Called with the iolock held either shared or exclusive according to
+ * @iolock, and returns with it held. Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
*/
-int /* error (positive) */
-xfs_zero_eof(
- struct xfs_inode *ip,
- xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize) /* current inode size */
+STATIC ssize_t
+xfs_file_write_checks(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock,
+ struct xfs_zone_alloc_ctx *ac)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(offset > isize);
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ size_t count = iov_iter_count(from);
+ bool drained_dio = false;
+ ssize_t error;
+
+restart:
+ error = generic_write_checks(iocb, from);
+ if (error <= 0)
+ return error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ error = break_layout(inode, false);
+ if (error == -EWOULDBLOCK)
+ error = -EAGAIN;
+ } else {
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ }
+
+ if (error)
+ return error;
/*
- * First handle zeroing the block on which isize resides.
+ * For changing security info in file_remove_privs() we need i_rwsem
+ * exclusively.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+ xfs_iunlock(XFS_I(inode), *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ error = xfs_ilock_iocb(iocb, *iolock);
+ if (error) {
+ *iolock = 0;
+ return error;
+ }
+ goto restart;
+ }
+
+ /*
+ * If the offset is beyond the size of the file, we need to zero all
+ * blocks that fall between the existing EOF and the start of this
+ * write.
*
- * We only zero a part of that block so it is handled specially.
+ * We can do an unlocked check for i_size here safely as I/O completion
+ * can only extend EOF. Truncate is locked out at this point, so the
+ * EOF can not move backwards, only forwards. Hence we only need to take
+ * the slow path when we are at or beyond the current EOF.
*/
- if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
- error = xfs_zero_last_block(ip, offset, isize);
+ if (iocb->ki_pos > i_size_read(inode)) {
+ error = xfs_file_write_zero_eof(iocb, from, iolock, count,
+ &drained_dio, ac);
+ if (error == 1)
+ goto restart;
if (error)
return error;
}
+ return kiocb_modified(iocb);
+}
+
+static ssize_t
+xfs_zoned_write_space_reserve(
+ struct xfs_mount *mp,
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ loff_t count = iov_iter_count(from);
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= XFS_ZR_NOWAIT;
+
/*
- * Calculate the range between the new size and the old where blocks
- * needing to be zeroed may exist.
+ * Check the rlimit and LFS boundary first so that we don't over-reserve
+ * by possibly a lot.
*
- * To get the block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back to a block
- * boundary. We subtract 1 in case the size is exactly on a block
- * boundary.
+ * The generic write path will redo this check later, and it might have
+ * changed by then. If it got expanded we'll stick to our earlier
+ * smaller limit, and if it is decreased the new smaller limit will be
+ * used and our extra space reservation will be returned after finishing
+ * the write.
*/
- last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
- start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
- ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
- if (last_fsb == end_zero_fsb) {
- /*
- * The size was only incremented on its last block.
- * We took care of that above, so just return.
- */
+ error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+ if (error)
+ return error;
+
+ /*
+ * Sloppily round up count to file system blocks.
+ *
+ * This will often reserve an extra block, but that avoids having to look
+ * at the start offset, which isn't stable for O_APPEND until taking the
+ * iolock. Also we need to reserve a block each for zeroing the old
+ * EOF block and the new start block if they are unaligned.
+ *
+ * Any remaining block will be returned after the write.
+ */
+ return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
+ flags, ac);
+}
+
+static int
+xfs_dio_write_end_io(
+ struct kiocb *iocb,
+ ssize_t size,
+ int error,
+ unsigned flags)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ loff_t offset = iocb->ki_pos;
+ unsigned int nofs_flag;
+
+ ASSERT(!xfs_is_zoned_inode(ip) ||
+ !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
+ trace_xfs_end_io_direct_write(ip, offset, size);
+
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+
+ if (error)
+ return error;
+ if (!size)
return 0;
- }
- ASSERT(start_zero_fsb <= end_zero_fsb);
- while (start_zero_fsb <= end_zero_fsb) {
- nimaps = 1;
- zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+ /*
+ * Capture amount written on completion as we can't reliably account
+ * for it on submission.
+ */
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
- &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (flags & IOMAP_DIO_COW) {
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ error = xfs_reflink_end_atomic_cow(ip, offset, size);
+ else
+ error = xfs_reflink_end_cow(ip, offset, size);
if (error)
- return error;
+ goto out;
+ }
- ASSERT(nimaps > 0);
+ /*
+ * Unwritten conversion updates the in-core isize after extent
+ * conversion but before updating the on-disk size. Updating isize any
+ * earlier allows a racing dio read to find unwritten extents before
+ * they are converted.
+ */
+ if (flags & IOMAP_DIO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, offset, size, true);
+ goto out;
+ }
- if (imap.br_state == XFS_EXT_UNWRITTEN ||
- imap.br_startblock == HOLESTARTBLOCK) {
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- continue;
- }
+ /*
+ * We need to update the in-core inode size here so that we don't end up
+ * with the on-disk inode size being outside the in-core inode size. We
+ * have no other method of updating EOF for AIO, so always do it here
+ * if necessary.
+ *
+ * We need to lock the test/set EOF update as we can be racing with
+ * other IO completions here to update the EOF. Failing to serialise
+ * here can result in EOF moving backwards and Bad Things Happen when
+ * that occurs.
+ *
+ * As IO completion only ever extends EOF, we can do an unlocked check
+ * here to avoid taking the spinlock. If we land within the current EOF,
+ * then we do not need to do an extending update at all, and we don't
+ * need to take the lock to check this. If we race with an update moving
+ * EOF, then we'll either still be beyond EOF and need to take the lock,
+ * or we'll be within EOF and we don't need to take it at all.
+ */
+ if (offset + size <= i_size_read(inode))
+ goto out;
- /*
- * There are blocks we need to zero.
- */
- zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
- zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+ spin_lock(&ip->i_flags_lock);
+ if (offset + size > i_size_read(inode)) {
+ i_size_write(inode, offset + size);
+ spin_unlock(&ip->i_flags_lock);
+ error = xfs_setfilesize(ip, offset, size);
+ } else {
+ spin_unlock(&ip->i_flags_lock);
+ }
- if ((zero_off + zero_len) > offset)
- zero_len = offset - zero_off;
+out:
+ memalloc_nofs_restore(nofs_flag);
+ return error;
+}
- error = xfs_iozero(ip, zero_off, zero_len);
- if (error)
- return error;
+static const struct iomap_dio_ops xfs_dio_write_ops = {
+ .end_io = xfs_dio_write_end_io,
+};
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+static void
+xfs_dio_zoned_submit_io(
+ const struct iomap_iter *iter,
+ struct bio *bio,
+ loff_t file_offset)
+{
+ struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ xfs_filblks_t count_fsb;
+ struct iomap_ioend *ioend;
+
+ count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+ count_fsb, ac->reserved_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bio_io_error(bio);
+ return;
}
+ ac->reserved_blocks -= count_fsb;
- return 0;
+ bio->bi_end_io = xfs_end_bio;
+ ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+ IOMAP_IOEND_DIRECT);
+ xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+ .bio_set = &iomap_ioend_bioset,
+ .submit_io = xfs_dio_zoned_submit_io,
+ .end_io = xfs_dio_write_end_io,
+};
+
/*
- * Common pre-write limit and setup checks.
- *
- * Called with the iolocked held either shared and exclusive according to
- * @iolock, and returns with it held. Might upgrade the iolock to exclusive
- * if called for a direct write beyond i_size.
+ * Handle block aligned direct I/O writes.
*/
-STATIC ssize_t
-xfs_file_aio_write_checks(
- struct file *file,
- loff_t *pos,
- size_t *count,
- int *iolock)
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ const struct iomap_ops *ops,
+ const struct iomap_dio_ops *dops,
+ struct xfs_zone_alloc_ctx *ac)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- int error = 0;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int dio_flags = 0;
+ ssize_t ret;
-restart:
- error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
- if (error)
- return error;
+ /*
+ * For always COW inodes, each bio must be aligned to the file system
+ * block size and not just the device sector size because we need to
+ * allocate a block-aligned amount of space for each write.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock, ac);
+ if (ret)
+ goto out_unlock;
/*
- * If the offset is beyond the size of the file, we need to zero any
- * blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the
- * iolock shared, we need to update it to exclusive which implies
- * having to redo all checks before.
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ * the iolock back to shared if we had to take the exclusive lock in
+ * xfs_file_write_checks() for other reasons.
*/
- if (*pos > i_size_read(inode)) {
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
- goto restart;
- }
- error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
- if (error)
- return error;
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
}
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
+out_unlock:
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
+ if (ret < 0)
+ return ret;
+ ret = xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_zoned_direct_write_iomap_ops,
+ &xfs_dio_zoned_write_ops, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ return ret;
+}
+
+/*
+ * Handle block atomic writes
+ *
+ * Two methods of atomic writes are supported:
+ * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
+ * disk
+ * - COW-based, which uses a COW fork as a staging extent for data updates
+ * before atomically updating extent mappings for the range being written
+ *
+ */
+static noinline ssize_t
+xfs_file_dio_write_atomic(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret, ocount = iov_iter_count(from);
+ const struct iomap_ops *dops;
/*
- * Updating the timestamps will grab the ilock again from
- * xfs_fs_dirty_inode, so we have to call it after dropping the
- * lock above. Eventually we should look into a way to avoid
- * the pointless lock roundtrip.
+ * HW offload should be faster, so try that first if it is already
+ * known that the write length is not too large.
*/
- if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
- error = file_update_time(file);
- if (error)
- return error;
+ if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ else
+ dops = &xfs_direct_write_iomap_ops;
+
+retry:
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+ if (ret)
+ return ret;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
+ if (ret)
+ goto out_unlock;
+
+ /* Demote similar to xfs_file_dio_write_aligned() */
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
}
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
+ 0, NULL, 0);
+
/*
- * If we're writing the file then make sure to clear the setuid and
- * setgid bits if the process is not being run by root. This keeps
- * people from modifying setuid and setgid binaries.
+ * The retry mechanism is based on the ->iomap_begin method returning
+ * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
+ * possible. The REQ_ATOMIC-based method would typically not be possible if
+ * the write spans multiple extents or the disk blocks are misaligned.
*/
- return file_remove_suid(file);
+ if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
+ xfs_iunlock(ip, iolock);
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ goto retry;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
}
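
For context, IOCB_ATOMIC is set by userspace via RWF_ATOMIC (Linux 6.11+). A minimal sketch, assuming the file is opened O_DIRECT and the length is a power of two within the advertised atomic write unit limits (atomic_write() is a hypothetical helper):

#define _GNU_SOURCE
#include <sys/uio.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040	/* untorn-write flag, Linux 6.11+ uapi */
#endif

/* The whole buffer reaches stable storage or none of it does; XFS
 * services this via the REQ_ATOMIC or COW paths chosen above. */
ssize_t atomic_write(int fd, const void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}
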
/*
- * xfs_file_dio_aio_write - handle direct IO writes
+ * Handle block unaligned direct I/O writes
*
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
*
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer. To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
*/
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+ struct xfs_inode *ip,
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- size_t count = ocount;
- int unaligned_io = 0;
- int iolock;
- struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
-
- if ((pos & target->bt_smask) || (count & target->bt_smask))
- return -XFS_ERROR(EINVAL);
-
- if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
- unaligned_io = 1;
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+ ssize_t ret;
/*
- * We don't need to take an exclusive lock unless there page cache needs
- * to be invalidated or unaligned IO is being executed. We don't need to
- * consider the EOF extension case here because
- * xfs_file_aio_write_checks() will relock the inode as necessary for
- * EOF zeroing cases and fill out the new inode size as appropriate.
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF, so
+ * don't even bother trying the fast path in this case.
*/
- if (unaligned_io || mapping->nrpages)
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+retry_exclusive:
iolock = XFS_IOLOCK_EXCL;
- else
- iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, iolock);
+ flags = IOMAP_DIO_FORCE_WAIT;
+ }
+
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+ if (ret)
+ return ret;
/*
- * Recheck if there are cached pages that need invalidate after we got
- * the iolock to protect against other threads adding new pages while
- * we were waiting for the iolock.
+ * We can't properly handle unaligned direct I/O to reflink files yet,
+ * as we can't unshare a partial block.
*/
- if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, iolock);
- iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, iolock);
+ if (xfs_is_cow_inode(ip)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
}
- ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
- goto out;
+ goto out_unlock;
- if (mapping->nrpages) {
- ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, -1);
- if (ret)
- goto out;
- truncate_pagecache_range(VFS_I(ip), pos, -1);
- }
+ /*
+ * If we are doing exclusive unaligned I/O, this must be the only I/O
+ * in-flight. Otherwise we risk data corruption due to unwritten extent
+ * conversions from the AIO end_io handler. Wait for all other I/O to
+ * drain first.
+ */
+ if (flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, flags, NULL, 0);
/*
- * If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to flush cached pages
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user I/O, propagate the error.
*/
- if (unaligned_io)
- inode_dio_wait(inode);
- else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, &iocb->ki_pos, count, ocount);
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+static ssize_t
+xfs_file_dio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ size_t count = iov_iter_count(from);
+
+ /* direct I/O must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ return -EINVAL;
+
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ return xfs_file_dio_write_unaligned(ip, iocb, from);
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_dio_write_zoned(ip, iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ return xfs_file_dio_write_atomic(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
+}
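
What "aligned" buys a caller: an O_DIRECT write whose position and length are both multiples of the filesystem block size stays on the shared-lock xfs_file_dio_write_aligned() path. A hedged userspace sketch (hypothetical dio_write_4k() helper, 4k block size assumed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int dio_write_4k(const char *path, const void *data)
{
	void *buf;
	int fd = open(path, O_WRONLY | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096)) {
		if (fd >= 0)
			close(fd);
		return -1;
	}
	memcpy(buf, data, 4096);
	/* offset and length are fs-block aligned: fast path; a 512-byte
	 * write here would instead take the serialised unaligned path */
	ssize_t ret = pwrite(fd, buf, 4096, 0);

	free(buf);
	close(fd);
	return ret == 4096 ? 0 : -1;
}
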
+
+static noinline ssize_t
+xfs_file_dax_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ ssize_t ret, error = 0;
+ loff_t pos;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
+ if (ret)
+ goto out;
+
+ pos = iocb->ki_pos;
+
+ trace_xfs_file_dax_write(iocb, from);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
+ if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+ i_size_write(inode, iocb->ki_pos);
+ error = xfs_setfilesize(ip, pos, ret);
+ }
out:
- xfs_rw_iunlock(ip, iolock);
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ if (error)
+ return error;
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
- /* No fallback to buffered IO on errors for XFS. */
- ASSERT(ret < 0 || ret == count);
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
return ret;
}
STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int enospc = 0;
- int iolock = XFS_IOLOCK_EXCL;
- size_t count = ocount;
+ bool cleared_space = false;
+ unsigned int iolock;
- xfs_rw_ilock(ip, iolock);
+write_retry:
+ iolock = XFS_IOLOCK_EXCL;
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
- ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
-
-write_retry:
- trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, 0);
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ NULL);
/*
- * If we just got an ENOSPC, try to write back all dirty inodes to
- * convert delalloc space to free up some of the excess reserved
- * metadata space.
+ * If we hit a space limit, try to free up some lingering preallocated
+ * space before returning an error. In the case of ENOSPC, first try to
+ * write back all dirty inodes to free up some of the excess reserved
+ * metadata space. This reduces the chances that the eofblocks scan
+ * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+ * also behaves as a filter to prevent too many eofblocks scans from
+ * running at the same time. Use a synchronous scan to increase the
+ * effectiveness of the scan.
*/
- if (ret == -ENOSPC && !enospc) {
- enospc = 1;
+ if (ret == -EDQUOT && !cleared_space) {
+ xfs_iunlock(ip, iolock);
+ xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
+ cleared_space = true;
+ goto write_retry;
+ } else if (ret == -ENOSPC && !cleared_space) {
+ struct xfs_icwalk icw = {0};
+
+ cleared_space = true;
xfs_flush_inodes(ip->i_mount);
+
+ xfs_iunlock(ip, iolock);
+ icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
+ xfs_blockgc_free_space(ip->i_mount, &icw);
goto write_retry;
}
- current->backing_dev_info = NULL;
out:
- xfs_rw_iunlock(ip, iolock);
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
return ret;
}
STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_buffered_write_zoned(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ bool cleared_space = false;
+ struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
- size_t ocount = 0;
- XFS_STATS_INC(xs_write_calls);
+ ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
+ if (ret < 0)
+ return ret;
- BUG_ON(iocb->ki_pos != pos);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ goto out_unreserve;
- ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+ ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
if (ret)
- return ret;
+ goto out_unlock;
+
+ /*
+ * Truncate the iter to the length that we were actually able to
+ * allocate blocks for. This needs to happen after
+ * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+ * writes.
+ */
+ iov_iter_truncate(from,
+ XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+ (iocb->ki_pos & mp->m_blockmask));
+ if (!iov_iter_count(from))
+ goto out_unlock;
+
+retry:
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ &ac);
+ if (ret == -ENOSPC && !cleared_space) {
+ /*
+ * Kick off writeback to convert delalloc space and release the
+ * usually too pessimistic indirect block reservations.
+ */
+ xfs_flush_inodes(mp);
+ cleared_space = true;
+ goto retry;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, iolock);
+out_unreserve:
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ if (ret > 0) {
+ XFS_STATS_ADD(mp, xs_write_bytes, ret);
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_write_iter(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ size_t ocount = iov_iter_count(from);
+
+ XFS_STATS_INC(ip->i_mount, xs_write_calls);
if (ocount == 0)
return 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- ret = -EIO;
- goto out;
- }
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
- if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
- else
- ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount);
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ if (ocount < xfs_get_atomic_write_min(ip))
+ return -EINVAL;
- if (ret > 0) {
- ssize_t err;
+ if (ocount > xfs_get_atomic_write_max(ip))
+ return -EINVAL;
+
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
- XFS_STATS_ADD(xs_write_bytes, ret);
+ if (IS_DAX(inode))
+ return xfs_file_dax_write(iocb, from);
- /* Handle various SYNC-type writes */
- err = generic_write_sync(file, pos, ret);
- if (err < 0)
- ret = err;
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * Allow a directio write to fall back to a buffered
+ * write *only* in the case that we're doing a reflink
+ * CoW. In all other directio scenarios we do not
+ * allow an operation to fall back to buffered mode.
+ */
+ ret = xfs_file_dio_write(iocb, from);
+ if (ret != -ENOTBLK)
+ return ret;
}
-out:
- return ret;
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_buffered_write_zoned(iocb, from);
+ return xfs_file_buffered_write(iocb, from);
}
-STATIC long
-xfs_file_fallocate(
- struct file *file,
- int mode,
- loff_t offset,
- loff_t len)
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
{
- struct inode *inode = file_inode(file);
- long error;
- loff_t new_size = 0;
- xfs_flock64_t bf;
- xfs_inode_t *ip = XFS_I(inode);
- int cmd = XFS_IOC_RESVSP;
- int attr_flags = XFS_ATTR_NOLOCK;
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
- return -EOPNOTSUPP;
+ if (xfs_has_wsync(ip->i_mount))
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
+static int
+xfs_falloc_newsize(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ loff_t *new_size)
+{
+ struct inode *inode = file_inode(file);
+
+ if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
+ return 0;
+ *new_size = offset + len;
+ return inode_newsize_ok(inode, *new_size);
+}
+
+static int
+xfs_falloc_setsize(
+ struct file *file,
+ loff_t new_size)
+{
+ struct iattr iattr = {
+ .ia_valid = ATTR_SIZE,
+ .ia_size = new_size,
+ };
+
+ if (!new_size)
+ return 0;
+ return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
+ &iattr);
+}
+
+static int
+xfs_falloc_collapse_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = i_size_read(inode) - len;
+ int error;
+
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
+
+ /*
+ * A collapse range must not reach or overlap EOF; such a request
+ * would effectively be a truncate operation.
+ */
+ if (offset + len >= i_size_read(inode))
+ return -EINVAL;
- bf.l_whence = 0;
- bf.l_start = offset;
- bf.l_len = len;
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_insert_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t isize = i_size_read(inode);
+ int error;
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- cmd = XFS_IOC_UNRESVSP;
+ /*
+ * New inode size must not exceed ->s_maxbytes, accounting for
+ * possible signed overflow.
+ */
+ if (inode->i_sb->s_maxbytes - isize < len)
+ return -EFBIG;
- /* check the new inode size is valid before allocating */
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
+ /* Offset should be less than i_size */
+ if (offset >= isize)
+ return -EINVAL;
+
+ error = xfs_falloc_setsize(file, isize + len);
+ if (error)
+ return error;
+
+ /*
+ * Perform hole insertion now that the file size has been updated so
+ * that if we crash during the operation we don't leave shifted extents
+ * past EOF and hence lose access to the data that is contained within
+ * them.
+ */
+ return xfs_insert_file_space(XFS_I(inode), offset, len);
+}
+
+/*
+ * Punch a hole and prealloc the range. We use a hole punch rather than
+ * unwritten extent conversion for two reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
+ * virtue of the hole punch.
+ */
+static int
+xfs_falloc_zero_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned int blksize = i_blocksize(inode);
+ loff_t new_size = 0;
+ int error;
+
+ trace_xfs_zero_file_space(ip);
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ /*
+ * Zero range implements a full zeroing mechanism but is only used in
+ * limited situations. It is more efficient to allocate unwritten
+ * extents than to perform zeroing here, so use an errortag to randomly
+ * force zeroing on DEBUG kernels for added test coverage.
+ */
+ if (XFS_TEST_ERROR(ip->i_mount,
+ XFS_ERRTAG_FORCE_ZERO_RANGE)) {
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
+ } else {
+ error = xfs_free_file_space(ip, offset, len, ac);
if (error)
- goto out_unlock;
+ return error;
+
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(ip, offset, len);
}
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_unshare_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
- if (file->f_flags & O_DSYNC)
- attr_flags |= XFS_ATTR_SYNC;
+ error = xfs_reflink_unshare(XFS_I(inode), offset, len);
+ if (error)
+ return error;
- error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_allocate_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ /*
+ * If always_cow mode we can't use preallocations and thus should not
+ * create them.
+ */
+ if (xfs_is_always_cow_inode(XFS_I(inode)))
+ return -EOPNOTSUPP;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+#define XFS_FALLOC_FL_SUPPORTED \
+ (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
+ FALLOC_FL_UNSHARE_RANGE)
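
For reference, the userspace side of the most common mode above: FALLOC_FL_PUNCH_HOLE must be paired with FALLOC_FL_KEEP_SIZE, and lands in the first case of the switch in __xfs_file_fallocate() below (punch_hole() is a hypothetical helper):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* Deallocate a byte range without changing i_size; the freed range
 * reads back as zeroes afterwards. */
int punch_hole(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, len);
}
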
+
+STATIC long
+__xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ long error;
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
goto out_unlock;
- /* Change file size if needed */
- if (new_size) {
- struct iattr iattr;
+ /*
+ * Must wait for all AIO to complete before we continue as AIO can
+ * change the file size on completion without holding any locks we
+ * currently hold. We must do this first because AIO can update both
+ * the on disk and in memory inode sizes, and the operations that follow
+ * require the in-memory size to be fully up-to-date.
+ */
+ inode_dio_wait(inode);
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = new_size;
- error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
+ error = file_modified(file);
+ if (error)
+ goto out_unlock;
+
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
+ error = xfs_free_file_space(ip, offset, len, ac);
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ error = xfs_falloc_collapse_range(file, offset, len, ac);
+ break;
+ case FALLOC_FL_INSERT_RANGE:
+ error = xfs_falloc_insert_range(file, offset, len);
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ error = xfs_falloc_zero_range(file, mode, offset, len, ac);
+ break;
+ case FALLOC_FL_UNSHARE_RANGE:
+ error = xfs_falloc_unshare_range(file, mode, offset, len);
+ break;
+ case FALLOC_FL_ALLOCATE_RANGE:
+ error = xfs_falloc_allocate_range(file, mode, offset, len);
+ break;
+ default:
+ error = -EOPNOTSUPP;
+ break;
}
+ if (!error && xfs_file_sync_writes(file))
+ error = xfs_log_force_inode(ip);
+
out_unlock:
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, iolock);
+ return error;
+}
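/*
 * Illustrative sketch, not from this patch: the PUNCH_HOLE arm of the
 * switch above, driven from userspace. Punching must be combined with
 * FALLOC_FL_KEEP_SIZE; st_blocks shrinking shows the blocks were freed
 * while the file size stays the same. File name is made up.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	char buf[65536] = { 1 };
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || pwrite(fd, buf, sizeof(buf), 0) != sizeof(buf))
		return 1;
	fsync(fd);
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, sizeof(buf)))
		perror("FALLOC_FL_PUNCH_HOLE");
	if (!fstat(fd, &st))
		printf("size %lld, blocks %lld\n",
		       (long long)st.st_size, (long long)st.st_blocks);
	close(fd);
	return 0;
}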
+
+static long
+xfs_file_zoned_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int error;
+
+ error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
+ if (error)
+ return error;
+ error = __xfs_file_fallocate(file, mode, offset, len, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
return error;
}
+static long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~XFS_FALLOC_FL_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ /*
+ * For zoned file systems, zeroing the first and last block of a hole
+ * punch requires allocating a new block to rewrite the remaining data
+	 * and new zeroes out of place. Get reservations for those before
+ * taking the iolock. Dip into the reserved pool because we are
+ * expected to be able to punch a hole even on a completely full
+ * file system.
+ */
+ if (xfs_is_zoned_inode(XFS_I(inode)) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE)))
+ return xfs_file_zoned_fallocate(file, mode, offset, len);
+ return __xfs_file_fallocate(file, mode, offset, len, NULL);
+}
+
+STATIC int
+xfs_file_fadvise(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int advice)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int ret;
+ int lockflags = 0;
+
+ /*
+	 * Operations that create pages in the page cache need protection from
+	 * hole punching and similar ops.
+ */
+ if (advice == POSIX_FADV_WILLNEED) {
+ lockflags = XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ }
+ ret = generic_fadvise(file, start, end, advice);
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ return ret;
+}
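/*
 * Illustrative sketch, not from this patch: POSIX_FADV_WILLNEED kicks
 * off readahead, i.e. it creates page cache pages, which is exactly why
 * the handler above wraps it in XFS_IOLOCK_SHARED. Other advice values
 * pass straight through to generic_fadvise() unlocked.
 */
#include <fcntl.h>

static void prefetch_first_meg(int fd)
{
	/* Hint that the first 1 MiB will be read soon. */
	posix_fadvise(fd, 0, 1 << 20, POSIX_FADV_WILLNEED);
}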
+
+STATIC loff_t
+xfs_file_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ loff_t len,
+ unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ loff_t remapped = 0;
+ xfs_extlen_t cowextsize;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (!xfs_has_reflink(mp))
+ return -EOPNOTSUPP;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret || len == 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+ ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
+ &remapped);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Carry the cowextsize hint from src to dest if we're sharing the
+ * entire source file to the entire destination file, the source file
+ * has a cowextsize hint, and the destination file does not.
+ */
+ cowextsize = 0;
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
+ (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ pos_out == 0 && len >= i_size_read(inode_out) &&
+ !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_cowextsize;
+
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+ remap_flags);
+ if (ret)
+ goto out_unlock;
+
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
+ xfs_log_force_inode(dest);
+out_unlock:
+ xfs_iunlock2_remapping(src, dest);
+ if (ret)
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ /*
+ * If the caller did not set CAN_SHORTEN, then it is not prepared to
+ * handle partial results -- either the whole remap succeeds, or we
+ * must say why it did not. In this case, any error should be returned
+ * to the caller.
+ */
+ if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+ return ret;
+ return remapped > 0 ? remapped : ret;
+}
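/*
 * Illustrative sketch, not from this patch: xfs_file_remap_range() is
 * what ultimately services FICLONERANGE (and copy_file_range() when it
 * can reflink). File names and the range are made up.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int src = open("source", O_RDONLY);
	int dst = open("dest", O_RDWR | O_CREAT, 0644);
	struct file_clone_range fcr = {
		.src_fd		= src,
		.src_offset	= 0,
		.src_length	= 1 << 20,	/* clone the first 1 MiB */
		.dest_offset	= 0,
	};

	if (src < 0 || dst < 0)
		return 1;
	if (ioctl(dst, FICLONERANGE, &fcr))
		perror("FICLONERANGE");	/* EOPNOTSUPP without reflink */
	close(src);
	close(dst);
	return 0;
}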
STATIC int
xfs_file_open(
struct inode *inode,
struct file *file)
{
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
- if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- return 0;
+ file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ return generic_file_open(inode, file);
}
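/*
 * Illustrative sketch, assuming a kernel and glibc with untorn-write
 * support (STATX_WRITE_ATOMIC and friends, Linux 6.11+ headers): this
 * is the userspace view of xfs_get_atomic_write_min() above. A
 * non-zero minimum unit means a suitably sized and aligned O_DIRECT
 * pwritev2(..., RWF_ATOMIC) will not be torn. Path is made up.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "testfile", 0, STATX_WRITE_ATOMIC, &stx))
		return 1;
	if (stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)
		printf("atomic write units: %u..%u bytes\n",
		       stx.stx_atomic_write_unit_min,
		       stx.stx_atomic_write_unit_max);
	return 0;
}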
STATIC int
@@ -878,10 +1580,12 @@ xfs_dir_open(
struct file *file)
{
struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
int error;
- error = xfs_file_open(inode, file);
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+ error = generic_file_open(inode, file);
if (error)
return error;
@@ -889,19 +1593,92 @@ xfs_dir_open(
* If there are any blocks, read-ahead block 0 as we're almost
* certain to have the next operation be a read there.
*/
- mode = xfs_ilock_map_shared(ip);
- if (ip->i_d.di_nextents > 0)
- xfs_dir3_data_readahead(NULL, ip, 0, -1);
+ mode = xfs_ilock_data_map_shared(ip);
+ if (ip->i_df.if_nextents > 0)
+ error = xfs_dir3_data_readahead(ip, 0, 0);
xfs_iunlock(ip, mode);
- return 0;
+ return error;
}
+/*
+ * Don't bother propagating errors. We're just doing cleanup, and the caller
+ * ignores the return value anyway.
+ */
STATIC int
xfs_file_release(
- struct inode *inode,
- struct file *filp)
+ struct inode *inode,
+ struct file *file)
{
- return -xfs_release(XFS_I(inode));
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If this is a read-only mount or the file system has been shut down,
+ * don't generate I/O.
+ */
+ if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
+ return 0;
+
+ /*
+ * If we previously truncated this file and removed old data in the
+ * process, we want to initiate "early" writeout on the last close.
+ * This is an attempt to combat the notorious NULL files problem which
+ * is particularly noticeable from a truncate down, buffered (re-)write
+ * (delalloc), followed by a crash. What we are effectively doing here
+ * is significantly reducing the time window where we'd otherwise be
+ * exposed to that problem.
+ */
+ if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
+ xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
+ if (ip->i_delayed_blks > 0)
+ filemap_flush(inode->i_mapping);
+ }
+
+ /*
+ * XFS aggressively preallocates post-EOF space to generate contiguous
+ * allocations for writers that append to the end of the file.
+ *
+ * To support workloads that close and reopen the file frequently, these
+ * preallocations usually persist after a close unless it is the first
+ * close for the inode. This is a tradeoff to generate tightly packed
+ * data layouts for unpacking tarballs or similar archives that write
+	 * one file after another without going back to it, while keeping the
+ * preallocation for files that have recurring open/write/close cycles.
+ *
+ * This heuristic is skipped for inodes with the append-only flag as
+ * that flag is rather pointless for inodes written only once.
+ *
+ * There is no point in freeing blocks here for open but unlinked files
+ * as they will be taken care of by the inactivation path soon.
+ *
+ * When releasing a read-only context, don't flush data or trim post-EOF
+	 * blocks. This prevents open/read/close workloads from removing EOF
+ * blocks that other writers depend upon to reduce fragmentation.
+ *
+ * Inodes on the zoned RT device never have preallocations, so skip
+ * taking the locks below.
+ */
+ if (!inode->i_nlink ||
+ !(file->f_mode & FMODE_WRITE) ||
+ (ip->i_diflags & XFS_DIFLAG_APPEND) ||
+ xfs_is_zoned_inode(ip))
+ return 0;
+
+ /*
+ * If we can't get the iolock just skip truncating the blocks past EOF
+ * because we could deadlock with the mmap_lock otherwise. We'll get
+ * another chance to drop them once the last reference to the inode is
+ * dropped, so we'll never leak blocks permanently.
+ */
+ if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (xfs_can_free_eofblocks(ip) &&
+ !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
+ xfs_free_eofblocks(ip);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+
+ return 0;
}
STATIC int
@@ -911,7 +1688,6 @@ xfs_file_readdir(
{
struct inode *inode = file_inode(file);
xfs_inode_t *ip = XFS_I(inode);
- int error;
size_t bufsize;
/*
@@ -926,509 +1702,279 @@ xfs_file_readdir(
* point we can change the ->readdir prototype to include the
* buffer size. For now we use the current glibc buffer size.
*/
- bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
- error = xfs_readdir(ip, ctx, bufsize);
- if (error)
- return -error;
- return 0;
+ return xfs_readdir(NULL, ip, ctx, bufsize);
}
-STATIC int
-xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int whence)
{
- vma->vm_ops = &xfs_file_vm_ops;
+ struct inode *inode = file->f_mapping->host;
- file_accessed(filp);
- return 0;
+ if (xfs_is_shutdown(XFS_I(inode)->i_mount))
+ return -EIO;
+
+ switch (whence) {
+ default:
+ return generic_file_llseek(file, offset, whence);
+ case SEEK_HOLE:
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
+ break;
+ case SEEK_DATA:
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
+ break;
+ }
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
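/*
 * Illustrative sketch, not from this patch: walking a sparse file's
 * layout with the SEEK_DATA/SEEK_HOLE support wired up above through
 * iomap_seek_data()/iomap_seek_hole(). lseek() failing with ENXIO
 * means there is no more data past the offset.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void dump_layout(int fd)
{
	off_t pos = 0;

	for (;;) {
		off_t data = lseek(fd, pos, SEEK_DATA);
		off_t hole;

		if (data < 0)
			break;			/* ENXIO: all done */
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		pos = hole;
	}
}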
-/*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
- struct vm_area_struct *vma,
- struct vm_fault *vmf)
+static inline vm_fault_t
+xfs_dax_fault_locked(
+ struct vm_fault *vmf,
+ unsigned int order,
+ bool write_fault)
{
- return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
+ vm_fault_t ret;
+ unsigned long pfn;
-/*
- * This type indicates the kind of offset we want to search for in the
- * page cache for either xfs_seek_data() or xfs_seek_hole().
- */
-enum {
- HOLE_OFF = 0,
- DATA_OFF,
-};
+ if (!IS_ENABLED(CONFIG_FS_DAX)) {
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
+ }
+ ret = dax_iomap_fault(vmf, order, &pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, order, pfn);
+ return ret;
+}
-/*
- * Lookup the desired type of offset from the given page.
- *
- * On success, return true and the offset argument will point to the
- * start of the region that was found. Otherwise this function will
- * return false and keep the offset argument unchanged.
- */
-STATIC bool
-xfs_lookup_buffer_offset(
- struct page *page,
- loff_t *offset,
- unsigned int type)
+static vm_fault_t
+xfs_dax_read_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
{
- loff_t lastoff = page_offset(page);
- bool found = false;
- struct buffer_head *bh, *head;
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ vm_fault_t ret;
- bh = head = page_buffers(page);
- do {
- /*
- * Unwritten extents that have data in the page
- * cache covering them can be identified by the
- * BH_Unwritten state flag. Pages with multiple
- * buffers might have a mix of holes, data and
- * unwritten extents - any buffer with valid
- * data in it should have BH_Uptodate flag set
- * on it.
- */
- if (buffer_unwritten(bh) ||
- buffer_uptodate(bh)) {
- if (type == DATA_OFF)
- found = true;
- } else {
- if (type == HOLE_OFF)
- found = true;
- }
+ trace_xfs_read_fault(ip, order);
- if (found) {
- *offset = lastoff;
- break;
- }
- lastoff += bh->b_size;
- } while ((bh = bh->b_this_page) != head);
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ ret = xfs_dax_fault_locked(vmf, order, false);
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- return found;
+ return ret;
}
/*
- * This routine finds and returns a data or hole offset from the page
- * cache for unwritten extents, according to the desired type, for
- * xfs_seek_data() or xfs_seek_hole().
- *
- * The offset argument tells where to start searching the page cache.
- * The map is used to figure out the end points of the range in which
- * to look up pages.
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
*
- * Return true if the desired type of offset was found, and the argument
- * offset is filled with that address. Otherwise, return false and keep
- * offset unchanged.
+ * mmap_lock (MM)
+ * sb_start_pagefault(vfs, freeze)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
+ * page_lock (MM)
+ * i_lock (XFS - extent map serialisation)
*/
-STATIC bool
-xfs_find_get_desired_pgoff(
- struct inode *inode,
- struct xfs_bmbt_irec *map,
- unsigned int type,
- loff_t *offset)
+static vm_fault_t
+__xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order,
+ struct xfs_zone_alloc_ctx *ac)
{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct pagevec pvec;
- pgoff_t index;
- pgoff_t end;
- loff_t endoff;
- loff_t startoff = *offset;
- loff_t lastoff = startoff;
- bool found = false;
-
- pagevec_init(&pvec, 0);
-
- index = startoff >> PAGE_CACHE_SHIFT;
- endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
- end = endoff >> PAGE_CACHE_SHIFT;
- do {
- int want;
- unsigned nr_pages;
- unsigned int i;
-
- want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- want);
- /*
- * No pages are mapped into the given range. If we are searching
- * for holes and this is the first pass through the loop, the
- * given offset landed in a hole; return it.
- *
- * If we have already stepped through some buffers looking for
- * holes but they all contained data, the last offset already
- * points to the end of the last mapped page; if it has not
- * reached the end of the search range, there must be a hole
- * between them.
- */
- if (nr_pages == 0) {
- /* Data search found nothing */
- if (type == DATA_OFF)
- break;
-
- ASSERT(type == HOLE_OFF);
- if (lastoff == startoff || lastoff < endoff) {
- found = true;
- *offset = lastoff;
- }
- break;
- }
+ unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
+ vm_fault_t ret;
- /*
- * At least one page was found. If this is the first pass
- * through the loop and the first page's offset is greater
- * than the given search offset, a hole was found.
- */
- if (type == HOLE_OFF && lastoff == startoff &&
- lastoff < page_offset(pvec.pages[0])) {
- found = true;
- break;
- }
+ trace_xfs_write_fault(ip, order);
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- loff_t b_offset;
-
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to tmpfs
- * file mapping. However, page->index will not change
- * because we have a reference on the page.
- *
- * Searching is done if the page index is out of
- * range. If the current offset has not reached the
- * end of the specified search range, there should
- * be a hole between them.
- */
- if (page->index > end) {
- if (type == HOLE_OFF && lastoff < endoff) {
- *offset = lastoff;
- found = true;
- }
- goto out;
- }
-
- lock_page(page);
- /*
- * Page truncated or invalidated (page->mapping == NULL).
- * We can freely skip it and proceed to check the next
- * page.
- */
- if (unlikely(page->mapping != inode->i_mapping)) {
- unlock_page(page);
- continue;
- }
-
- if (!page_has_buffers(page)) {
- unlock_page(page);
- continue;
- }
-
- found = xfs_lookup_buffer_offset(page, &b_offset, type);
- if (found) {
- /*
- * The found offset may be less than the starting
- * search offset the first time through here.
- */
- *offset = max_t(loff_t, startoff, b_offset);
- unlock_page(page);
- goto out;
- }
-
- /*
- * Either we were searching for data and found nothing,
- * or we were searching for a hole and found a data
- * buffer. In either case the next page probably holds
- * what we want, so update the last offset to it.
- */
- lastoff = page_offset(page) + PAGE_SIZE;
- unlock_page(page);
- }
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
- /*
- * Fewer pages were returned than requested, so the search
- * is done. Nothing was found when searching for data, but
- * we found a hole behind the last offset.
- */
- if (nr_pages < want) {
- if (type == HOLE_OFF) {
- *offset = lastoff;
- found = true;
- }
- break;
- }
+ /*
+ * Normally we only need the shared mmaplock, but if a reflink remap is
+ * in progress we take the exclusive lock to wait for the remap to
+ * finish before taking a write fault.
+ */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ lock_mode = XFS_MMAPLOCK_EXCL;
+ }
- index = pvec.pages[i - 1]->index + 1;
- pagevec_release(&pvec);
- } while (index <= end);
+ if (IS_DAX(inode))
+ ret = xfs_dax_fault_locked(vmf, order, true);
+ else
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ ac);
+ xfs_iunlock(ip, lock_mode);
-out:
- pagevec_release(&pvec);
- return found;
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
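/*
 * Illustrative sketch, not from this patch: the first store through a
 * MAP_SHARED mapping of a clean page is what drives the write-fault
 * path above (->page_mkwrite on non-DAX). File name is made up.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0 || ftruncate(fd, 4096))
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'x';		/* triggers the page_mkwrite handler */
	msync(p, 4096, MS_SYNC);
	munmap(p, 4096);
	close(fd);
	return 0;
}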
-STATIC loff_t
-xfs_seek_data(
- struct file *file,
- loff_t start)
+static vm_fault_t
+xfs_write_fault_zoned(
+ struct vm_fault *vmf,
+ unsigned int order)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- loff_t uninitialized_var(offset);
- xfs_fsize_t isize;
- xfs_fileoff_t fsbno;
- xfs_filblks_t end;
- uint lock;
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ unsigned int len = folio_size(page_folio(vmf->page));
+ struct xfs_zone_alloc_ctx ac = { };
int error;
-
- lock = xfs_ilock_map_shared(ip);
-
- isize = i_size_read(inode);
- if (start >= isize) {
- error = ENXIO;
- goto out_unlock;
- }
+ vm_fault_t ret;
/*
- * Try to read extents from the first block indicated
- * by fsbno to the end block of the file.
+ * This could over-allocate as it doesn't check for truncation.
+ *
+	 * But as the over-allocation is limited to less than a folio and will
+	 * be released instantly, that's just fine.
*/
- fsbno = XFS_B_TO_FSBT(mp, start);
- end = XFS_B_TO_FSB(mp, isize);
- for (;;) {
- struct xfs_bmbt_irec map[2];
- int nmap = 2;
- unsigned int i;
-
- error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
- XFS_BMAPI_ENTIRE);
- if (error)
- goto out_unlock;
-
- /* No extents at given offset, must be beyond EOF */
- if (nmap == 0) {
- error = ENXIO;
- goto out_unlock;
- }
-
- for (i = 0; i < nmap; i++) {
- offset = max_t(loff_t, start,
- XFS_FSB_TO_B(mp, map[i].br_startoff));
-
- /* Landed in a data extent */
- if (map[i].br_startblock == DELAYSTARTBLOCK ||
- (map[i].br_state == XFS_EXT_NORM &&
- !isnullstartblock(map[i].br_startblock)))
- goto out;
-
- /*
- * Landed in an unwritten extent, try to search data
- * from page cache.
- */
- if (map[i].br_state == XFS_EXT_UNWRITTEN) {
- if (xfs_find_get_desired_pgoff(inode, &map[i],
- DATA_OFF, &offset))
- goto out;
- }
- }
-
- /*
- * map[0] is a hole, or it is an unwritten extent
- * without data in the page cache. If there is nothing
- * in map[1], we are probably reading past EOF.
- */
- if (nmap == 1) {
- error = ENXIO;
- goto out_unlock;
- }
-
- ASSERT(i > 1);
-
- /*
- * Nothing was found; proceed to the next round of the
- * search if the read offset has not reached or passed EOF.
- */
- fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
- start = XFS_FSB_TO_B(mp, fsbno);
- if (start >= isize) {
- error = ENXIO;
- goto out_unlock;
- }
- }
-
-out:
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
- xfs_iunlock_map_shared(ip, lock);
-
- if (error)
- return -error;
- return offset;
+ error = xfs_zoned_space_reserve(ip->i_mount,
+ XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
+ if (error < 0)
+ return vmf_fs_error(error);
+ ret = __xfs_write_fault(vmf, order, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ return ret;
}
-STATIC loff_t
-xfs_seek_hole(
- struct file *file,
- loff_t start)
+static vm_fault_t
+xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- loff_t uninitialized_var(offset);
- xfs_fsize_t isize;
- xfs_fileoff_t fsbno;
- xfs_filblks_t end;
- uint lock;
- int error;
+ if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
+ return xfs_write_fault_zoned(vmf, order);
+ return __xfs_write_fault(vmf, order, NULL);
+}
- if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+static inline bool
+xfs_is_write_fault(
+ struct vm_fault *vmf)
+{
+ return (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
+}
- lock = xfs_ilock_map_shared(ip);
+static vm_fault_t
+xfs_filemap_fault(
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
- isize = i_size_read(inode);
- if (start >= isize) {
- error = ENXIO;
- goto out_unlock;
+ /* DAX can shortcut the normal fault path on write faults! */
+ if (IS_DAX(inode)) {
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, 0);
+ return xfs_dax_read_fault(vmf, 0);
}
- fsbno = XFS_B_TO_FSBT(mp, start);
- end = XFS_B_TO_FSB(mp, isize);
-
- for (;;) {
- struct xfs_bmbt_irec map[2];
- int nmap = 2;
- unsigned int i;
+ trace_xfs_read_fault(XFS_I(inode), 0);
+ return filemap_fault(vmf);
+}
- error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
- XFS_BMAPI_ENTIRE);
- if (error)
- goto out_unlock;
+static vm_fault_t
+xfs_filemap_huge_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ if (!IS_DAX(file_inode(vmf->vma->vm_file)))
+ return VM_FAULT_FALLBACK;
- /* No extents at given offset, must be beyond EOF */
- if (nmap == 0) {
- error = ENXIO;
- goto out_unlock;
- }
+ /* DAX can shortcut the normal fault path on write faults! */
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, order);
+ return xfs_dax_read_fault(vmf, order);
+}
- for (i = 0; i < nmap; i++) {
- offset = max_t(loff_t, start,
- XFS_FSB_TO_B(mp, map[i].br_startoff));
-
- /* Landed in a hole */
- if (map[i].br_startblock == HOLESTARTBLOCK)
- goto out;
-
- /*
- * Landed in an unwritten extent; try to find a hole
- * in the page cache.
- */
- if (map[i].br_state == XFS_EXT_UNWRITTEN) {
- if (xfs_find_get_desired_pgoff(inode, &map[i],
- HOLE_OFF, &offset))
- goto out;
- }
- }
+static vm_fault_t
+xfs_filemap_page_mkwrite(
+ struct vm_fault *vmf)
+{
+ return xfs_write_fault(vmf, 0);
+}
- /*
- * map[0] contains data, or it is unwritten but has data
- * in the page cache; this probably means we are reading
- * past EOF. Fix the offset to point to the end of the
- * file (i.e., there is an implicit hole at the end of
- * any file).
- */
- if (nmap == 1) {
- offset = isize;
- break;
- }
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing, so handle it as a standard write fault.
+ */
+static vm_fault_t
+xfs_filemap_pfn_mkwrite(
+ struct vm_fault *vmf)
+{
+ return xfs_write_fault(vmf, 0);
+}
- ASSERT(i > 1);
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = xfs_filemap_fault,
+ .huge_fault = xfs_filemap_huge_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = xfs_filemap_page_mkwrite,
+ .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
+};
- /*
- * Both mappings contain data; proceed to the next round of the
- * search if the current read offset has not reached or passed EOF.
- */
- fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
- start = XFS_FSB_TO_B(mp, fsbno);
- if (start >= isize) {
- offset = isize;
- break;
- }
- }
+STATIC int
+xfs_file_mmap_prepare(
+ struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
+ struct inode *inode = file_inode(file);
+ struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
-out:
/*
- * At this point, we must have found a hole. However, the returned
- * offset may be bigger than the file size as it may be aligned to
- * a page boundary for unwritten extents; we need to handle this
- * situation specially.
+	 * We don't support synchronous mappings for non-DAX files, nor for
+	 * DAX files whose underlying dax_device is not synchronous.
*/
- offset = min_t(loff_t, offset, isize);
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
- xfs_iunlock_map_shared(ip, lock);
-
- if (error)
- return -error;
- return offset;
-}
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
+ target->bt_daxdev))
+ return -EOPNOTSUPP;
-STATIC loff_t
-xfs_file_llseek(
- struct file *file,
- loff_t offset,
- int origin)
-{
- switch (origin) {
- case SEEK_END:
- case SEEK_CUR:
- case SEEK_SET:
- return generic_file_llseek(file, offset, origin);
- case SEEK_DATA:
- return xfs_seek_data(file, offset);
- case SEEK_HOLE:
- return xfs_seek_hole(file, offset);
- default:
- return -EINVAL;
- }
+ file_accessed(file);
+ desc->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(inode))
+ desc->vm_flags |= VM_HUGEPAGE;
+ return 0;
}
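/*
 * Illustrative sketch, assuming a DAX-capable device mounted with
 * "-o dax" at a made-up path (and headers that define MAP_SYNC):
 * MAP_SYNC only succeeds when the ->mmap_prepare handler above finds
 * the backing dax_device synchronous; otherwise mmap() fails with
 * EOPNOTSUPP.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/dax/file", O_RDWR);
	void *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap MAP_SYNC");	/* EOPNOTSUPP without sync DAX */
	else
		munmap(p, 4096);
	close(fd);
	return 0;
}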
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = xfs_file_aio_read,
- .aio_write = xfs_file_aio_write,
+ .read_iter = xfs_file_read_iter,
+ .write_iter = xfs_file_write_iter,
.splice_read = xfs_file_splice_read,
- .splice_write = xfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
#endif
- .mmap = xfs_file_mmap,
+ .mmap_prepare = xfs_file_mmap_prepare,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
+ .get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
+ .fadvise = xfs_file_fadvise,
+ .remap_file_range = xfs_file_remap_range,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {
.open = xfs_dir_open,
.read = generic_read_dir,
- .iterate = xfs_file_readdir,
+ .iterate_shared = xfs_file_readdir,
.llseek = generic_file_llseek,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
@@ -1436,9 +1982,3 @@ const struct file_operations xfs_dir_file_operations = {
#endif
.fsync = xfs_dir_fsync,
};
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
- .fault = filemap_fault,
- .page_mkwrite = xfs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
-};