summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_aops.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--fs/xfs/xfs_aops.c121
1 files changed, 59 insertions, 62 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..6d9965b546cb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -19,6 +19,7 @@
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_icache.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -112,11 +113,11 @@ xfs_end_ioend(
* longer dirty. If we don't remove delalloc blocks here, they become
* stale and can corrupt free space accounting on unmount.
*/
- error = blk_status_to_errno(ioend->io_bio->bi_status);
+ error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- xfs_bmap_punch_delalloc_range(ip, offset,
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
}
goto done;
@@ -131,7 +132,7 @@ xfs_end_ioend(
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
- error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+ error = xfs_setfilesize(ip, offset, size);
done:
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
@@ -179,7 +180,7 @@ STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
unsigned long flags;
@@ -233,50 +234,12 @@ xfs_imap_valid(
return true;
}
-/*
- * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->iomap.
- *
- * The current page is held locked so nothing could have removed the block
- * backing offset_fsb, although it could have moved from the COW to the data
- * fork by another thread.
- */
-static int
-xfs_convert_blocks(
- struct iomap_writepage_ctx *wpc,
- struct xfs_inode *ip,
- int whichfork,
- loff_t offset)
-{
- int error;
- unsigned *seq;
-
- if (whichfork == XFS_COW_FORK)
- seq = &XFS_WPC(wpc)->cow_seq;
- else
- seq = &XFS_WPC(wpc)->data_seq;
-
- /*
- * Attempt to allocate whatever delalloc extent currently backs offset
- * and put the result into wpc->iomap. Allocate in a loop because it
- * may take several attempts to allocate real blocks for a contiguous
- * delalloc extent if free space is sufficiently fragmented.
- */
- do {
- error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
- &wpc->iomap, seq);
- if (error)
- return error;
- } while (wpc->iomap.offset + wpc->iomap.length <= offset);
-
- return 0;
-}
-
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
- loff_t offset)
+ loff_t offset,
+ unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -289,6 +252,7 @@ xfs_map_blocks(
struct xfs_iext_cursor icur;
int retries = 0;
int error = 0;
+ unsigned int *seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -386,7 +350,19 @@ retry:
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+ /*
+ * Convert a dellalloc extent to a real one. The current page is held
+ * locked so nothing could have removed the block backing offset_fsb,
+ * although it could have moved from the COW to the data fork by another
+ * thread.
+ */
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
+
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
@@ -444,7 +420,7 @@ xfs_prepare_ioend(
/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
- ioend->io_bio->bi_end_io = xfs_end_bio;
+ ioend->io_bio.bi_end_io = xfs_end_bio;
return status;
}
@@ -468,7 +444,6 @@ xfs_discard_folio(
{
struct xfs_inode *ip = XFS_I(folio->mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error;
if (xfs_is_shutdown(mp))
return;
@@ -482,11 +457,8 @@ xfs_discard_folio(
* byte of the next folio. Hence the end offset is only dependent on the
* folio itself and not the start offset that is passed in.
*/
- error = xfs_bmap_punch_delalloc_range(ip, pos,
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
folio_pos(folio) + folio_size(folio));
-
- if (error && !xfs_is_shutdown(mp))
- xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -502,13 +474,6 @@ xfs_vm_writepages(
{
struct xfs_writepage_ctx wpc = { };
- /*
- * Writing back data in a transaction context can result in recursive
- * transactions. This is bad, so issue a warning and get out of here.
- */
- if (WARN_ON_ONCE(current->journal_info))
- return 0;
-
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
@@ -564,12 +529,44 @@ xfs_vm_readahead(
}
static int
-xfs_iomap_swapfile_activate(
+xfs_vm_swap_activate(
struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+
+ /*
+ * Swap file activation can race against concurrent shared extent
+ * removal in files that have been cloned. If this happens,
+ * iomap_swapfile_iter() can fail because it encountered a shared
+ * extent even though an operation is in progress to remove those
+ * shared extents.
+ *
+ * This race becomes problematic when we defer extent removal
+ * operations beyond the end of a syscall (i.e. use async background
+ * processing algorithms). Users think the extents are no longer
+ * shared, but iomap_swapfile_iter() still sees them as shared
+ * because the refcountbt entries for the extents being removed have
+ * not yet been updated. Hence the swapon call fails unexpectedly.
+ *
+ * The race condition is currently most obvious from the unlink()
+ * operation as extent removal is deferred until after the last
+ * reference to the inode goes away. We then process the extent
+ * removal asynchronously, hence triggers the "syscall completed but
+ * work not done" condition mentioned above. To close this race
+ * window, we need to flush any pending inodegc operations to ensure
+ * they have updated the refcountbt records before we try to map the
+ * swapfile.
+ */
+ xfs_inodegc_flush(ip->i_mount);
+
+ /*
+ * Direct the swap code to the correct block device when this file
+ * sits on the RT device.
+ */
+ sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
+
return iomap_swapfile_activate(sis, swap_file, span,
&xfs_read_iomap_ops);
}
@@ -585,11 +582,11 @@ const struct address_space_operations xfs_address_space_operations = {
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.dirty_folio = noop_dirty_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};