summaryrefslogtreecommitdiff
path: root/fs/iomap/buffered-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/iomap/buffered-io.c')
-rw-r--r--fs/iomap/buffered-io.c66
1 files changed, 57 insertions, 9 deletions
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 955f19e27e47..54dc27d92781 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
*/
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos, unsigned len)
+ struct inode *inode, loff_t pos, loff_t end_pos,
+ unsigned len)
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
@@ -1793,15 +1794,60 @@ new_ioend:
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
+
+ /*
+ * Clamp io_offset and io_size to the incore EOF so that ondisk
+ * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when
+ * writeback races with appending writes:
+ *
+ * Thread 1: Thread 2:
+ * ------------ -----------
+ * write [A, A+B]
+ * update inode size to A+B
+ * submit I/O [A, A+BS]
+ * write [A+B, A+B+C]
+ * update inode size to A+B+C
+ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ * <power failure>
+ *
+ * After reboot:
+ * 1) with A+B+C < A+BS, the file has zero padding in range
+ * [A+B, A+B+C]
+ *
+ * |< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|
+ * ^ ^ ^
+ * A A+B A+B+C
+ * (EOF)
+ *
+ * 2) with A+B+C > A+BS, the file has zero padding in range
+ * [A+B, A+BS]
+ *
+ * |< Block Size (BS) >|< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+ * ^ ^ ^ ^
+ * A A+B A+BS A+B+C
+ * (EOF)
+ *
+ * D = Valid Data
+ * 0 = Zero Padding
+ *
+ * Note that this defeats the ability to chain the ioends of
+ * appending writes.
+ */
wpc->ioend->io_size += len;
+ if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
+ wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+
wbc_account_cgroup_owner(wbc, folio, len);
return 0;
}
static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, u64 pos, unsigned dirty_len,
- unsigned *count)
+ struct inode *inode, u64 pos, u64 end_pos,
+ unsigned dirty_len, unsigned *count)
{
int error;
@@ -1826,7 +1872,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
break;
default:
error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
- map_len);
+ end_pos, map_len);
if (!error)
(*count)++;
break;
@@ -1897,11 +1943,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
* remaining memory is zeroed when mapped, and writes to that
* region are not written out to the file.
*
- * Also adjust the writeback range to skip all blocks entirely
- * beyond i_size.
+ * Also adjust the end_pos to the end of file and skip writeback
+ * for all blocks entirely beyond i_size.
*/
folio_zero_segment(folio, poff, folio_size(folio));
- *end_pos = round_up(isize, i_blocksize(inode));
+ *end_pos = isize;
}
return true;
@@ -1914,6 +1960,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct inode *inode = folio->mapping->host;
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
+ u64 end_aligned = 0;
unsigned count = 0;
int error = 0;
u32 rlen;
@@ -1955,9 +2002,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
/*
* Walk through the folio to find dirty areas to write back.
*/
- while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+ end_aligned = round_up(end_pos, i_blocksize(inode));
+ while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
- pos, rlen, &count);
+ pos, end_pos, rlen, &count);
if (error)
break;
pos += rlen;