1 files changed, 112 insertions, 89 deletions
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index beaec6d81074..39abfeec5f36 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -99,30 +99,29 @@ static void buffer_io_error(struct buffer_head *bh)
 
 static void ext4_finish_bio(struct bio *bio)
 {
-	struct bio_vec *bvec;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;
 
-	bio_for_each_segment_all(bvec, bio, iter_all) {
-		struct page *page = bvec->bv_page;
-		struct page *bounce_page = NULL;
+	bio_for_each_folio_all(fi, bio) {
+		struct folio *folio = fi.folio;
+		struct folio *io_folio = NULL;
 		struct buffer_head *bh, *head;
-		unsigned bio_start = bvec->bv_offset;
-		unsigned bio_end = bio_start + bvec->bv_len;
+		size_t bio_start = fi.offset;
+		size_t bio_end = bio_start + fi.length;
 		unsigned under_io = 0;
 		unsigned long flags;
 
-		if (fscrypt_is_bounce_page(page)) {
-			bounce_page = page;
-			page = fscrypt_pagecache_page(bounce_page);
+		if (fscrypt_is_bounce_folio(folio)) {
+			io_folio = folio;
+			folio = fscrypt_pagecache_folio(folio);
 		}
 
 		if (bio->bi_status) {
-			SetPageError(page);
-			mapping_set_error(page->mapping, -EIO);
+			int err = blk_status_to_errno(bio->bi_status);
+			mapping_set_error(folio->mapping, err);
 		}
-		bh = head = page_buffers(page);
+		bh = head = folio_buffers(folio);
 		/*
-		 * We check all buffers in the page under b_uptodate_lock
+		 * We check all buffers in the folio under b_uptodate_lock
 		 * to avoid races with other end io clearing async_write flags
 		 */
 		spin_lock_irqsave(&head->b_uptodate_lock, flags);
@@ -141,8 +140,8 @@ static void ext4_finish_bio(struct bio *bio)
 		} while ((bh = bh->b_this_page) != head);
 		spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
 		if (!under_io) {
-			fscrypt_free_bounce_page(bounce_page);
-			end_page_writeback(page);
+			fscrypt_free_bounce_page(&io_folio->page);
+			folio_end_writeback(folio);
 		}
 	}
 }
@@ -165,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
 }
 
 /*
- * Check a range of space and convert unwritten extents to written. Note that
+ * On successful IO, check a range of space and convert unwritten extents to
+ * written. On IO failure, check if journal abort is needed. Note that
  * we are protected from truncate touching same part of extent tree by the
  * fact that truncate code waits for all DIO to finish (thus exclusion from
  * direct IO is achieved) and also waits for PageWriteback bits. Thus we
@@ -176,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
 {
 	struct inode *inode = io_end->inode;
 	handle_t *handle = io_end->handle;
+	struct super_block *sb = inode->i_sb;
 	int ret = 0;
 
 	ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
 		   "list->prev 0x%p\n",
 		   io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
 
-	io_end->handle = NULL;	/* Following call will use up the handle */
-	ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
-	if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
-		ext4_msg(inode->i_sb, KERN_EMERG,
+	/*
+	 * Do not convert the unwritten extents if data writeback fails,
+	 * or stale data may be exposed.
+	 */
+	io_end->handle = NULL;  /* Following call will use up the handle */
+	if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
+		ret = -EIO;
+		if (handle)
+			jbd2_journal_free_reserved(handle);
+
+		if (test_opt(sb, DATA_ERR_ABORT))
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
+	} else {
+		ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+	}
+	if (ret < 0 && !ext4_emergency_state(sb) &&
+	    io_end->flag & EXT4_IO_END_UNWRITTEN) {
+		ext4_msg(sb, KERN_EMERG,
 			 "failed to convert unwritten extents to written "
 			 "extents -- potential data loss!  "
 			 "(inode %lu, error %d)", inode->i_ino, ret);
 	}
+
 	ext4_clear_io_unwritten_flag(io_end);
 	ext4_release_io_end(io_end);
 	return ret;
@@ -218,6 +234,18 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
 #endif
 }
 
+static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
+{
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+	    !list_empty(&io_end->list_vec))
+		return true;
+	if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
+	    io_end->flag & EXT4_IO_END_FAILED &&
+	    !ext4_emergency_state(io_end->inode->i_sb))
+		return true;
+	return false;
+}
+
 /* Add the io_end to per-inode completed end_io list. */
 static void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
@@ -226,9 +254,12 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
 	struct workqueue_struct *wq;
 	unsigned long flags;
 
-	/* Only reserved conversions from writeback should enter here */
-	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-	WARN_ON(!io_end->handle && sbi->s_journal);
+	/* Only reserved conversions or pending IO errors will enter here. */
+	WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+	WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
+		!io_end->handle && sbi->s_journal);
+	WARN_ON(!io_end->bio);
+
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	wq = sbi->rsv_conversion_wq;
 	if (list_empty(&ei->i_rsv_conversion_list))
@@ -253,7 +284,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 
 	while (!list_empty(&unwritten)) {
 		io_end = list_entry(unwritten.next, ext4_io_end_t, list);
-		BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+		BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
 		list_del_init(&io_end->list);
 
 		err = ext4_end_io_end(io_end);
@@ -264,7 +295,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 }
 
 /*
- * work on completed IO, to convert unwritten extents to extents
+ * Used to convert unwritten extents to written extents upon IO completion,
+ * or used to abort the journal upon IO errors.
  */
 void ext4_end_io_rsv_work(struct work_struct *work)
 {
@@ -289,29 +321,22 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 void ext4_put_io_end_defer(ext4_io_end_t *io_end)
 {
 	if (refcount_dec_and_test(&io_end->count)) {
-		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
-				list_empty(&io_end->list_vec)) {
-			ext4_release_io_end(io_end);
-			return;
-		}
-		ext4_add_complete_io(io_end);
+		if (ext4_io_end_defer_completion(io_end))
+			return ext4_add_complete_io(io_end);
+
+		ext4_release_io_end(io_end);
 	}
 }
 
 int ext4_put_io_end(ext4_io_end_t *io_end)
 {
-	int err = 0;
-
 	if (refcount_dec_and_test(&io_end->count)) {
-		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
-			err = ext4_convert_unwritten_io_end_vec(io_end->handle,
-								io_end);
-			io_end->handle = NULL;
-			ext4_clear_io_unwritten_flag(io_end);
-		}
+		if (ext4_io_end_defer_completion(io_end))
+			return ext4_end_io_end(io_end);
+
 		ext4_release_io_end(io_end);
 	}
-	return err;
+	return 0;
 }
 
 ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
@@ -345,11 +370,12 @@ static void ext4_end_bio(struct bio *bio)
 			     bio->bi_status, inode->i_ino,
 			     (unsigned long long)
 			     bi_sector >> (inode->i_blkbits - 9));
+		io_end->flag |= EXT4_IO_END_FAILED;
 		mapping_set_error(inode->i_mapping,
 				blk_status_to_errno(bio->bi_status));
 	}
 
-	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+	if (ext4_io_end_defer_completion(io_end)) {
 		/*
 		 * Link bio into list hanging from io_end. We have to do it
 		 * atomically as bio completions can be racing against each
@@ -409,31 +435,30 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
 
 static void io_submit_add_bh(struct ext4_io_submit *io,
 			     struct inode *inode,
-			     struct page *page,
+			     struct folio *folio,
+			     struct folio *io_folio,
 			     struct buffer_head *bh)
 {
-	int ret;
-
 	if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
 			   !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
 submit_and_retry:
 		ext4_io_submit(io);
 	}
-	if (io->io_bio == NULL)
+	if (io->io_bio == NULL) {
 		io_submit_init_bio(io, bh);
-	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
-	if (ret != bh->b_size)
+		io->io_bio->bi_write_hint = inode->i_write_hint;
+	}
+	if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
 		goto submit_and_retry;
-	wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
+	wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
 	io->io_next_block++;
 }
 
-int ext4_bio_write_page(struct ext4_io_submit *io,
-			struct page *page,
-			int len)
+int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
+		size_t len)
 {
-	struct page *bounce_page = NULL;
-	struct inode *inode = page->mapping->host;
+	struct folio *io_folio = folio;
+	struct inode *inode = folio->mapping->host;
 	unsigned block_start;
 	struct buffer_head *bh, *head;
 	int ret = 0;
@@ -441,30 +466,28 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	struct writeback_control *wbc = io->io_wbc;
 	bool keep_towrite = false;
 
-	BUG_ON(!PageLocked(page));
-	BUG_ON(PageWriteback(page));
-
-	ClearPageError(page);
+	BUG_ON(!folio_test_locked(folio));
+	BUG_ON(folio_test_writeback(folio));
 
 	/*
-	 * Comments copied from block_write_full_page:
+	 * Comments copied from block_write_full_folio:
 	 *
-	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * The folio straddles i_size.  It must be zeroed out on each and every
 	 * writepage invocation because it may be mmapped.  "A file is mapped
 	 * in multiples of the page size.  For a file that is not a multiple of
 	 * the page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	if (len < PAGE_SIZE)
-		zero_user_segment(page, len, PAGE_SIZE);
+	if (len < folio_size(folio))
+		folio_zero_segment(folio, len, folio_size(folio));
 	/*
 	 * In the first loop we prepare and mark buffers to submit. We have to
-	 * mark all buffers in the page before submitting so that
-	 * end_page_writeback() cannot be called from ext4_end_bio() when IO
+	 * mark all buffers in the folio before submitting so that
+	 * folio_end_writeback() cannot be called from ext4_end_bio() when IO
 	 * on the first buffer finishes and we are still working on submitting
 	 * the second buffer.
 	 */
-	bh = head = page_buffers(page);
+	bh = head = folio_buffers(folio);
 	do {
 		block_start = bh_offset(bh);
 		if (block_start >= len) {
@@ -479,14 +502,16 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 				clear_buffer_dirty(bh);
 			/*
 			 * Keeping dirty some buffer we cannot write? Make sure
-			 * to redirty the page and keep TOWRITE tag so that
-			 * racing WB_SYNC_ALL writeback does not skip the page.
+			 * to redirty the folio and keep TOWRITE tag so that
+			 * racing WB_SYNC_ALL writeback does not skip the folio.
 			 * This happens e.g. when doing writeout for
-			 * transaction commit.
+			 * transaction commit or when journalled data is not
+			 * yet committed.
 			 */
-			if (buffer_dirty(bh)) {
-				if (!PageDirty(page))
-					redirty_page_for_writepage(wbc, page);
+			if (buffer_dirty(bh) ||
+			    (buffer_jbd(bh) && buffer_jbddirty(bh))) {
+				if (!folio_test_dirty(folio))
+					folio_redirty_for_writepage(wbc, folio);
 				keep_towrite = true;
 			}
 			continue;
@@ -498,11 +523,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		nr_to_submit++;
 	} while ((bh = bh->b_this_page) != head);
 
-	/* Nothing to submit? Just unlock the page... */
+	/* Nothing to submit? Just unlock the folio... */
 	if (!nr_to_submit)
-		goto unlock;
+		return 0;
 
-	bh = head = page_buffers(page);
+	bh = head = folio_buffers(folio);
 
 	/*
 	 * If any blocks are being written to an encrypted file, encrypt them
@@ -511,9 +536,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	 * (e.g. holes) to be unnecessarily encrypted, but this is rare and
 	 * can't happen in the common case of blocksize == PAGE_SIZE.
 	 */
-	if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
+	if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
 		gfp_t gfp_flags = GFP_NOFS;
 		unsigned int enc_bytes = round_up(len, i_blocksize(inode));
+		struct page *bounce_page;
 
 		/*
 		 * Since bounce page allocation uses a mempool, we can only use
@@ -521,10 +547,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		 * first page of the bio.  Otherwise it can deadlock.
 		 */
 		if (io->io_bio)
-			gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+			gfp_flags = GFP_NOWAIT;
 	retry_encrypt:
-		bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
-							       0, gfp_flags);
+		bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
+					enc_bytes, 0, gfp_flags);
 		if (IS_ERR(bounce_page)) {
 			ret = PTR_ERR(bounce_page);
 			if (ret == -ENOMEM &&
@@ -540,7 +566,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 			}
 
 			printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
-			redirty_page_for_writepage(wbc, page);
+			folio_redirty_for_writepage(wbc, folio);
 			do {
 				if (buffer_async_write(bh)) {
 					clear_buffer_async_write(bh);
@@ -548,23 +574,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 				}
 				bh = bh->b_this_page;
 			} while (bh != head);
-			goto unlock;
+
+			return ret;
 		}
+		io_folio = page_folio(bounce_page);
 	}
 
-	if (keep_towrite)
-		set_page_writeback_keepwrite(page);
-	else
-		set_page_writeback(page);
+	__folio_start_writeback(folio, keep_towrite);
 
 	/* Now submit buffers to write */
 	do {
 		if (!buffer_async_write(bh))
 			continue;
-		io_submit_add_bh(io, inode,
-				 bounce_page ? bounce_page : page, bh);
+		io_submit_add_bh(io, inode, folio, io_folio, bh);
 	} while ((bh = bh->b_this_page) != head);
-unlock:
-	unlock_page(page);
-	return ret;
+
+	return 0;
 }