diff options
Diffstat (limited to 'fs/ext4/page-io.c')
| -rw-r--r-- | fs/ext4/page-io.c | 231 |
1 files changed, 131 insertions, 100 deletions
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1d370364230e..39abfeec5f36 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -99,30 +99,29 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_finish_bio(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct page *bounce_page = NULL; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct folio *io_folio = NULL; struct buffer_head *bh, *head; - unsigned bio_start = bvec->bv_offset; - unsigned bio_end = bio_start + bvec->bv_len; + size_t bio_start = fi.offset; + size_t bio_end = bio_start + fi.length; unsigned under_io = 0; unsigned long flags; - if (fscrypt_is_bounce_page(page)) { - bounce_page = page; - page = fscrypt_pagecache_page(bounce_page); + if (fscrypt_is_bounce_folio(folio)) { + io_folio = folio; + folio = fscrypt_pagecache_folio(folio); } if (bio->bi_status) { - SetPageError(page); - mapping_set_error(page->mapping, -EIO); + int err = blk_status_to_errno(bio->bi_status); + mapping_set_error(folio->mapping, err); } - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); /* - * We check all buffers in the page under b_uptodate_lock + * We check all buffers in the folio under b_uptodate_lock * to avoid races with other end io clearing async_write flags */ spin_lock_irqsave(&head->b_uptodate_lock, flags); @@ -134,13 +133,15 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_status) + if (bio->bi_status) { + set_buffer_write_io_error(bh); buffer_io_error(bh); + } } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { - fscrypt_free_bounce_page(bounce_page); - end_page_writeback(page); + fscrypt_free_bounce_page(&io_folio->page); + folio_end_writeback(folio); } } } @@ -163,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) } /* - * Check a range of space and convert unwritten extents to written. Note that + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. Thus we @@ -174,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { - ext4_msg(inode->i_sb, KERN_EMERG, + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } + if (ret < 0 && !ext4_emergency_state(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -216,6 +234,18 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) #endif } +static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec)) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED && + !ext4_emergency_state(io_end->inode->i_sb)) + return true; + return false; +} + /* Add the io_end to per-inode completed end_io list. */ static void ext4_add_complete_io(ext4_io_end_t *io_end) { @@ -224,9 +254,12 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - /* Only reserved conversions from writeback should enter here */ - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle && sbi->s_journal); + /* Only reserved conversions or pending IO errors will enter here. */ + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + WARN_ON(!io_end->bio); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) @@ -251,7 +284,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); @@ -262,7 +295,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, } /* - * work on completed IO, to convert unwritten extents to extents + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. */ void ext4_end_io_rsv_work(struct work_struct *work) { @@ -287,29 +321,22 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || - list_empty(&io_end->list_vec)) { - ext4_release_io_end(io_end); - return; - } - ext4_add_complete_io(io_end); + if (ext4_io_end_defer_completion(io_end)) + return ext4_add_complete_io(io_end); + + ext4_release_io_end(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { - int err = 0; - if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_io_end_vec(io_end->handle, - io_end); - io_end->handle = NULL; - ext4_clear_io_unwritten_flag(io_end); - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_end_io_end(io_end); + ext4_release_io_end(io_end); } - return err; + return 0; } ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) @@ -323,10 +350,9 @@ static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; - char b[BDEVNAME_SIZE]; - if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", - bio_devname(bio, b), + if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n", + bio->bi_bdev, (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { @@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (ext4_io_end_defer_completion(io_end)) { /* * Link bio into list hanging from io_end. We have to do it * atomically as bio completions can be racing against each @@ -372,10 +399,8 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { - int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? - REQ_SYNC : 0; - io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; - bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); + if (io->io_wbc->sync_mode == WB_SYNC_ALL) + io->io_bio->bi_opf |= REQ_SYNC; submit_bio(io->io_bio); } io->io_bio = NULL; @@ -398,10 +423,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ - bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; @@ -411,11 +435,10 @@ static void io_submit_init_bio(struct ext4_io_submit *io, static void io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, - struct page *page, + struct folio *folio, + struct folio *io_folio, struct buffer_head *bh) { - int ret; - if (io->io_bio && (bh->b_blocknr != io->io_next_block || !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { submit_and_retry: @@ -425,55 +448,46 @@ submit_and_retry: io_submit_init_bio(io, bh); io->io_bio->bi_write_hint = inode->i_write_hint; } - ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); - if (ret != bh->b_size) + if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } -int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len, - bool keep_towrite) +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, + size_t len) { - struct page *bounce_page = NULL; - struct inode *inode = page->mapping->host; + struct folio *io_folio = folio; + struct inode *inode = folio->mapping->host; unsigned block_start; struct buffer_head *bh, *head; int ret = 0; - int nr_submitted = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; + bool keep_towrite = false; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); - ClearPageError(page); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); /* - * Comments copied from block_write_full_page: + * Comments copied from block_write_full_folio: * - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + if (len < folio_size(folio)) + folio_zero_segment(folio, len, folio_size(folio)); /* * In the first loop we prepare and mark buffers to submit. We have to - * mark all buffers in the page before submitting so that - * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * mark all buffers in the folio before submitting so that + * folio_end_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. */ - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_start = bh_offset(bh); if (block_start >= len) { @@ -486,17 +500,34 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* A hole? We can safely clear the dirty bit */ if (!buffer_mapped(bh)) clear_buffer_dirty(bh); - if (io->io_bio) - ext4_io_submit(io); + /* + * Keeping dirty some buffer we cannot write? Make sure + * to redirty the folio and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the folio. + * This happens e.g. when doing writeout for + * transaction commit or when journalled data is not + * yet committed. + */ + if (buffer_dirty(bh) || + (buffer_jbd(bh) && buffer_jbddirty(bh))) { + if (!folio_test_dirty(folio)) + folio_redirty_for_writepage(wbc, folio); + keep_towrite = true; + } continue; } if (buffer_new(bh)) clear_buffer_new(bh); set_buffer_async_write(bh); + clear_buffer_dirty(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); - bh = head = page_buffers(page); + /* Nothing to submit? Just unlock the folio... */ + if (!nr_to_submit) + return 0; + + bh = head = folio_buffers(folio); /* * If any blocks are being written to an encrypted file, encrypt them @@ -505,9 +536,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * (e.g. holes) to be unnecessarily encrypted, but this is rare and * can't happen in the common case of blocksize == PAGE_SIZE. */ - if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + struct page *bounce_page; /* * Since bounce page allocation uses a mempool, we can only use @@ -515,10 +547,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * first page of the bio. Otherwise it can deadlock. */ if (io->io_bio) - gfp_flags = GFP_NOWAIT | __GFP_NOWARN; + gfp_flags = GFP_NOWAIT; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, - 0, gfp_flags); + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, + enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && @@ -534,29 +566,28 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); do { - clear_buffer_async_write(bh); + if (buffer_async_write(bh)) { + clear_buffer_async_write(bh); + set_buffer_dirty(bh); + } bh = bh->b_this_page; } while (bh != head); - goto unlock; + + return ret; } + io_folio = page_folio(bounce_page); } + __folio_start_writeback(folio, keep_towrite); + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - io_submit_add_bh(io, inode, - bounce_page ? bounce_page : page, bh); - nr_submitted++; - clear_buffer_dirty(bh); + io_submit_add_bh(io, inode, folio, io_folio, bh); } while ((bh = bh->b_this_page) != head); -unlock: - unlock_page(page); - /* Nothing submitted - we have to end page writeback */ - if (!nr_submitted) - end_page_writeback(page); - return ret; + return 0; } |
