diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 865 |
1 files changed, 654 insertions, 211 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cdf01e60fa6d..ed54c4d0f2f9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -58,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -142,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. @@ -416,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. + */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -462,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map) + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + unsigned int orig_mlen = map->m_len; + flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; @@ -484,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, WARN_ON(1); } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); - return retval; + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -602,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -622,9 +712,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. + */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ - if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -653,7 +752,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -667,7 +770,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - retval = ext4_map_query_blocks(handle, inode, map); + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -696,6 +799,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; + + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -770,6 +875,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } +/* + * Make sure that the current journal transaction has enough credits to map + * one extent. Return -EAGAIN if it cannot extend the current running + * transaction. + */ +static inline int ext4_journal_ensure_extent_credits(handle_t *handle, + struct inode *inode) +{ + int credits; + int ret; + + /* Called from ext4_da_write_begin() which has no handle started? */ + if (!handle) + return 0; + + credits = ext4_chunk_trans_blocks(inode, 1); + ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); + return ret <= 0 ? ret : -EAGAIN; +} + static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { @@ -1009,7 +1134,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } @@ -1027,7 +1157,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; @@ -1041,8 +1171,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); @@ -1060,11 +1189,13 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, } continue; } - if (buffer_new(bh)) + if (WARN_ON_ONCE(buffer_new(bh))) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); + err = ext4_journal_ensure_extent_credits(handle, inode); + if (!err) + err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { @@ -1141,7 +1272,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ -static int ext4_write_begin(struct file *file, struct address_space *mapping, +static int ext4_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -1162,10 +1294,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ - needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + needed_blocks = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, @@ -1177,17 +1308,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * __filemap_get_folio() can take a long time if the + * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. @@ -1256,8 +1393,9 @@ retry_journal: ext4_orphan_del(NULL, inode); } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (ret == -EAGAIN || + (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; @@ -1277,17 +1415,18 @@ static int write_end_fn(handle_t *handle, struct inode *inode, ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); + clear_buffer_new(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). + * `iocb` can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ -static int ext4_write_end(struct file *file, +static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -1306,7 +1445,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1392,7 +1531,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, } while (bh != head); } -static int ext4_journalled_write_end(struct file *file, +static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -1549,11 +1688,12 @@ struct mpage_da_data { unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ - pgoff_t first_page; /* The first page to write */ - pgoff_t next_page; /* Current page to examine */ - pgoff_t last_page; /* Last page to examine */ + loff_t start_pos; /* The start pos to write */ + loff_t next_pos; /* Current pos to examine */ + loff_t end_pos; /* Last pos to examine */ + /* - * Extent to map - this can be after first_page because that can be + * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ @@ -1573,38 +1713,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - /* This is necessary when next_page == 0. */ - if (mpd->first_page >= mpd->next_page) + /* This is necessary when next_pos == 0. */ + if (mpd->start_pos >= mpd->next_pos) return; mpd->scanned_until_end = 0; - index = mpd->first_page; - end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; - start = index << (PAGE_SHIFT - inode->i_blkbits); - last = end << (PAGE_SHIFT - inode->i_blkbits); + start = EXT4_B_TO_LBLK(inode, mpd->start_pos); + last = mpd->next_pos >> inode->i_blkbits; /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); - ext4_es_remove_extent(inode, start, last - start + 1); + ext4_es_remove_extent(inode, start, last - start); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); - while (index <= end) { - nr = filemap_get_folios(mapping, &index, end, &fbatch); + index = mpd->start_pos >> PAGE_SHIFT; + end = mpd->next_pos >> PAGE_SHIFT; + while (index < end) { + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - if (folio->index < mpd->first_page) + if (folio_pos(folio) < mpd->start_pos) continue; - if (folio_next_index(folio) - 1 > end) + if (folio_next_index(folio) > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -1765,6 +1905,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, @@ -1805,7 +1947,7 @@ found: if (ext4_has_inline_data(inode)) retval = 0; else - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; @@ -1828,7 +1970,7 @@ add_delayed: goto found; } } else if (!ext4_has_inline_data(inode)) { - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; @@ -1904,7 +2046,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - mpd->first_page += folio_nr_pages(folio); + mpd->start_pos += folio_size(folio); + mpd->wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -1914,7 +2057,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) loff_t size; int err; - BUG_ON(folio->index != mpd->first_page); + WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path @@ -1935,8 +2078,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); - if (!err) - mpd->wbc->nr_to_write--; return err; } @@ -2159,7 +2300,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2170,6 +2310,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2203,6 +2344,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) int get_blocks_flags; int err, dioread_nolock; + /* Make sure transaction has enough credits for this extent */ + err = ext4_journal_ensure_extent_credits(handle, inode); + if (err < 0) + return err; + trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or @@ -2212,11 +2358,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -2238,6 +2388,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) } /* + * This is used to submit mapped buffers in a single folio that is not fully + * mapped for various reasons, such as insufficient space or journal credits. + */ +static int mpage_submit_partial_folio(struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct folio *folio; + loff_t pos; + int ret; + + folio = filemap_get_folio(inode->i_mapping, + mpd->start_pos >> PAGE_SHIFT); + if (IS_ERR(folio)) + return PTR_ERR(folio); + /* + * The mapped position should be within the current processing folio + * but must not be the folio start position. + */ + pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; + if (WARN_ON_ONCE((folio_pos(folio) == pos) || + !folio_contains(folio, pos >> PAGE_SHIFT))) + return -EINVAL; + + ret = mpage_submit_folio(mpd, folio); + if (ret) + goto out; + /* + * Update start_pos to prevent this folio from being released in + * mpage_release_unused_pages(), it will be reset to the aligned folio + * pos when this folio is written again in the next round. Additionally, + * do not update wbc->nr_to_write here, as it will be updated once the + * entire folio has finished processing. + */ + mpd->start_pos = pos; +out: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +/* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * @@ -2285,10 +2476,18 @@ static int mpage_map_and_submit_extent(handle_t *handle, * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ - if ((err == -ENOMEM) || + if ((err == -ENOMEM) || (err == -EAGAIN) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { - if (progress) + /* + * We may have already allocated extents for + * some bhs inside the folio, issue the + * corresponding data to prevent stale data. + */ + if (progress) { + if (mpage_submit_partial_folio(mpd)) + goto invalidate_dirty_pages; goto update_disksize; + } return err; } ext4_msg(sb, KERN_CRIT, @@ -2322,7 +2521,7 @@ update_disksize: * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ - disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; + disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; @@ -2346,21 +2545,6 @@ update_disksize: return err; } -/* - * Calculate the total number of credits to reserve for one writepages - * iteration. This is called from ext4_writepages(). We map an extent of - * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping - * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + - * bpp - 1 blocks in bpp different extents. - */ -static int ext4_da_writepages_trans_blocks(struct inode *inode) -{ - int bpp = ext4_journal_blocks_per_page(inode); - - return ext4_meta_trans_blocks(inode, - MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); -} - static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { @@ -2391,7 +2575,7 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) @@ -2425,15 +2609,15 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - pgoff_t index = mpd->first_page; - pgoff_t end = mpd->last_page; + pgoff_t index = mpd->start_pos >> PAGE_SHIFT; + pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -2441,7 +2625,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; - mpd->next_page = index; + mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); @@ -2472,7 +2656,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != folio->index) + if (mpd->map.m_len > 0 && + mpd->next_pos != folio_pos(folio)) goto out; if (handle) { @@ -2518,8 +2703,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) } if (mpd->map.m_len == 0) - mpd->first_page = folio->index; - mpd->next_page = folio_next_index(folio); + mpd->start_pos = folio_pos(folio); + mpd->next_pos = folio_pos(folio) + folio_size(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -2647,12 +2832,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { + int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in - * the page and we may dirty the inode. + * the folio and we may dirty the inode. */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) @@ -2662,18 +2847,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd->first_page = writeback_index; - mpd->last_page = -1; + mpd->start_pos = writeback_index << PAGE_SHIFT; + mpd->end_pos = LLONG_MAX; } else { - mpd->first_page = wbc->range_start >> PAGE_SHIFT; - mpd->last_page = wbc->range_end >> PAGE_SHIFT; + mpd->start_pos = wbc->range_start; + mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd->first_page, - mpd->last_page); + tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, + mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* @@ -2716,8 +2901,14 @@ retry: * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = ext4_da_writepages_trans_blocks(inode); - + /* + * Calculate the number of credits needed to reserve for one + * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will + * attempt to extend the transaction or start a new iteration + * if the reserved credits are insufficient. + */ + needed_blocks = ext4_chunk_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); @@ -2733,7 +2924,8 @@ retry: } mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + trace_ext4_da_write_folios_start(inode, mpd->start_pos, + mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, @@ -2771,6 +2963,8 @@ retry: } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; + trace_ext4_da_write_folios_end(inode, mpd->start_pos, + mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2782,6 +2976,8 @@ retry: ret = 0; continue; } + if (ret == -EAGAIN) + ret = 0; /* Fatal error - ENOMEM, EIO... */ if (ret) break; @@ -2790,8 +2986,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd->last_page = writeback_index - 1; - mpd->first_page = 0; + mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; + mpd->start_pos = 0; goto retry; } @@ -2801,7 +2997,7 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd->first_page; + mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, @@ -2912,7 +3108,8 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -static int ext4_da_write_begin(struct file *file, struct address_space *mapping, +static int ext4_da_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -2929,7 +3126,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; - return ext4_write_begin(file, mapping, pos, + return ext4_write_begin(iocb, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; @@ -2945,11 +3142,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -3015,8 +3214,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ - copied = block_write_end(NULL, mapping, pos, len, copied, - folio, NULL); + copied = block_write_end(pos, len, copied, folio); new_i_size = pos + copied; /* @@ -3038,7 +3236,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, unsigned long end; i_size_write(inode, new_i_size); - end = (new_i_size - 1) & (PAGE_SIZE - 1); + end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; @@ -3067,7 +3265,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, return copied; } -static int ext4_da_write_end(struct file *file, +static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -3076,7 +3274,7 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) - return ext4_write_end(file, mapping, pos, + return ext4_write_end(iocb, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -3340,12 +3538,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, } } +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. We use + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapped mapping. This will ensure any unwritten or hole + * regions within the requested range is zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. + */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. + */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in above loop, so we need to query the + * start of the physical extent, based on the origin m_lblk and m_len. + * Let's also ensure we were able to allocate the required range for + * mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying + * physical extent for the requested range does not have a single contiguous + * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. + * In that case we will loop over the requested range to allocate and zero out + * the unwritten / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). Note that this is only possible because we know + * this can be called only with bigalloc enabled filesystem where the underlying + * cluster is already allocated. This avoids allocating discontiguous extents + * in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly non-performance critical path, so it should be ok to + * loop using ext4_map_blocks() with appropriate flags to allocate & zero the + * underlying short holes/unwritten extents within the requested range. + */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. + */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at @@ -3353,7 +3688,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + + /* + * journal credits estimation for atomic writes. We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3384,7 +3742,11 @@ retry: else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3398,6 +3760,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. + */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3408,6 +3786,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3421,6 +3800,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3431,11 +3811,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3449,6 +3841,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. + */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3690,9 +4092,7 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3707,13 +4107,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -4006,7 +4407,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; - loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; @@ -4015,14 +4416,20 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); + /* + * For indirect-block based inodes, make sure that the hole within + * one block before last range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) + if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after - * the page that contains i_size, and also make sure that the hole - * within one block before last range. + * the page that contains i_size. */ if (end > inode->i_size) end = round_up(inode->i_size, PAGE_SIZE); @@ -4051,7 +4458,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 2); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); @@ -4072,6 +4479,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -4198,7 +4607,7 @@ int ext4_truncate(struct inode *inode) } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); @@ -4224,8 +4633,10 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; - down_write(&EXT4_I(inode)->i_data_sem); + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4760,10 +5171,43 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, return 0; error: - ext4_error_inode(inode, function, line, 0, err_str); + ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } +static bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; +} + +/* + * Limit the maximum folio order to 2048 blocks to prevent overestimation + * of reserve handle credits during the folio writeback in environments + * where the PAGE_SIZE exceeds 4KB. + */ +#define EXT4_MAX_PAGECACHE_ORDER(i) \ + umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) +void ext4_set_inode_mapping_order(struct inode *inode) +{ + if (!ext4_should_enable_large_folio(inode)) + return; + + mapping_set_folio_order_range(inode->i_mapping, 0, + EXT4_MAX_PAGECACHE_ORDER(inode)); +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -4781,12 +5225,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) @@ -4845,10 +5284,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || @@ -4916,7 +5354,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -5086,6 +5525,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + + ext4_set_inode_mapping_order(inode); + ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, @@ -5564,9 +6006,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5577,6 +6017,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; @@ -5773,8 +6216,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5782,13 +6224,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and leaf blocks need to touch to map @lblocks + * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account @@ -5801,7 +6241,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); @@ -5810,25 +6250,19 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, } /* - * Calculate the total number of credits to reserve to fit - * the modification of a single pages into a single transaction, - * which may include multiple chunks of block allocations. - * - * This could be called via ext4_write_begin() - * - * We need to consider the worse case, when - * one new block per extent. + * Calculate the journal credits for modifying the number of blocks + * in a single extent within one transaction. 'nrblocks' is used only + * for non-extent inodes. For extent type inodes, 'nrblocks' can be + * zero if the exact number of blocks is unknown. */ -int ext4_writepage_trans_blocks(struct inode *inode) +int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) { - int bpp = ext4_journal_blocks_per_page(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, bpp); - + ret = ext4_meta_trans_blocks(inode, nrblocks, 1); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret += bpp; + ret += nrblocks; return ret; } @@ -5895,6 +6329,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; @@ -6199,6 +6634,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, return !buffer_mapped(bh); } +static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, + get_block_t get_block) +{ + handle_t *handle; + loff_t size; + unsigned long len; + int credits; + int ret; + + credits = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + folio_lock(folio); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { + ret = -EFAULT; + goto out_error; + } + + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); + + ret = ext4_block_write_begin(handle, folio, 0, len, get_block); + if (ret) + goto out_error; + + if (!ext4_should_journal_data(inode)) { + block_commit_write(folio, 0, len); + folio_mark_dirty(folio); + } else { + ret = ext4_journal_folio_buffers(handle, folio, len); + if (ret) + goto out_error; + } + ext4_journal_stop(handle); + folio_wait_stable(folio); + return ret; + +out_error: + folio_unlock(folio); + ext4_journal_stop(handle); + return ret; +} + vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -6210,8 +6694,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - handle_t *handle; - get_block_t *get_block; + get_block_t *get_block = ext4_get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) @@ -6279,47 +6762,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; - else - get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - /* - * Data journalling can't use block_page_mkwrite() because it - * will set_buffer_dirty() before do_journal_get_write_access() - * thus might hit warning messages for dirty metadata buffers. - */ - if (!ext4_should_journal_data(inode)) { - err = block_page_mkwrite(vma, vmf, get_block); - } else { - folio_lock(folio); - size = i_size_read(inode); - /* Page got truncated from under us? */ - if (folio->mapping != mapping || folio_pos(folio) > size) { - ret = VM_FAULT_NOPAGE; - goto out_error; - } - - len = folio_size(folio); - if (folio_pos(folio) + len > size) - len = size - folio_pos(folio); - - err = ext4_block_write_begin(handle, folio, 0, len, - ext4_get_block); - if (!err) { - ret = VM_FAULT_SIGBUS; - if (ext4_journal_folio_buffers(handle, folio, len)) - goto out_error; - } else { - folio_unlock(folio); - } - } - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + /* Start journal and allocate blocks */ + err = ext4_block_page_mkwrite(inode, folio, get_block); + if (err == -EAGAIN || + (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_alloc; out_ret: ret = vmf_fs_error(err); @@ -6327,8 +6774,4 @@ out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; -out_error: - folio_unlock(folio); - ext4_journal_stop(handle); - goto out; } |