diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 1538 |
1 files changed, 1028 insertions, 510 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 537803250ca9..be9a4cba35fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -49,32 +50,35 @@ #include <trace/events/ext4.h> +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to); + static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -88,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -109,7 +113,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -136,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. @@ -176,6 +177,8 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); + dax_break_layout_final(inode); + if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { @@ -378,10 +381,11 @@ static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -407,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. + */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -453,6 +483,185 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + unsigned int status; + int retval; + unsigned int orig_mlen = map->m_len; + + flags &= EXT4_EX_QUERY_FILTER; + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, flags); + else + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); +} + +static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + unsigned int status; + int err, retval = 0; + + /* + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE + * indicates that the blocks and quotas has already been + * checked when the data was copied into the page cache. + */ + if (map->m_flags & EXT4_MAP_DELAYED) + flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + /* + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * We need to check for EXT4 here because migrate could have + * changed the inode type in between. + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + /* + * We allocated new blocks which will result in i_data's + * format changing. Force the migrate to fail by clearing + * migrate flags. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode %lu: " + "retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, + map->m_len); + if (err) + return err; + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if (flags & EXT4_GET_BLOCKS_PRE_IO && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_is_written(&es)) + return retval; + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, + status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -482,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -502,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. + */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { @@ -533,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -547,32 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -601,12 +801,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; - /* - * Here we clear m_flags because after allocating an new extent, - * it will be set again. - */ - map->m_flags &= ~EXT4_MAP_FLAGS; + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -614,76 +810,7 @@ found: * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); - - /* - * We need to check for EXT4 here because migrate - * could have changed the inode type in between - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { - /* - * We allocated new blocks which will result in - * i_data's format changing. Force the migrate - * to fail by clearing migrate flags - */ - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - } - } - - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - /* - * We have to zeroout blocks before inserting them into extent - * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. We also have to - * unmap metadata before zeroing as otherwise writeback can - * overwrite zeros with stale data from block device. - */ - if (flags & EXT4_GET_BLOCKS_ZERO && - map->m_flags & EXT4_MAP_MAPPED && - map->m_flags & EXT4_MAP_NEW) { - ret = ext4_issue_zeroout(inode, map->m_lblk, - map->m_pblk, map->m_len); - if (ret) { - retval = ret; - goto out_sem; - } - } - - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - if (ext4_es_is_written(&es)) - goto out_sem; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } - -out_sem: + retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -735,7 +862,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ - if (!bh->b_page) { + if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } @@ -840,7 +967,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); - bh = sb_getblk(inode->i_sb, map.m_pblk); + /* + * Since bh could introduce extra ref count such as referred by + * journal_head etc. Try to avoid using __GFP_MOVABLE here + * as it may fail the migration when journal_head remains. + */ + bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, + inode->i_sb->s_blocksize); + if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { @@ -982,41 +1116,30 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - int dirty = buffer_dirty(bh); - int ret; - if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - /* - * __block_write_begin() could have dirtied some buffers. Clean - * the dirty bit as jbd2_journal_get_write_access() could complain - * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_write_begin() isn't a real problem here as we clear - * the bit before releasing a page lock and thus writeback cannot - * ever write the buffer. - */ - if (dirty) - clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (!ret && dirty) - ret = ext4_dirty_journalled_data(handle, bh); - return ret; } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, - get_block_t *get_block) +int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; @@ -1027,10 +1150,10 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; + bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); @@ -1056,10 +1179,22 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { + /* + * We may be zeroing partial buffers or all new + * buffers in case of failure. Prepare JBD2 for + * that. + */ + if (should_journal_data) + do_journal_get_write_access(handle, + inode, bh); if (folio_test_uptodate(folio)) { - clear_buffer_new(bh); + /* + * Unlike __block_write_begin() we leave + * dirtying of new uptodate buffers to + * ->write_end() time or + * folio_zero_new_buffers(). + */ set_buffer_uptodate(bh); - mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) @@ -1089,7 +1224,11 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - folio_zero_new_buffers(folio, from, to); + if (should_journal_data) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + else + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1105,7 +1244,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, return err; } -#endif /* * To preserve ordering, it is essential that the hole instantiation and @@ -1116,7 +1254,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; @@ -1125,9 +1263,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; + fgf_t fgp = FGP_WRITEBEGIN; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; trace_ext4_write_begin(inode, pos, len); /* @@ -1136,12 +1276,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - pagep); + foliop); if (ret < 0) return ret; if (ret == 1) @@ -1156,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. @@ -1187,19 +1333,12 @@ retry_journal: /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); -#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(folio, pos, len, + ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); -#else - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, - ext4_get_block_unwritten); - else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); -#endif + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -1212,7 +1351,7 @@ retry_journal: folio_unlock(folio); /* - * __block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * @@ -1241,7 +1380,7 @@ retry_journal: folio_put(folio); return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -1269,9 +1408,8 @@ static int write_end_fn(handle_t *handle, struct inode *inode, static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1286,7 +1424,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1299,8 +1437,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock @@ -1360,9 +1500,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; folio_zero_range(folio, start, size); - write_end_fn(handle, inode, bh); } clear_buffer_new(bh); + write_end_fn(handle, inode, bh); } } block_start = block_end; @@ -1373,9 +1513,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1416,8 +1555,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1450,9 +1591,9 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve space for a single cluster + * Reserve space for 'nr_resv' clusters */ -static int ext4_da_reserve_space(struct inode *inode) +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1463,18 +1604,18 @@ static int ext4_da_reserve_space(struct inode *inode) * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); - if (ext4_claim_free_clusters(sbi, 1, 0)) { + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } - ei->i_reserved_data_blocks++; - trace_ext4_da_reserve_space(inode); + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1621,24 +1762,58 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block - * count or making a pending reservation - * where needed + * Check whether the cluster containing lblk has been allocated or has + * delalloc reservation. + * + * Returns 0 if the cluster doesn't have either, 1 if it has delalloc + * reservation, 2 if it's already been allocated, negative error code on + * failure. + */ +static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + /* Has delalloc reservation? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) + return 1; + + /* Already been allocated? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) + return 2; + ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); + if (ret < 0) + return ret; + if (ret > 0) + return 2; + + return 0; +} + +/* + * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents + * status tree, incrementing the reserved + * cluster/block count or making pending + * reservations where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added + * @lblk - start logical block to be added + * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ -static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; - bool allocated = false; + bool lclu_allocated = false; + bool end_allocated = false; + ext4_lblk_t resv_clu; + ext4_lblk_t end = lblk + len - 1; /* - * If the cluster containing lblk is shared with a delayed, + * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's @@ -1649,81 +1824,86 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { - ret = ext4_da_reserve_space(inode); + ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ - if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { - if (!ext4_es_scan_clu(inode, - &ext4_es_is_mapped, lblk)) { - ret = ext4_clu_mapped(inode, - EXT4_B2C(sbi, lblk)); - if (ret < 0) - return ret; - if (ret == 0) { - ret = ext4_da_reserve_space(inode); - if (ret != 0) /* ENOSPC */ - return ret; - } else { - allocated = true; - } - } else { - allocated = true; + resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; + + ret = ext4_clu_alloc_state(inode, lblk); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + lclu_allocated = (ret == 2); + } + + if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { + ret = ext4_clu_alloc_state(inode, end); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + end_allocated = (ret == 2); } } + + if (resv_clu) { + ret = ext4_da_reserve_space(inode, resv_clu); + if (ret != 0) /* ENOSPC */ + return ret; + } } - ext4_es_insert_delayed_block(inode, lblk, allocated); + ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, + end_allocated); return 0; } /* - * This function is grabs code from the very beginning of - * ext4_map_blocks, but assumes that the caller is from delayed write - * time. This function looks up the requested blocks and sets the - * buffer delay bit under the protection of i_data_sem. + * Looks up the requested blocks and sets the delalloc extent map. + * First try to look up for the extent entry that contains the requested + * blocks in the extent status tree without i_data_sem, then try to look + * up for the ondisk extent mapping with i_data_sem in read mode, + * finally hold i_data_sem in write mode, looks up again and add a + * delalloc extent entry if it still couldn't find any extent. Pass out + * the mapped extent through @map and return 0 on success. */ -static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, - struct ext4_map_blocks *map, - struct buffer_head *bh) +static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; - sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; - map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + if (ext4_es_is_hole(&es)) goto add_delayed; +found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delayed(&es)) { + map->m_flags |= EXT4_MAP_DELAYED; return 0; } - map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; - retval = es.es_len - (iblock - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1734,7 +1914,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif - return retval; + return 0; } /* @@ -1744,44 +1924,41 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; - else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); - if (retval < 0) { - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); - retval = ext4_insert_delayed_block(inode, map->m_lblk); + /* + * Page fault path (ext4_page_mkwrite does not take i_rwsem) + * and fallocate path (no folio lock) can race. Make sure we + * lookup the extent status tree here again while i_data_sem + * is held in write mode, before inserting a new da entry in + * the extent status tree. + */ + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + + if (!ext4_es_is_hole(&es)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto found; + } + } else if (!ext4_has_inline_data(inode)) { + retval = ext4_map_query_blocks(NULL, inode, map, 0); + if (retval) { + up_write(&EXT4_I(inode)->i_data_sem); + return retval < 0 ? retval : 0; + } + } + + map->m_flags |= EXT4_MAP_DELAYED; + retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); - if (retval) - return retval; - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); return retval; } @@ -1801,11 +1978,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; + sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + map.m_lblk = iblock; map.m_len = 1; @@ -1814,10 +1995,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_da_map_blocks(inode, iblock, &map, bh); - if (ret <= 0) + ret = ext4_da_map_blocks(inode, &map); + if (ret < 0) return ret; + if (map.m_flags & EXT4_MAP_DELAYED) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); @@ -1865,10 +2053,10 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2091,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2102,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2144,21 +2332,18 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if - * the blocks in question are delalloc blocks. This indicates - * that the blocks and quotas has already been checked when - * the data was copied into the page cache. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (map->m_flags & BIT(BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) @@ -2169,7 +2354,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } - ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); @@ -2217,7 +2402,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2294,7 +2479,7 @@ update_disksize: */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); @@ -2330,11 +2515,11 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size - folio_pos(folio); + len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } @@ -2372,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -2543,10 +2728,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { - ret = -EROFS; + ret = ext4_emergency_state(mapping->host->i_sb); + if (unlikely(ret)) goto out_writepages; - } /* * If we have inline data and arrive here, it means that @@ -2761,8 +2945,9 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); @@ -2802,8 +2987,9 @@ static int ext4_dax_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); @@ -2852,29 +3038,31 @@ static int ext4_nonda_switch(struct super_block *sb) static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; + fgf_t fgp = FGP_WRITEBEGIN; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, - len, pagep, fsdata); + len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - pagep, fsdata); + ret = ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) @@ -2882,19 +3070,17 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - /* In case writeback began while the folio was unlocked */ - folio_wait_stable(folio); + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; -#ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); -#else - ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); -#endif + ret = ext4_block_write_begin(NULL, folio, pos, len, + ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); @@ -2912,7 +3098,7 @@ retry: return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -2946,14 +3132,20 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size; + loff_t new_i_size, zero_len = 0; + handle_t *handle; + if (unlikely(!folio_buffers(folio))) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, - &folio->page, NULL); + folio, NULL); new_i_size = pos + copied; /* @@ -2975,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, unsigned long end; i_size_write(inode, new_i_size); - end = (new_i_size - 1) & (PAGE_SIZE - 1); + end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; @@ -2985,18 +3177,21 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (old_size < pos) + if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); + zero_len = pos - old_size; + } - if (disksize_changed) { - handle_t *handle; + if (!disksize_changed && !zero_len) + return copied; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); return copied; } @@ -3004,15 +3199,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; - struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, &folio->page, fsdata); + len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -3232,6 +3426,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + /* HW-offload atomics are always used */ + if (flags & IOMAP_ATOMIC) + iomap->flags |= IOMAP_F_ATOMIC_BIO; + if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else @@ -3271,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, } } +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. We use + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapped mapping. This will ensure any unwritten or hole + * regions within the requested range is zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. + */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. + */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in above loop, so we need to query the + * start of the physical extent, based on the origin m_lblk and m_len. + * Let's also ensure we were able to allocate the required range for + * mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying + * physical extent for the requested range does not have a single contiguous + * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. + * In that case we will loop over the requested range to allocate and zero out + * the unwritten / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). Note that this is only possible because we know + * this can be called only with bigalloc enabled filesystem where the underlying + * cluster is already allocated. This avoids allocating discontiguous extents + * in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly non-performance critical path, so it should be ok to + * loop using ext4_map_blocks() with appropriate flags to allocate & zero the + * underlying short holes/unwritten extents within the requested range. + */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. + */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at @@ -3284,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + + /* + * journal credits estimation for atomic writes. We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3315,7 +3673,11 @@ retry: else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3329,6 +3691,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. + */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3339,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3352,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3362,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3380,6 +3772,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. + */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3401,17 +3803,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } +static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) +{ + /* must be a directio to fall back to buffered */ + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != + (IOMAP_WRITE | IOMAP_DIRECT)) + return false; + + /* atomic writes are all-or-nothing */ + if (flags & IOMAP_ATOMIC) + return false; + + /* can only try again if we wrote nothing */ + return written == 0; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. + * the allocated blocks. If so, return the magic error code for + * non-atomic write so that we fallback to buffered I/O and attempt to + * complete the remainder of the I/O. + * For non-atomic writes, any blocks that may have been + * allocated in preparation for the direct I/O will be reused during + * buffered I/O. For atomic write, we never fallback to buffered-io. */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; @@ -3530,7 +3949,6 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3547,7 +3965,6 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3564,7 +3981,6 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3573,7 +3989,6 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, @@ -3608,9 +4023,7 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3625,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3831,6 +4245,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. */ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on disk + * in case of crash before freeing or unwritten converting trans + * is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. + */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3840,24 +4316,10 @@ static void ext4_wait_dax_page(struct inode *inode) int ext4_break_layouts(struct inode *inode) { - struct page *page; - int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; - do { - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - error = ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, - TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(inode)); - } while (error == 0); - - return error; + return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* @@ -3875,91 +4337,56 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset, max_length; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = sb->s_maxbytes; + loff_t end = offset + length; handle_t *handle; unsigned int credits; - int ret = 0, ret2 = 0; + int ret; trace_ext4_punch_hole(inode, offset, length, 0); + WARN_ON_ONCE(!inode_is_locked(inode)); /* - * Write out all dirty pages to avoid race conditions - * Then release them. + * For indirect-block based inodes, make sure that the hole within + * one block before last range. */ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - - inode_lock(inode); + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; + if (offset >= inode->i_size || offset >= max_end) + return 0; /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size + * If the hole extends beyond i_size, set the hole to end after + * the page that contains i_size. */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - - offset; - } + if (end > inode->i_size) + end = round_up(inode->i_size, PAGE_SIZE); + if (end > max_end) + end = max_end; + length = end - offset; /* - * For punch hole the length + offset needs to be within one block - * before last range. Adjust the length if it goes beyond that limit. + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block. */ - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; - if (offset + length > max_length) - length = max_length - offset; - - if (offset & (sb->s_blocksize - 1) || - (offset + length) & (sb->s_blocksize - 1)) { - /* - * Attach jinode to inode for jbd2 if we do any zeroing of - * partial block - */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) - goto out_mutex; - + return ret; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - ret = ext4_break_layouts(inode); + ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) - goto out_dio; - - first_block_offset = round_up(offset, sb->s_blocksize); - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + return ret; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) { - ret = ext4_update_disksize_before_punch(inode, offset, length); - if (ret) - goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - } + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3969,54 +4396,53 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); - goto out_dio; + return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, - length); + ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) - goto out_stop; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + goto out_handle; /* If there are blocks to remove, do it */ - if (stop_block > first_block) { - ext4_lblk_t hole_len = stop_block - first_block; + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, hole_len); + ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } - ext4_es_insert_extent(inode, first_block, hole_len, ~0, - EXTENT_STATUS_HOLE); + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, + EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(handle, inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - ret2 = ext4_mark_inode_dirty(handle, inode); - if (unlikely(ret2)) - ret = ret2; - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_dio: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -4138,8 +4564,10 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; - down_write(&EXT4_I(inode)->i_data_sem); + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4458,10 +4886,10 @@ make_io: * Read the block from disk. */ trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, + ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; @@ -4603,6 +5031,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode, *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; + err = xattr_check_inode(inode, IHDR(inode, raw_inode), + ITAIL(inode, raw_inode)); + if (err) + return err; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) @@ -4634,22 +5067,60 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) { + const char *err_str; + if (flags & EXT4_IGET_EA_INODE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "missing EA_INODE flag"; + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - EXT4_I(inode)->i_file_acl) - return "ea_inode with extended attributes"; + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } } else { - if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "unexpected EA_INODE flag"; + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } } - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) - return "unexpected bad inode w/o EXT4_IGET_BAD"; - return NULL; + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; + } + return 0; + +error: + ext4_error_inode(inode, function, line, 0, "%s", err_str); + return -EFSCORRUPTED; +} + +bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4661,7 +5132,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; - const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4670,12 +5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) @@ -4690,10 +5155,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); + ret = check_igot_inode(inode, flags, function, line); + if (ret) { iput(inode); - return ERR_PTR(-EFSCORRUPTED); + return ERR_PTR(ret); } return inode; } @@ -4729,15 +5194,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || @@ -4805,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4816,7 +5281,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ - if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + if (!ext4_has_feature_dir_index(sb) && + ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); @@ -4935,10 +5401,19 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } @@ -4965,13 +5440,24 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); - ret = -EFSCORRUPTED; - goto bad_inode; - } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. + */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); + } + if (ret) + goto bad_inode; brelse(iloc.bh); + unlock_new_inode(inode); return inode; @@ -5156,8 +5642,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { @@ -5207,8 +5694,9 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = 0; + tid_t commit_tid; int ret; + bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* @@ -5233,12 +5721,14 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) folio_put(folio); if (ret != -EBUSY) return; - commit_tid = 0; + has_transaction = false; read_lock(&journal->j_state_lock); - if (journal->j_committing_transaction) + if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } read_unlock(&journal->j_state_lock); - if (commit_tid) + if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } @@ -5276,8 +5766,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + error = ext4_emergency_state(inode->i_sb); + if (unlikely(error)) + return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -5384,6 +5875,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto out_mmap_sem; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5394,12 +5893,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. */ - if (!shrink) + if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } if (shrink) ext4_fc_track_range(handle, inode, @@ -5417,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5430,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; @@ -5536,6 +6041,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; @@ -5614,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5623,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and lead blocks need to touch to map @lblocks + * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks + pextents; + groups = idxblocks; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -5642,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); @@ -5662,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_writepage_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); @@ -5696,9 +6210,10 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) { + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) { put_bh(iloc->bh); - return -EIO; + return err; } ext4_fc_track_inode(handle, inode); @@ -5722,8 +6237,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { @@ -5734,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; @@ -6147,7 +6664,8 @@ retry_alloc: if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); + err = ext4_block_write_begin(handle, folio, 0, len, + ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) |