diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 697 |
1 files changed, 399 insertions, 298 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2ccf3b5e3a7c..7c54ae5fcbd4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -49,6 +49,11 @@ #include <trace/events/ext4.h> +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to); + static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -453,6 +458,117 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map) +{ + unsigned int status; + int retval; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, 0); + else + retval = ext4_ind_map_blocks(handle, inode, map, 0); + + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; +} + +static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + unsigned int status; + int err, retval = 0; + + /* + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE + * indicates that the blocks and quotas has already been + * checked when the data was copied into the page cache. + */ + if (map->m_flags & EXT4_MAP_DELAYED) + flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + /* + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * We need to check for EXT4 here because migrate could have + * changed the inode type in between. + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + /* + * We allocated new blocks which will result in i_data's + * format changing. Force the migrate to fail by clearing + * migrate flags. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode %lu: " + "retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, + map->m_len); + if (err) + return err; + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if (flags & EXT4_GET_BLOCKS_PRE_IO && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_is_written(&es)) + return retval; + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, + status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -465,9 +581,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. if - * create==0 and the blocks are pre-allocated and unwritten, the resulting @map - * is marked as unwritten. If the create == 1, it will mark @map as mapped. + * On success, it returns the number of blocks being mapped or allocated. + * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are + * pre-allocated and unwritten, the resulting @map is marked as unwritten. + * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to @@ -546,32 +663,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } + retval = ext4_map_query_blocks(handle, inode, map); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -589,8 +681,7 @@ found: * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns the create = 0 - * with buffer head unmapped. + * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* @@ -602,88 +693,13 @@ found: return retval; /* - * Here we clear m_flags because after allocating an new extent, - * it will be set again. - */ - map->m_flags &= ~EXT4_MAP_FLAGS; - - /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); - - /* - * We need to check for EXT4 here because migrate - * could have changed the inode type in between - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { - /* - * We allocated new blocks which will result in - * i_data's format changing. Force the migrate - * to fail by clearing migrate flags - */ - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - } - } - - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - /* - * We have to zeroout blocks before inserting them into extent - * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. We also have to - * unmap metadata before zeroing as otherwise writeback can - * overwrite zeros with stale data from block device. - */ - if (flags & EXT4_GET_BLOCKS_ZERO && - map->m_flags & EXT4_MAP_MAPPED && - map->m_flags & EXT4_MAP_NEW) { - ret = ext4_issue_zeroout(inode, map->m_lblk, - map->m_pblk, map->m_len); - if (ret) { - retval = ret; - goto out_sem; - } - } - - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - if (ext4_es_is_written(&es)) - goto out_sem; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } - -out_sem: + retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -840,7 +856,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); - bh = sb_getblk(inode->i_sb, map.m_pblk); + /* + * Since bh could introduce extra ref count such as referred by + * journal_head etc. Try to avoid using __GFP_MOVABLE here + * as it may fail the migration when journal_head remains. + */ + bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, + inode->i_sb->s_blocksize); + if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { @@ -989,32 +1012,16 @@ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - int dirty = buffer_dirty(bh); - int ret; - if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - /* - * __block_write_begin() could have dirtied some buffers. Clean - * the dirty bit as jbd2_journal_get_write_access() could complain - * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_write_begin() isn't a real problem here as we clear - * the bit before releasing a page lock and thus writeback cannot - * ever write the buffer. - */ - if (dirty) - clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (!ret && dirty) - ret = ext4_dirty_journalled_data(handle, bh); - return ret; } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, - get_block_t *get_block) +int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -1027,6 +1034,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; + bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); @@ -1056,10 +1064,22 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { + /* + * We may be zeroing partial buffers or all new + * buffers in case of failure. Prepare JBD2 for + * that. + */ + if (should_journal_data) + do_journal_get_write_access(handle, + inode, bh); if (folio_test_uptodate(folio)) { - clear_buffer_new(bh); + /* + * Unlike __block_write_begin() we leave + * dirtying of new uptodate buffers to + * ->write_end() time or + * folio_zero_new_buffers(). + */ set_buffer_uptodate(bh); - mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) @@ -1089,7 +1109,11 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - folio_zero_new_buffers(folio, from, to); + if (should_journal_data) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + else + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1105,7 +1129,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, return err; } -#endif /* * To preserve ordering, it is essential that the hole instantiation and @@ -1116,7 +1139,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; @@ -1141,7 +1164,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - pagep); + foliop); if (ret < 0) return ret; if (ret == 1) @@ -1187,19 +1210,12 @@ retry_journal: /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); -#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(folio, pos, len, + ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); -#else - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, - ext4_get_block_unwritten); - else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); -#endif + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -1212,7 +1228,7 @@ retry_journal: folio_unlock(folio); /* - * __block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * @@ -1241,7 +1257,7 @@ retry_journal: folio_put(folio); return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -1269,9 +1285,8 @@ static int write_end_fn(handle_t *handle, struct inode *inode, static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1286,7 +1301,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1299,8 +1314,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock @@ -1360,9 +1377,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; folio_zero_range(folio, start, size); - write_end_fn(handle, inode, bh); } clear_buffer_new(bh); + write_end_fn(handle, inode, bh); } } block_start = block_end; @@ -1373,9 +1390,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1416,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1450,9 +1468,9 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve space for a single cluster + * Reserve space for 'nr_resv' clusters */ -static int ext4_da_reserve_space(struct inode *inode) +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1463,18 +1481,18 @@ static int ext4_da_reserve_space(struct inode *inode) * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); - if (ext4_claim_free_clusters(sbi, 1, 0)) { + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } - ei->i_reserved_data_blocks++; - trace_ext4_da_reserve_space(inode); + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1621,24 +1639,58 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block - * count or making a pending reservation - * where needed + * Check whether the cluster containing lblk has been allocated or has + * delalloc reservation. + * + * Returns 0 if the cluster doesn't have either, 1 if it has delalloc + * reservation, 2 if it's already been allocated, negative error code on + * failure. + */ +static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + /* Has delalloc reservation? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) + return 1; + + /* Already been allocated? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) + return 2; + ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); + if (ret < 0) + return ret; + if (ret > 0) + return 2; + + return 0; +} + +/* + * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents + * status tree, incrementing the reserved + * cluster/block count or making pending + * reservations where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added + * @lblk - start logical block to be added + * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ -static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; - bool allocated = false; + bool lclu_allocated = false; + bool end_allocated = false; + ext4_lblk_t resv_clu; + ext4_lblk_t end = lblk + len - 1; /* - * If the cluster containing lblk is shared with a delayed, + * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's @@ -1649,81 +1701,84 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { - ret = ext4_da_reserve_space(inode); + ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ - if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { - if (!ext4_es_scan_clu(inode, - &ext4_es_is_mapped, lblk)) { - ret = ext4_clu_mapped(inode, - EXT4_B2C(sbi, lblk)); - if (ret < 0) - return ret; - if (ret == 0) { - ret = ext4_da_reserve_space(inode); - if (ret != 0) /* ENOSPC */ - return ret; - } else { - allocated = true; - } - } else { - allocated = true; + resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; + + ret = ext4_clu_alloc_state(inode, lblk); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + lclu_allocated = (ret == 2); + } + + if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { + ret = ext4_clu_alloc_state(inode, end); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + end_allocated = (ret == 2); } } + + if (resv_clu) { + ret = ext4_da_reserve_space(inode, resv_clu); + if (ret != 0) /* ENOSPC */ + return ret; + } } - ext4_es_insert_delayed_block(inode, lblk, allocated); + ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, + end_allocated); return 0; } /* - * This function is grabs code from the very beginning of - * ext4_map_blocks, but assumes that the caller is from delayed write - * time. This function looks up the requested blocks and sets the - * buffer delay bit under the protection of i_data_sem. + * Looks up the requested blocks and sets the delalloc extent map. + * First try to look up for the extent entry that contains the requested + * blocks in the extent status tree without i_data_sem, then try to look + * up for the ondisk extent mapping with i_data_sem in read mode, + * finally hold i_data_sem in write mode, looks up again and add a + * delalloc extent entry if it still couldn't find any extent. Pass out + * the mapped extent through @map and return 0 on success. */ -static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, - struct ext4_map_blocks *map, - struct buffer_head *bh) +static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; - sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; - map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + if (ext4_es_is_hole(&es)) goto add_delayed; +found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delayed(&es)) { + map->m_flags |= EXT4_MAP_DELAYED; return 0; } - map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; - retval = es.es_len - (iblock - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1734,7 +1789,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif - return retval; + return 0; } /* @@ -1744,44 +1799,41 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; - else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); - if (retval < 0) { - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } + retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); - retval = ext4_insert_delayed_block(inode, map->m_lblk); + /* + * Page fault path (ext4_page_mkwrite does not take i_rwsem) + * and fallocate path (no folio lock) can race. Make sure we + * lookup the extent status tree here again while i_data_sem + * is held in write mode, before inserting a new da entry in + * the extent status tree. + */ + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + + if (!ext4_es_is_hole(&es)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto found; + } + } else if (!ext4_has_inline_data(inode)) { + retval = ext4_map_query_blocks(NULL, inode, map); + if (retval) { + up_write(&EXT4_I(inode)->i_data_sem); + return retval < 0 ? retval : 0; + } + } + + map->m_flags |= EXT4_MAP_DELAYED; + retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); - if (retval) - return retval; - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); return retval; } @@ -1801,11 +1853,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; + sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + map.m_lblk = iblock; map.m_len = 1; @@ -1814,10 +1870,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_da_map_blocks(inode, iblock, &map, bh); - if (ret <= 0) + ret = ext4_da_map_blocks(inode, &map); + if (ret < 0) return ret; + if (map.m_flags & EXT4_MAP_DELAYED) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); @@ -1865,7 +1928,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; @@ -2145,11 +2208,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if - * the blocks in question are delalloc blocks. This indicates - * that the blocks and quotas has already been checked when - * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | @@ -2157,8 +2215,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (map->m_flags & BIT(BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) @@ -2334,7 +2390,7 @@ static int mpage_journal_page_buffers(handle_t *handle, if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size - folio_pos(folio); + len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } @@ -2852,7 +2908,7 @@ static int ext4_nonda_switch(struct super_block *sb) static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; @@ -2867,14 +2923,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, - len, pagep, fsdata); + len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - pagep, fsdata); + foliop, fsdata); if (ret < 0) return ret; if (ret == 1) @@ -2887,14 +2943,8 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); - /* In case writeback began while the folio was unlocked */ - folio_wait_stable(folio); - -#ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); -#else - ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); -#endif + ret = ext4_block_write_begin(NULL, folio, pos, len, + ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); @@ -2912,7 +2962,7 @@ retry: return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -2946,14 +2996,20 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size; + loff_t new_i_size, zero_len = 0; + handle_t *handle; + if (unlikely(!folio_buffers(folio))) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, - &folio->page, NULL); + folio, NULL); new_i_size = pos + copied; /* @@ -2985,18 +3041,21 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (old_size < pos) + if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); + zero_len = pos - old_size; + } - if (disksize_changed) { - handle_t *handle; + if (!disksize_changed && !zero_len) + return copied; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); return copied; } @@ -3004,15 +3063,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; - struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, &folio->page, fsdata); + len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -3401,17 +3459,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } +static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) +{ + /* must be a directio to fall back to buffered */ + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != + (IOMAP_WRITE | IOMAP_DIRECT)) + return false; + + /* atomic writes are all-or-nothing */ + if (flags & IOMAP_ATOMIC) + return false; + + /* can only try again if we wrote nothing */ + return written == 0; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. + * the allocated blocks. If so, return the magic error code for + * non-atomic write so that we fallback to buffered I/O and attempt to + * complete the remainder of the I/O. + * For non-atomic writes, any blocks that may have been + * allocated in preparation for the direct I/O will be reused during + * buffered I/O. For atomic write, we never fallback to buffered-io. */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; @@ -3530,7 +3605,6 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3547,7 +3621,6 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3564,7 +3637,6 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3573,7 +3645,6 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, @@ -3998,7 +4069,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) stop_block); ext4_es_insert_extent(inode, first_block, hole_len, ~0, - EXTENT_STATUS_HOLE); + EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); @@ -4458,10 +4529,10 @@ make_io: * Read the block from disk. */ trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, + ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; @@ -4935,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } @@ -5207,8 +5279,9 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = 0; + tid_t commit_tid; int ret; + bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* @@ -5233,12 +5306,14 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) folio_put(folio); if (ret != -EBUSY) return; - commit_tid = 0; + has_transaction = false; read_lock(&journal->j_state_lock); - if (journal->j_committing_transaction) + if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } read_unlock(&journal->j_state_lock); - if (commit_tid) + if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } @@ -5384,6 +5459,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto err_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5394,12 +5477,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. */ - if (!shrink) + if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } if (shrink) ext4_fc_track_range(handle, inode, @@ -5536,6 +5624,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; @@ -6147,7 +6247,8 @@ retry_alloc: if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); + err = ext4_block_write_begin(handle, folio, 0, len, + ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) |