diff options
Diffstat (limited to 'fs/ext4/fast_commit.c')
-rw-r--r-- | fs/ext4/fast_commit.c | 941 |
1 files changed, 932 insertions, 9 deletions
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 888d9d217d5b..426c0ab8b70c 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -166,7 +166,8 @@ void ext4_fc_start_update(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 		return;
 
 restart:
@@ -205,7 +206,8 @@ void ext4_fc_stop_update(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 		return;
 
 	if (atomic_dec_and_test(&ei->i_fc_updates))
@@ -220,11 +222,8 @@ void ext4_fc_del(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
-		return;
-
-
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 		return;
 
 restart:
@@ -266,6 +265,10 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
+	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+		return;
+
 	sbi->s_mount_state |= EXT4_FC_INELIGIBLE;
 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
@@ -279,6 +282,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
+	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+		return;
+
 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 	atomic_inc(&sbi->s_fc_ineligible_updates);
@@ -291,6 +298,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
  */
 void ext4_fc_stop_ineligible(struct super_block *sb)
 {
+	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+		return;
+
 	EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE;
 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 }
@@ -321,7 +332,8 @@ static int ext4_fc_track_template(
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	int ret;
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 		return -EOPNOTSUPP;
 
 	if (ext4_fc_is_ineligible(inode->i_sb))
@@ -1188,13 +1200,924 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
 	trace_ext4_fc_stats(sb);
 }
 
+/* Ext4 Replay Path Routines */
+
+/* Get length of a particular tlv */
+static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
+{
+	return le16_to_cpu(tl->fc_len);
+}
+
+/* Get a pointer to "value" of a tlv */
+static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
+{
+	return (u8 *)tl + sizeof(*tl);
+}
+
+/* Helper struct for dentry replay routines */
+struct dentry_info_args {
+	int parent_ino, dname_len, ino, inode_len;
+	char *dname;
+};
+
+/* Decode the dentry info carried by @tl into @darg; dname points into @tl. */
+static inline void tl_to_darg(struct dentry_info_args *darg,
+				struct ext4_fc_tl *tl)
+{
+	struct ext4_fc_dentry_info *fcd;
+
+	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
+
+	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
+	darg->ino = le32_to_cpu(fcd->fc_ino);
+	darg->dname = fcd->fc_dname;
+	darg->dname_len = ext4_fc_tag_len(tl) -
+			sizeof(struct ext4_fc_dentry_info);
+}
+
+/* Unlink replay function */
+static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct inode *inode, *old_parent;
+	struct qstr entry;
+	struct dentry_info_args darg;
+	int ret = 0;
+
+	tl_to_darg(&darg, tl);
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
+			darg.parent_ino, darg.dname_len);
+
+	entry.name = darg.dname;
+	entry.len = darg.dname_len;
+	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode %d not found", darg.ino);
+		return 0;
+	}
+
+	old_parent = ext4_iget(sb, darg.parent_ino,
+				EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(old_parent)) {
+		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
+		iput(inode);
+		return 0;
+	}
+
+	ret = __ext4_unlink(old_parent, &entry, inode);
+	/* -ENOENT ok coz it might not exist anymore. */
+	if (ret == -ENOENT)
+		ret = 0;
+	iput(old_parent);
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Add a directory entry named darg->dname in directory darg->parent_ino that
+ * points to @inode. A pre-existing entry (-EEXIST) is treated as success.
+ */
+static int ext4_fc_replay_link_internal(struct super_block *sb,
+				struct dentry_info_args *darg,
+				struct inode *inode)
+{
+	struct inode *dir = NULL;
+	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
+	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
+	int ret = 0;
+
+	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
+	if (IS_ERR(dir)) {
+		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
+		dir = NULL;
+		goto out;
+	}
+
+	dentry_dir = d_obtain_alias(dir);
+	if (IS_ERR(dentry_dir)) {
+		jbd_debug(1, "Failed to obtain dentry");
+		dentry_dir = NULL;
+		/* d_obtain_alias() drops the inode reference on failure. */
+		dir = NULL;
+		goto out;
+	}
+
+	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
+	if (!dentry_inode) {
+		jbd_debug(1, "Inode dentry not created.");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = __ext4_link(dir, inode, dentry_inode);
+	/*
+	 * It's possible that link already existed since data blocks
+	 * for the dir in question got persisted before we crashed OR
+	 * we replayed this tag and crashed before the entire replay
+	 * could complete.
+	 */
+	if (ret && ret != -EEXIST) {
+		jbd_debug(1, "Failed to link\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	if (dentry_dir) {
+		d_drop(dentry_dir);
+		dput(dentry_dir);
+	} else if (dir) {
+		iput(dir);
+	}
+	if (dentry_inode) {
+		d_drop(dentry_inode);
+		dput(dentry_inode);
+	}
+
+	return ret;
+}
+
+/* Link replay function */
+static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct inode *inode;
+	struct dentry_info_args darg;
+	int ret = 0;
+
+	tl_to_darg(&darg, tl);
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
+			darg.parent_ino, darg.dname_len);
+
+	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode not found.");
+		return 0;
+	}
+
+	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Record all the modified inodes during replay. We use this later to setup
+ * block bitmaps correctly. Returns 0 on success or if @ino is already
+ * recorded, and -ENOMEM if the array could not be grown.
+ */
+static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
+{
+	struct ext4_fc_replay_state *state;
+	int i;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	for (i = 0; i < state->fc_modified_inodes_used; i++)
+		if (state->fc_modified_inodes[i] == ino)
+			return 0;
+	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
+		int *fc_modified_inodes;
+
+		/*
+		 * Grow through a temporary so that the old array (and its
+		 * size) stay valid and owned by @state if krealloc() fails.
+		 */
+		fc_modified_inodes = krealloc(state->fc_modified_inodes,
+				sizeof(int) * (state->fc_modified_inodes_size +
+				EXT4_FC_REPLAY_REALLOC_INCREMENT),
+				GFP_KERNEL);
+		if (!fc_modified_inodes)
+			return -ENOMEM;
+		state->fc_modified_inodes = fc_modified_inodes;
+		state->fc_modified_inodes_size +=
+			EXT4_FC_REPLAY_REALLOC_INCREMENT;
+	}
+	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
+	return 0;
+}
+
+/*
+ * Inode replay function
+ *
+ * Copies the raw inode carried by the tag over the on-disk inode. Apart from
+ * -EFSCORRUPTED when the inode cannot be read back, errors are not
+ * propagated and 0 is returned.
+ */
+static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct ext4_fc_inode *fc_inode;
+	struct ext4_inode *raw_inode;
+	struct ext4_inode *raw_fc_inode;
+	struct inode *inode = NULL;
+	struct ext4_iloc iloc;
+	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
+	struct ext4_extent_header *eh;
+
+	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
+
+	ino = le32_to_cpu(fc_inode->fc_ino);
+	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
+
+	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+	if (!IS_ERR_OR_NULL(inode)) {
+		ext4_ext_clear_bb(inode);
+		iput(inode);
+	}
+	/*
+	 * The reference taken above (if any) is gone and the pointer may even
+	 * be an ERR_PTR; clear it so that the iput() at "out" is a no-op.
+	 */
+	inode = NULL;
+
+	ext4_fc_record_modified_inode(sb, ino);
+
+	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
+	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
+	if (ret)
+		goto out;
+
+	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
+	raw_inode = ext4_raw_inode(&iloc);
+
+	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
+	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
+		inode_len - offsetof(struct ext4_inode, i_generation));
+	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
+		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
+		if (eh->eh_magic != EXT4_EXT_MAGIC) {
+			memset(eh, 0, sizeof(*eh));
+			eh->eh_magic = EXT4_EXT_MAGIC;
+			eh->eh_max = cpu_to_le16(
+				(sizeof(raw_inode->i_block) -
+				 sizeof(struct ext4_extent_header))
+				 / sizeof(struct ext4_extent));
+		}
+	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
+		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
+			sizeof(raw_inode->i_block));
+	}
+
+	/* Immediately update the inode on disk. */
+	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+	if (ret)
+		goto out_brelse;
+	ret = sync_dirty_buffer(iloc.bh);
+	if (ret)
+		goto out_brelse;
+	ret = ext4_mark_inode_used(sb, ino);
+	if (ret)
+		goto out_brelse;
+
+	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
+	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode not found.");
+		brelse(iloc.bh);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Our allocator could have made different decisions than before
+	 * crashing. This should be fixed but until then, we calculate
+	 * the number of blocks the inode uses.
+	 */
+	ext4_ext_replay_set_iblocks(inode);
+
+	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
+	ext4_reset_inode_seed(inode);
+
+	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
+	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+	sync_dirty_buffer(iloc.bh);
+out_brelse:
+	brelse(iloc.bh);
+out:
+	iput(inode);
+	if (!ret)
+		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+
+	return 0;
+}
+
+/*
+ * Dentry create replay function.
+ *
+ * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE. Which means, the
+ * inode for which we are trying to create a dentry here, should already have
+ * been replayed before we start here.
+ */
+static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	int ret = 0;
+	struct inode *inode = NULL;
+	struct inode *dir = NULL;
+	struct dentry_info_args darg;
+
+	tl_to_darg(&darg, tl);
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
+			darg.parent_ino, darg.dname_len);
+
+	/* This takes care of update group descriptor and other metadata */
+	ret = ext4_mark_inode_used(sb, darg.ino);
+	if (ret)
+		goto out;
+
+	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "inode %d not found.", darg.ino);
+		inode = NULL;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		/*
+		 * If we are creating a directory, we need to make sure that the
+		 * dot and dot dot dirents are setup properly.
+		 */
+		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
+		if (IS_ERR_OR_NULL(dir)) {
+			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
+			goto out;
+		}
+		ret = ext4_init_new_dir(NULL, dir, inode);
+		iput(dir);
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+	if (ret)
+		goto out;
+	set_nlink(inode, 1);
+	ext4_mark_inode_dirty(NULL, inode);
+out:
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+/*
+ * Record physical disk regions which are in use as per fast commit area. Our
+ * simple replay phase allocator excludes these regions from allocation.
+ * Returns 0 on success and -ENOMEM if the region array could not be grown.
+ */
+static int ext4_fc_record_regions(struct super_block *sb, int ino,
+		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
+{
+	struct ext4_fc_replay_state *state;
+	struct ext4_fc_alloc_region *region;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	if (state->fc_regions_used == state->fc_regions_size) {
+		struct ext4_fc_alloc_region *fc_regions;
+
+		/*
+		 * Grow through a temporary so that the old array (and its
+		 * size) stay valid and owned by @state if krealloc() fails.
+		 */
+		fc_regions = krealloc(state->fc_regions,
+				(state->fc_regions_size +
+				 EXT4_FC_REPLAY_REALLOC_INCREMENT) *
+				sizeof(struct ext4_fc_alloc_region),
+				GFP_KERNEL);
+		if (!fc_regions)
+			return -ENOMEM;
+		state->fc_regions = fc_regions;
+		state->fc_regions_size +=
+			EXT4_FC_REPLAY_REALLOC_INCREMENT;
+	}
+	region = &state->fc_regions[state->fc_regions_used++];
+	region->ino = ino;
+	region->lblk = lblk;
+	region->pblk = pblk;
+	region->len = len;
+
+	return 0;
+}
+
+/* Replay add range tag */
+static int ext4_fc_replay_add_range(struct super_block *sb,
+				struct ext4_fc_tl *tl)
+{
+	struct ext4_fc_add_range *fc_add_ex;
+	struct ext4_extent newex, *ex;
+	struct inode *inode;
+	ext4_lblk_t start, cur;
+	int remaining, len;
+	ext4_fsblk_t start_pblk;
+	struct ext4_map_blocks map;
+	struct ext4_ext_path *path = NULL;
+	int ret;
+
+	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
+	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
+		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
+		ext4_ext_get_actual_len(ex));
+
+	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
+				EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode not found.");
+		return 0;
+	}
+
+	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+
+	start = le32_to_cpu(ex->ee_block);
+	start_pblk = ext4_ext_pblock(ex);
+	len = ext4_ext_get_actual_len(ex);
+
+	cur = start;
+	remaining = len;
+	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
+		  inode->i_ino);
+
+	while (remaining > 0) {
+		map.m_lblk = cur;
+		map.m_len = remaining;
+		map.m_pblk = 0;
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+		if (ret < 0) {
+			iput(inode);
+			return 0;
+		}
+
+		if (ret == 0) {
+			/* Range is not mapped */
+			path = ext4_find_extent(inode, cur, NULL, 0);
+			if (IS_ERR_OR_NULL(path)) {
+				/*
+				 * Nothing changes between iterations, so
+				 * retrying the lookup would spin forever.
+				 */
+				iput(inode);
+				return 0;
+			}
+			memset(&newex, 0, sizeof(newex));
+			newex.ee_block = cpu_to_le32(cur);
+			ext4_ext_store_pblock(
+				&newex, start_pblk + cur - start);
+			newex.ee_len = cpu_to_le16(map.m_len);
+			if (ext4_ext_is_unwritten(ex))
+				ext4_ext_mark_unwritten(&newex);
+			down_write(&EXT4_I(inode)->i_data_sem);
+			ret = ext4_ext_insert_extent(
+				NULL, inode, &path, &newex, 0);
+			up_write((&EXT4_I(inode)->i_data_sem));
+			ext4_ext_drop_refs(path);
+			kfree(path);
+			if (ret) {
+				iput(inode);
+				return 0;
+			}
+			goto next;
+		}
+
+		if (start_pblk + cur - start != map.m_pblk) {
+			/*
+			 * Logical to physical mapping changed. This can happen
+			 * if this range was removed and then reallocated to
+			 * map to new physical blocks during a fast commit.
+			 */
+			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+					ext4_ext_is_unwritten(ex),
+					start_pblk + cur - start);
+			if (ret) {
+				iput(inode);
+				return 0;
+			}
+			/*
+			 * Mark the old blocks as free since they aren't used
+			 * anymore. We maintain an array of all the modified
+			 * inodes. In case these blocks are still used at either
+			 * a different logical range in the same inode or in
+			 * some different inode, we will mark them as allocated
+			 * at the end of the FC replay using our array of
+			 * modified inodes.
+			 */
+			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+			goto next;
+		}
+
+		/* Range is mapped and needs a state change */
+		jbd_debug(1, "Converting from %d to %d %lld",
+				map.m_flags & EXT4_MAP_UNWRITTEN,
+			ext4_ext_is_unwritten(ex), map.m_pblk);
+		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+					ext4_ext_is_unwritten(ex), map.m_pblk);
+		if (ret) {
+			iput(inode);
+			return 0;
+		}
+		/*
+		 * We may have split the extent tree while toggling the state.
+		 * Try to shrink the extent tree now.
+		 */
+		ext4_ext_replay_shrink_inode(inode, start + len);
+next:
+		cur += map.m_len;
+		remaining -= map.m_len;
+	}
+	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
+					sb->s_blocksize_bits);
+	iput(inode);
+	return 0;
+}
+
+/* Replay DEL_RANGE tag */
+static int
+ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct inode *inode;
+	struct ext4_fc_del_range *lrange;
+	struct ext4_map_blocks map;
+	ext4_lblk_t cur, remaining;
+	int ret;
+
+	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
+	cur = le32_to_cpu(lrange->fc_lblk);
+	remaining = le32_to_cpu(lrange->fc_len);
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
+		le32_to_cpu(lrange->fc_ino), cur, remaining);
+
+	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
+		return 0;
+	}
+
+	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+
+	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
+			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
+			le32_to_cpu(lrange->fc_len));
+	while (remaining > 0) {
+		map.m_lblk = cur;
+		map.m_len = remaining;
+
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+		if (ret < 0) {
+			iput(inode);
+			return 0;
+		}
+		if (ret > 0) {
+			remaining -= ret;
+			cur += ret;
+			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+		} else {
+			remaining -= map.m_len;
+			cur += map.m_len;
+		}
+	}
+
+	ret = ext4_punch_hole(inode,
+		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
+		le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
+	if (ret)
+		jbd_debug(1, "ext4_punch_hole returned %d", ret);
+	ext4_ext_replay_shrink_inode(inode,
+		i_size_read(inode) >> sb->s_blocksize_bits);
+	ext4_mark_inode_dirty(NULL, inode);
+	iput(inode);
+
+	return 0;
+}
+
+static inline const char *tag2str(u16 tag)
+{
+	switch (tag) {
+	case EXT4_FC_TAG_LINK:
+		return "TAG_ADD_ENTRY";
+	case EXT4_FC_TAG_UNLINK:
+		return "TAG_DEL_ENTRY";
+	case EXT4_FC_TAG_ADD_RANGE:
+		return "TAG_ADD_RANGE";
+	case EXT4_FC_TAG_CREAT:
+		return "TAG_CREAT_DENTRY";
+	case EXT4_FC_TAG_DEL_RANGE:
+		return "TAG_DEL_RANGE";
+	case EXT4_FC_TAG_INODE:
+		return "TAG_INODE";
+	case EXT4_FC_TAG_PAD:
+		return "TAG_PAD";
+	case EXT4_FC_TAG_TAIL:
+		return "TAG_TAIL";
+	case EXT4_FC_TAG_HEAD:
+		return "TAG_HEAD";
+	default:
+		return "TAG_ERROR";
+	}
+}
+
+/*
+ * Walk every inode recorded as modified during replay and mark its mapped
+ * blocks, plus the extent tree blocks on the lookup path, as in use.
+ */
+static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+{
+	struct ext4_fc_replay_state *state;
+	struct inode *inode;
+	struct ext4_ext_path *path = NULL;
+	struct ext4_map_blocks map;
+	int i, ret, j;
+	ext4_lblk_t cur, end;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	for (i = 0; i < state->fc_modified_inodes_used; i++) {
+		inode = ext4_iget(sb, state->fc_modified_inodes[i],
+			EXT4_IGET_NORMAL);
+		if (IS_ERR_OR_NULL(inode)) {
+			jbd_debug(1, "Inode %d not found.",
+				state->fc_modified_inodes[i]);
+			continue;
+		}
+		cur = 0;
+		end = EXT_MAX_BLOCKS;
+		while (cur < end) {
+			map.m_lblk = cur;
+			map.m_len = end - cur;
+
+			ret = ext4_map_blocks(NULL, inode, &map, 0);
+			if (ret < 0)
+				break;
+
+			if (ret > 0) {
+				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+				if (!IS_ERR_OR_NULL(path)) {
+					for (j = 0; j < path->p_depth; j++)
+						ext4_mb_mark_bb(inode->i_sb,
+							path[j].p_block, 1, 1);
+					ext4_ext_drop_refs(path);
+					kfree(path);
+				}
+				cur += ret;
+				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
+							map.m_len, 1);
+			} else {
+				cur = cur + (map.m_len ? map.m_len : 1);
+			}
+		}
+		iput(inode);
+	}
+}
+
+/*
+ * Check if block is in excluded regions for block allocation. The simple
+ * allocator that runs during replay phase calls this function to see
+ * if it is okay to use a block.
+ */
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
+{
+	int i;
+	struct ext4_fc_replay_state *state;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	for (i = 0; i < state->fc_regions_valid; i++) {
+		if (state->fc_regions[i].ino == 0 ||
+			state->fc_regions[i].len == 0)
+			continue;
+		if (blk >= state->fc_regions[i].pblk &&
+		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
+			return true;
+	}
+	return false;
+}
+
+/* Cleanup function called after replay */
+void ext4_fc_replay_cleanup(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
+	kfree(sbi->s_fc_replay_state.fc_regions);
+	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
+}
+
+/*
+ * Recovery Scan phase handler
+ *
+ * This function is called during the scan phase and is responsible
+ * for doing following things:
+ * - Make sure the fast commit area has valid tags for replay
+ * - Count number of tags that need to be replayed by the replay handler
+ * - Verify CRC
+ * - Create a list of excluded blocks for allocation during replay phase
+ *
+ * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
+ * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
+ * to indicate that scan has finished and JBD2 can now start replay phase.
+ * It returns a negative error to indicate that there was an error. At the end
+ * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
+ * to indicate the number of tags that need to be replayed during replay phase.
+ */
+static int ext4_fc_replay_scan(journal_t *journal,
+				struct buffer_head *bh, int off,
+				tid_t expected_tid)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_replay_state *state;
+	int ret = JBD2_FC_REPLAY_CONTINUE;
+	struct ext4_fc_add_range *ext;
+	struct ext4_fc_tl *tl;
+	struct ext4_fc_tail *tail;
+	__u8 *start, *end;
+	struct ext4_fc_head *head;
+	struct ext4_extent *ex;
+
+	state = &sbi->s_fc_replay_state;
+
+	start = (u8 *)bh->b_data;
+	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+	if (state->fc_replay_expected_off == 0) {
+		state->fc_cur_tag = 0;
+		state->fc_replay_num_tags = 0;
+		state->fc_crc = 0;
+		state->fc_regions = NULL;
+		state->fc_regions_valid = state->fc_regions_used =
+			state->fc_regions_size = 0;
+		/* Check if we can stop early */
+		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
+			!= EXT4_FC_TAG_HEAD)
+			return 0;
+	}
+
+	if (off != state->fc_replay_expected_off) {
+		ret = -EFSCORRUPTED;
+		goto out_err;
+	}
+
+	state->fc_replay_expected_off++;
+	fc_for_each_tl(start, end, tl) {
+		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
+			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
+		switch (le16_to_cpu(tl->fc_tag)) {
+		case EXT4_FC_TAG_ADD_RANGE:
+			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
+			ex = (struct ext4_extent *)&ext->fc_ex;
+			ret = ext4_fc_record_regions(sb,
+				le32_to_cpu(ext->fc_ino),
+				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
+				ext4_ext_get_actual_len(ex));
+			if (ret < 0)
+				break;
+			ret = JBD2_FC_REPLAY_CONTINUE;
+			fallthrough;
+		case EXT4_FC_TAG_DEL_RANGE:
+		case EXT4_FC_TAG_LINK:
+		case EXT4_FC_TAG_UNLINK:
+		case EXT4_FC_TAG_CREAT:
+		case EXT4_FC_TAG_INODE:
+		case EXT4_FC_TAG_PAD:
+			state->fc_cur_tag++;
+			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+					sizeof(*tl) + ext4_fc_tag_len(tl));
+			break;
+		case EXT4_FC_TAG_TAIL:
+			state->fc_cur_tag++;
+			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
+			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+						sizeof(*tl) +
+						offsetof(struct ext4_fc_tail,
+						fc_crc));
+			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
+				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
+				state->fc_replay_num_tags = state->fc_cur_tag;
+				state->fc_regions_valid =
+					state->fc_regions_used;
+			} else {
+				ret = state->fc_replay_num_tags ?
+					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
+			}
+			state->fc_crc = 0;
+			break;
+		case EXT4_FC_TAG_HEAD:
+			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
+			if (le32_to_cpu(head->fc_features) &
+				~EXT4_FC_SUPPORTED_FEATURES) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+			if (le32_to_cpu(head->fc_tid) != expected_tid) {
+				ret = JBD2_FC_REPLAY_STOP;
+				break;
+			}
+			state->fc_cur_tag++;
+			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+					sizeof(*tl) + ext4_fc_tag_len(tl));
+			break;
+		default:
+			ret = state->fc_replay_num_tags ?
+				JBD2_FC_REPLAY_STOP : -ECANCELED;
+		}
+		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
+			break;
+	}
+
+out_err:
+	trace_ext4_fc_replay_scan(sb, ret, off);
+	return ret;
+}
+
 /*
  * Main recovery path entry point.
+ * The meaning of return codes is similar as above.
  */
 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
 				enum passtype pass, int off, tid_t expected_tid)
 {
-	return 0;
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_tl *tl;
+	__u8 *start, *end;
+	int ret = JBD2_FC_REPLAY_CONTINUE;
+	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
+	struct ext4_fc_tail *tail;
+
+	if (pass == PASS_SCAN) {
+		state->fc_current_pass = PASS_SCAN;
+		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
+	}
+
+	if (state->fc_current_pass != pass) {
+		state->fc_current_pass = pass;
+		sbi->s_mount_state |= EXT4_FC_REPLAY;
+	}
+	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
+		jbd_debug(1, "Replay stops\n");
+		ext4_fc_set_bitmaps_and_counters(sb);
+		return 0;
+	}
+
+#ifdef CONFIG_EXT4_DEBUG
+	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
+		pr_warn("Dropping fc block %d because max_replay set\n", off);
+		return JBD2_FC_REPLAY_STOP;
+	}
+#endif
+
+	start = (u8 *)bh->b_data;
+	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+	fc_for_each_tl(start, end, tl) {
+		if (state->fc_replay_num_tags == 0) {
+			ret = JBD2_FC_REPLAY_STOP;
+			ext4_fc_set_bitmaps_and_counters(sb);
+			break;
+		}
+		jbd_debug(3, "Replay phase, tag:%s\n",
+				tag2str(le16_to_cpu(tl->fc_tag)));
+		state->fc_replay_num_tags--;
+		switch (le16_to_cpu(tl->fc_tag)) {
+		case EXT4_FC_TAG_LINK:
+			ret = ext4_fc_replay_link(sb, tl);
+			break;
+		case EXT4_FC_TAG_UNLINK:
+			ret = ext4_fc_replay_unlink(sb, tl);
+			break;
+		case EXT4_FC_TAG_ADD_RANGE:
+			ret = ext4_fc_replay_add_range(sb, tl);
+			break;
+		case EXT4_FC_TAG_CREAT:
+			ret = ext4_fc_replay_create(sb, tl);
+			break;
+		case EXT4_FC_TAG_DEL_RANGE:
+			ret = ext4_fc_replay_del_range(sb, tl);
+			break;
+		case EXT4_FC_TAG_INODE:
+			ret = ext4_fc_replay_inode(sb, tl);
+			break;
+		case EXT4_FC_TAG_PAD:
+			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
+				ext4_fc_tag_len(tl), 0);
+			break;
+		case EXT4_FC_TAG_TAIL:
+			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
+				ext4_fc_tag_len(tl), 0);
+			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
+			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
+			break;
+		case EXT4_FC_TAG_HEAD:
+			break;
+		default:
+			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
+				ext4_fc_tag_len(tl), 0);
+			ret = -ECANCELED;
+			break;
+		}
+		if (ret < 0)
+			break;
+		ret = JBD2_FC_REPLAY_CONTINUE;
+	}
+	return ret;
 }
 
 void ext4_fc_init(struct super_block *sb, journal_t *journal)