diff options
Diffstat (limited to 'fs/ext4/fast_commit.c')
| -rw-r--r-- | fs/ext4/fast_commit.c | 1196 |
1 files changed, 690 insertions, 506 deletions
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5ae8026a0c56..fa66b08de999 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -12,6 +12,7 @@ #include "ext4_extents.h" #include "mballoc.h" +#include <linux/lockdep.h> /* * Ext4 Fast Commits * ----------------- @@ -49,19 +50,27 @@ * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * - * [1] Lock inodes for any further data updates by setting COMMITTING state - * [2] Submit data buffers of all the inodes - * [3] Wait for [2] to complete - * [4] Commit all the directory entry updates in the fast commit space - * [5] Commit all the changed inode structures - * [6] Write tail tag (this tag ensures the atomicity, please read the following + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be + * deleted while it is being flushed. + * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" + * state. + * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that + * all the exsiting handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit + * by setting "EXT4_STATE_FC_COMMITTING" state. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows + * starting of new handles. If new handles try to start an update on + * any of the inodes that are being committed, ext4_fc_track_inode() + * will block until those inodes have finished the fast commit. + * [6] Commit all the directory entry updates in the fast commit space. + * [7] Commit all the changed inodes in the fast commit space and clear + * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). - * [7] Wait for [4], [5] and [6] to complete. * - * All the inode updates must call ext4_fc_start_update() before starting an - * update. If such an ongoing update is present, fast commit waits for it to - * complete. The completion of such an update is marked by - * ext4_fc_stop_update(). + * All the inode updates must be enclosed within jbd2_jounrnal_start() + * and jbd2_journal_stop() similar to JBD2 journaling. * * Fast Commit Ineligibility * ------------------------- @@ -142,6 +151,13 @@ * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * + * Locking + * ------- + * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit + * dentry queue. ei->i_fc_lock protects the fast commit related info in a given + * inode. Most of the code avoids acquiring both the locks, but if one must do + * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. + * * TODOs * ----- * @@ -156,13 +172,12 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Fast commit's commit path locks the entire file system during fast - * commit. This has significant performance penalty. Instead of that, we - * should use ext4_fc_start/stop_update functions to start inode level - * updates from ext4_journal_start/stop. Once we do that we can drop file - * system locking during commit path. + * 1) Handle more ineligible cases. * - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent + * status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory. */ #include <trace/events/ext4.h> @@ -199,77 +214,14 @@ void ext4_fc_init_inode(struct inode *inode) ext4_fc_reset_inode(inode); ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); + INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); } -/* This function must be called with sbi->s_fc_lock held. */ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) +static bool ext4_fc_disabled(struct super_block *sb) { - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); -} - -/* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); + return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } /* @@ -279,39 +231,110 @@ void ext4_fc_stop_update(struct inode *inode) void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) { - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); + if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { + mutex_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + mutex_unlock(&sbi->s_fc_lock); + schedule(); + mutex_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } list_del_init(&ei->i_fc_list); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + + /* + * Since this inode is getting removed, let's also remove all FC + * dentry create references, since it is not needed to log it anyways. + */ + if (list_empty(&ei->i_fc_dilist)) { + mutex_unlock(&sbi->s_fc_lock); + return; + } + + fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); + WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); + list_del_init(&fc_dentry->fcd_list); + list_del_init(&fc_dentry->fcd_dilist); + + WARN_ON(!list_empty(&ei->i_fc_dilist)); + mutex_unlock(&sbi->s_fc_lock); + + release_dentry_name_snapshot(&fc_dentry->fcd_name); + kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); } /* - * Mark file system as fast commit ineligible. This means that next commit - * operation would result in a full jbd2 commit. + * Mark file system as fast commit ineligible, and record latest + * ineligible transaction tid. This means until the recorded + * transaction, commit operation would result in a full jbd2 commit. */ -void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); + tid_t tid; + bool has_transaction = true; + bool is_ineligible; - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(sb)) return; + if (handle && !IS_ERR(handle)) + tid = handle->h_transaction->t_tid; + else { + read_lock(&sbi->s_journal->j_state_lock); + if (sbi->s_journal->j_running_transaction) + tid = sbi->s_journal->j_running_transaction->t_tid; + else + has_transaction = false; + read_unlock(&sbi->s_journal->j_state_lock); + } + mutex_lock(&sbi->s_fc_lock); + is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) + sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -328,7 +351,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason) */ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, - int (*__fc_track_fn)(struct inode *, void *, bool), + int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), void *args, int enqueue) { bool update = false; @@ -337,34 +360,27 @@ static int ext4_fc_track_template( tid_t tid = 0; int ret; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) - return -EOPNOTSUPP; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) - return -EINVAL; - tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } - ret = __fc_track_fn(inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + ret = __fc_track_fn(handle, inode, args, update); + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? + (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -375,52 +391,63 @@ struct __track_dentry_update_args { }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ -static int __track_dentry_update(struct inode *inode, void *arg, bool update) +static int __track_dentry_update(handle_t *handle, struct inode *inode, + void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + spin_unlock(&ei->i_fc_lock); + + if (IS_ENCRYPTED(dir)) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, + handle); + spin_lock(&ei->i_fc_lock); + return -EOPNOTSUPP; + } - mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); - mutex_lock(&ei->i_fc_lock); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; - node->fcd_parent = dentry->d_parent->d_inode->i_ino; + node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; - if (dentry->d_name.len > DNAME_INLINE_LEN) { - node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); - if (!node->fcd_name.name) { - kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM); - mutex_lock(&ei->i_fc_lock); - return -ENOMEM; - } - memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, - dentry->d_name.len); - } else { - memcpy(node->fcd_iname, dentry->d_name.name, - dentry->d_name.len); - node->fcd_name.name = node->fcd_iname; - } - node->fcd_name.len = dentry->d_name.len; - - spin_lock(&sbi->s_fc_lock); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) + take_dentry_name_snapshot(&node->fcd_name, dentry); + INIT_LIST_HEAD(&node->fcd_dilist); + INIT_LIST_HEAD(&node->fcd_list); + mutex_lock(&sbi->s_fc_lock); + if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + + /* + * This helps us keep a track of all fc_dentry updates which is part of + * this ext4 inode. So in case the inode is getting unlinked, before + * even we get a chance to fsync, we could remove all fc_dentry + * references while evicting the inode in ext4_fc_del(). + * Also with this, we don't need to loop over all the inodes in + * sbi->s_fc_q to get the corresponding inode in + * ext4_fc_commit_dentry_updates(). + */ + if (dentry_update->op == EXT4_FC_TAG_CREAT) { + WARN_ON(!list_empty(&ei->i_fc_dilist)); + list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); + } + mutex_unlock(&sbi->s_fc_lock); + spin_lock(&ei->i_fc_lock); return 0; } @@ -436,12 +463,20 @@ void __ext4_fc_track_unlink(handle_t *handle, ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); - trace_ext4_fc_track_unlink(inode, dentry, ret); + trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { - __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); + struct inode *inode = d_inode(dentry); + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, @@ -455,12 +490,20 @@ void __ext4_fc_track_link(handle_t *handle, ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); - trace_ext4_fc_track_link(inode, dentry, ret); + trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { - __ext4_fc_track_link(handle, d_inode(dentry), dentry); + struct inode *inode = d_inode(dentry); + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, @@ -474,16 +517,25 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); - trace_ext4_fc_track_create(inode, dentry, ret); + trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { - __ext4_fc_track_create(handle, d_inode(dentry), dentry); + struct inode *inode = d_inode(dentry); + + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ -static int __track_inode(struct inode *inode, void *arg, bool update) +static int __track_inode(handle_t *handle, struct inode *inode, void *arg, + bool update) { if (update) return -EEXIST; @@ -495,19 +547,56 @@ static int __track_inode(struct inode *inode, void *arg, bool update) void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) return; + if (ext4_fc_disabled(inode->i_sb)) + return; + if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_INODE_JOURNAL_DATA); + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); + return; + } + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. + */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); } + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. + */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); - trace_ext4_fc_track_inode(inode, ret); + trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { @@ -515,7 +604,8 @@ struct __track_range_args { }; /* __track_fn for tracking data updates */ -static int __track_range(struct inode *inode, void *arg, bool update) +static int __track_range(handle_t *handle, struct inode *inode, void *arg, + bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; @@ -551,17 +641,29 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star if (S_ISDIR(inode->i_mode)) return; + if (ext4_fc_disabled(inode->i_sb)) + return; + + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + return; + + if (ext4_has_inline_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, + handle); + return; + } + args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); - trace_ext4_fc_track_range(inode, start, end, ret); + trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { - int write_flags = REQ_SYNC; + blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ @@ -571,24 +673,12 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, write_flags, bh); + submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ -/* memzero and update CRC */ -static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, - u32 *crc) -{ - void *ret; - - ret = memset(dst, 0, len); - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); - return ret; -} - /* * Allocate len bytes on a fast commit buffer. * @@ -602,62 +692,60 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { - struct ext4_fc_tl *tl; + struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; - int pad_len; + int remaining; + u8 *dst; /* - * After allocating len, we should have space at least for a 0 byte - * padding. + * If 'len' is too long to fit in any block alongside a PAD tlv, then we + * cannot fulfill the request. */ - if (len + sizeof(struct ext4_fc_tl) > bsize) + if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; - if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { - /* - * Only allocate from current buffer if we have enough space for - * this request AND we have space to add a zero byte padding. - */ - if (!sbi->s_fc_bh) { - ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); - if (ret) - return NULL; - sbi->s_fc_bh = bh; - } + if (!sbi->s_fc_bh) { + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; + } + dst = sbi->s_fc_bh->b_data + off; + + /* + * Allocate the bytes in the current block if we can do so while still + * leaving enough space for a PAD tlv. + */ + remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; + if (len <= remaining) { sbi->s_fc_bytes += len; - return sbi->s_fc_bh->b_data + off; - } - /* Need to add PAD tag */ - tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); - tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); - pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); - tl->fc_len = cpu_to_le16(pad_len); - if (crc) - *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); - if (pad_len > 0) - ext4_fc_memzero(sb, tl + 1, pad_len, crc); + return dst; + } + + /* + * Else, terminate the current block with a PAD tlv, then allocate a new + * block and allocate the bytes at the start of that new block. + */ + + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); + tl.fc_len = cpu_to_le16(remaining); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); + ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; - sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; + sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } -/* memcpy to fc reserved space and update CRC */ -static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, - int len, u32 *crc) -{ - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); - return memcpy(dst, src, len); -} - /* * Complete a fast commit by writing tail tag. * @@ -678,23 +766,27 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ - dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); - tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); + tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); - dst += sizeof(tl); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); - ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); + memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, + dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); - ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); + dst += sizeof(tail.fc_crc); + memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); @@ -711,15 +803,15 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, struct ext4_fc_tl tl; u8 *dst; - dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } @@ -730,9 +822,9 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; - int dlen = fc_dentry->fcd_name.len; - u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, - crc); + int dlen = fc_dentry->fcd_name.name.len; + u8 *dst = ext4_fc_reserve_space(sb, + EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; @@ -741,11 +833,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - dst += sizeof(tl); - ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; + memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); + memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } @@ -777,22 +869,21 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); + ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, - sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); + EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) - return -ECANCELED; + goto err; - if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) - return -ECANCELED; - dst += sizeof(tl); - if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) - return -ECANCELED; + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; + memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); - if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), - inode_len, crc)) - return -ECANCELED; - - return 0; + memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); + ret = 0; +err: + brelse(iloc.bh); + return ret; } /* @@ -809,24 +900,26 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; - jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", - __func__, cur_lblk_off, new_blk_size, inode->i_ino); + ext4_debug("will try writing %d to %d for inode %ld\n", + cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; @@ -870,76 +963,37 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. */ +static int ext4_fc_flush_data(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); - ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); - ret = jbd2_submit_inode_data(ei->jinode); + ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = (struct super_block *)(journal->j_private); - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; - spin_unlock(&sbi->s_fc_lock); - - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; - struct ext4_inode_info *ei, *ei_n; + struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) @@ -947,30 +1001,21 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - spin_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } - - inode = NULL; - list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { - inode = &ei->vfs_inode; - break; - } - } /* - * If we don't find inode in our list, then it was deleted, - * in which case, we don't need to record it's create tag. + * With fcd_dilist we need not loop in sbi->s_fc_q to get the + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. */ - if (!inode) + if (list_empty(&fc_dentry->fcd_dilist)) continue; - spin_unlock(&sbi->s_fc_lock); + ei = list_first_entry(&fc_dentry->fcd_dilist, + struct ext4_inode_info, i_fc_dilist); + inode = &ei->vfs_inode; + WARN_ON(inode->i_ino != fc_dentry->fcd_ino); /* * We first write the inode and then the create dirent. This @@ -981,28 +1026,19 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - spin_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - spin_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; @@ -1011,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + mutex_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); + + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + mutex_unlock(&sbi->s_fc_lock); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + mutex_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1042,42 +1133,41 @@ static int ext4_fc_perform_commit(journal_t *journal) } } - spin_lock(&sbi->s_fc_lock); + /* Step 6.2: Now write all the dentry updates. */ + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - spin_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, - u64 commit_time, int nblks) + u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; - jbd_debug(1, "Fast commit ended with status = %d", status); + ext4_debug("Fast commit ended with status = %d for tid %u", + status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; @@ -1095,7 +1185,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status, } else { stats->fc_skipped_commits++; } - trace_ext4_fc_commit_stop(sb, nblks, status); + trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* @@ -1106,35 +1196,39 @@ static void ext4_fc_update_stats(struct super_block *sb, int status, */ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; - - trace_ext4_fc_commit_start(sb); - - start_time = ktime_get(); + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); + trace_ext4_fc_commit_start(sb, commit_tid); + + start_time = ktime_get(); + old_ioprio = get_current_ioprio(); + restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && - commit_tid > journal->j_commit_sequence) + tid_gt(commit_tid, journal->j_commit_sequence)) goto restart_fc; - ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0); + ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, + commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ - ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0); + ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, + commit_tid); return jbd2_complete_transaction(journal, commit_tid); } @@ -1147,6 +1241,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. + */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1161,17 +1264,19 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); - ext4_fc_update_stats(sb, status, commit_time, nblks); + ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); - ext4_fc_update_stats(sb, status, 0, 0); + ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } @@ -1179,31 +1284,52 @@ fallback: * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ -static void ext4_fc_cleanup(journal_t *journal, int full) +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; + trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - ext4_fc_reset_inode(&iter->vfs_inode); - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); + } else if (full) { + /* + * We are called after a full commit, inode has been + * modified while the commit was running. Re-enqueue + * the inode into STAGING, which will then be splice + * back into MAIN. This cannot happen during + * fastcommit because the journal is locked all the + * time in that case (and tid doesn't increase so + * tid check above isn't reliable). + */ + list_add_tail(&ei->i_fc_list, + &sbi->s_fc_q[FC_Q_STAGING]); + } + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). + */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } @@ -1212,13 +1338,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full) struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); - spin_unlock(&sbi->s_fc_lock); + list_del_init(&fc_dentry->fcd_dilist); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1226,12 +1349,14 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } @@ -1243,8 +1368,14 @@ struct dentry_info_args { char *dname; }; +/* Same as struct ext4_fc_tl, but uses native endianness fields */ +struct ext4_fc_tl_mem { + u16 fc_tag; + u16 fc_len; +}; + static inline void tl_to_darg(struct dentry_info_args *darg, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; @@ -1253,13 +1384,21 @@ static inline void tl_to_darg(struct dentry_info_args *darg, darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); - darg->dname_len = le16_to_cpu(tl->fc_len) - - sizeof(struct ext4_fc_dentry_info); + darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); +} + +static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) +{ + struct ext4_fc_tl tl_disk; + + memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); + tl->fc_len = le16_to_cpu(tl_disk.fc_len); + tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ -static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_unlink(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; @@ -1276,19 +1415,19 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found", darg.ino); + ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { - jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); + ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } - ret = __ext4_unlink(NULL, old_parent, &entry, inode); + ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; @@ -1308,21 +1447,21 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { - jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); + ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { - jbd_debug(1, "Failed to obtain dentry"); + ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { - jbd_debug(1, "Inode dentry not created."); + ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } @@ -1335,7 +1474,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, * could complete. */ if (ret && ret != -EEXIST) { - jbd_debug(1, "Failed to link\n"); + ext4_debug("Failed to link\n"); goto out; } @@ -1356,8 +1495,8 @@ out: } /* Link replay function */ -static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_link(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; @@ -1369,7 +1508,7 @@ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return 0; } @@ -1392,14 +1531,17 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { + int *fc_modified_inodes; + + fc_modified_inodes = krealloc(state->fc_modified_inodes, + sizeof(int) * (state->fc_modified_inodes_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); + if (!fc_modified_inodes) + return -ENOMEM; + state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; - state->fc_modified_inodes = krealloc( - state->fc_modified_inodes, sizeof(int) * - state->fc_modified_inodes_size, - GFP_KERNEL); - if (!state->fc_modified_inodes) - return -ENOMEM; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; @@ -1408,16 +1550,17 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) /* * Inode replay function */ -static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_inode(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; - int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); + int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; + size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); @@ -1431,7 +1574,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, } inode = NULL; - ext4_fc_record_modified_inode(sb, ino); + ret = ext4_fc_record_modified_inode(sb, ino); + if (ret) + goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); @@ -1439,12 +1584,12 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, if (ret) goto out; - inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); + inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); - memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, - inode_len - offsetof(struct ext4_inode, i_generation)); + memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, + inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { @@ -1474,7 +1619,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return -EFSCORRUPTED; } @@ -1508,8 +1653,8 @@ out: * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ -static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_create(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; @@ -1528,7 +1673,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "inode %d not found.", darg.ino); + ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; @@ -1541,7 +1686,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { - jbd_debug(1, "Dir %d not found.", darg.ino); + ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); @@ -1557,32 +1702,41 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: - if (inode) - iput(inode); + iput(inode); return ret; } /* - * Record physical disk regions which are in use as per fast commit area. Our - * simple replay phase allocator excludes these regions from allocation. + * Record physical disk regions which are in use as per fast commit area, + * and used by inodes during replay phase. Our simple replay phase + * allocator excludes these regions from allocation. */ -static int ext4_fc_record_regions(struct super_block *sb, int ino, - ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; + /* + * during replay phase, the fc_regions_valid may not same as + * fc_regions_used, update it when do new additions. + */ + if (replay && state->fc_regions_used != state->fc_regions_valid) + state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { + struct ext4_fc_alloc_region *fc_regions; + + fc_regions = krealloc(state->fc_regions, + sizeof(struct ext4_fc_alloc_region) * + (state->fc_regions_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); + if (!fc_regions) + return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; - state->fc_regions = krealloc( - state->fc_regions, - state->fc_regions_size * - sizeof(struct ext4_fc_alloc_region), - GFP_KERNEL); - if (!state->fc_regions) - return -ENOMEM; + state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; @@ -1590,12 +1744,15 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino, region->pblk = pblk; region->len = len; + if (replay) + state->fc_regions_valid++; + return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; @@ -1616,11 +1773,13 @@ static int ext4_fc_replay_add_range(struct super_block *sb, inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); @@ -1628,7 +1787,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, cur = start; remaining = len; - jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", + ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); @@ -1638,18 +1797,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb, map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret == 0) { /* Range is not mapped */ - path = ext4_find_extent(inode, cur, NULL, 0); - if (IS_ERR(path)) { - iput(inode); - return 0; - } + path = ext4_find_extent(inode, cur, path, 0); + if (IS_ERR(path)) + goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( @@ -1658,15 +1813,11 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_insert_extent( - NULL, inode, &path, &newex, 0); + path = ext4_ext_insert_extent(NULL, inode, + path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); - ext4_ext_drop_refs(path); - kfree(path); - if (ret) { - iput(inode); - return 0; - } + if (IS_ERR(path)) + goto out; goto next; } @@ -1679,10 +1830,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified @@ -1692,20 +1841,18 @@ static int ext4_fc_replay_add_range(struct super_block *sb, * at the end of the FC replay using our array of * modified inodes. */ - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ - jbd_debug(1, "Converting from %ld to %d %lld", + ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. @@ -1717,14 +1864,16 @@ next: } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); +out: + ext4_free_ext_path(path); iput(inode); return 0; } /* Replay DEL_RANGE tag */ static int -ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +ext4_fc_replay_del_range(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; @@ -1741,13 +1890,15 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino)); + ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; - jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", + ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { @@ -1755,14 +1906,12 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret > 0) { remaining -= ret; cur += ret; - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; @@ -1770,18 +1919,17 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_remove_space(inode, lrange.fc_lblk, - lrange.fc_lblk + lrange.fc_len - 1); + ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), + le32_to_cpu(lrange.fc_lblk) + + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); +out: iput(inode); - return 0; } @@ -1799,7 +1947,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found.", + ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } @@ -1818,23 +1966,25 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) break; if (ret > 0) { - path = ext4_find_extent(inode, map.m_lblk, NULL, 0); + path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 1); - ext4_ext_drop_refs(path); - kfree(path); + path[j].p_block, 1, true); + } else { + path = NULL; } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, - map.m_len, 1); + map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } + + ext4_free_ext_path(path); } /* @@ -1852,8 +2002,8 @@ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; - if (blk >= state->fc_regions[i].pblk && - blk < state->fc_regions[i].pblk + state->fc_regions[i].len) + if (in_range(blk, state->fc_regions[i].pblk, + state->fc_regions[i].len)) return true; } return false; @@ -1869,6 +2019,33 @@ void ext4_fc_replay_cleanup(struct super_block *sb) kfree(sbi->s_fc_replay_state.fc_modified_inodes); } +static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, + int tag, int len) +{ + switch (tag) { + case EXT4_FC_TAG_ADD_RANGE: + return len == sizeof(struct ext4_fc_add_range); + case EXT4_FC_TAG_DEL_RANGE: + return len == sizeof(struct ext4_fc_del_range); + case EXT4_FC_TAG_CREAT: + case EXT4_FC_TAG_LINK: + case EXT4_FC_TAG_UNLINK: + len -= sizeof(struct ext4_fc_dentry_info); + return len >= 1 && len <= EXT4_NAME_LEN; + case EXT4_FC_TAG_INODE: + len -= sizeof(struct ext4_fc_inode); + return len >= EXT4_GOOD_OLD_INODE_SIZE && + len <= sbi->s_inode_size; + case EXT4_FC_TAG_PAD: + return true; /* padding can have any length */ + case EXT4_FC_TAG_TAIL: + return len >= sizeof(struct ext4_fc_tail); + case EXT4_FC_TAG_HEAD: + return len == sizeof(struct ext4_fc_head); + } + return false; +} + /* * Recovery Scan phase handler * @@ -1895,7 +2072,7 @@ static int ext4_fc_replay_scan(journal_t *journal, struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; - struct ext4_fc_tl tl; + struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; @@ -1904,7 +2081,7 @@ static int ext4_fc_replay_scan(journal_t *journal, state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; @@ -1925,19 +2102,26 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); - jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", - tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); - switch (le16_to_cpu(tl.fc_tag)) { + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; + if (tl.fc_len > end - val || + !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -ECANCELED; + goto out_err; + } + ext4_debug("Scan phase, tag:%s, blk %lld\n", + tag2str(tl.fc_tag), bh->b_blocknr); + switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_get_actual_len(ex)); + ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; @@ -1949,14 +2133,14 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + state->fc_crc = ext4_chksum(state->fc_crc, cur, + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + + state->fc_crc = ext4_chksum(state->fc_crc, cur, + EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && @@ -1982,8 +2166,8 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + state->fc_crc = ext4_chksum(state->fc_crc, cur, + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? @@ -2007,7 +2191,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_fc_tl tl; + struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; @@ -2023,7 +2207,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { - jbd_debug(1, "Replay stops\n"); + ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } @@ -2036,21 +2220,22 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, #endif start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } - jbd_debug(3, "Replay phase, tag:%s\n", - tag2str(le16_to_cpu(tl.fc_tag))); + + ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; - switch (le16_to_cpu(tl.fc_tag)) { + switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; @@ -2071,19 +2256,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, - le16_to_cpu(tl.fc_len), 0); + tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: - trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, + 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: - trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } @@ -2107,17 +2291,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) journal->j_fc_cleanup_callback = ext4_fc_cleanup; } -static const char *fc_ineligible_reasons[] = { - "Extended attributes changed", - "Cross rename", - "Journal flag changed", - "Insufficient memory", - "Swap boot", - "Resize", - "Dir renamed", - "Falloc range op", - "Data journalling", - "FC Commit Failed" +static const char * const fc_ineligible_reasons[] = { + [EXT4_FC_REASON_XATTR] = "Extended attributes changed", + [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", + [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", + [EXT4_FC_REASON_NOMEM] = "Insufficient memory", + [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", + [EXT4_FC_REASON_RESIZE] = "Resize", + [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", + [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", + [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", + [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) |
