summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_refcount_item.c
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2017-11-21 20:53:02 -0800
committerDarrick J. Wong <darrick.wong@oracle.com>2017-11-27 09:34:08 -0800
commit509955823cc9cc225c05673b1b83d70ca70c5c60 (patch)
tree2f8bbd973140712bbb026d3af3afffe526b2a4dc /fs/xfs/xfs_refcount_item.c
parent98c4f78dcdd8cec112d1cbc5e9a792ee6e5ab7a6 (diff)
xfs: log recovery should replay deferred ops in order
As part of testing log recovery with dm_log_writes, Amir Goldstein discovered an error in the deferred ops recovery that lead to corruption of the filesystem metadata if a reflink+rmap filesystem happened to shut down midway through a CoW remap: "This is what happens [after failed log recovery]: "Phase 1 - find and verify superblock... "Phase 2 - using internal log " - zero log... " - scan filesystem freespace and inode maps... " - found root inode chunk "Phase 3 - for each AG... " - scan (but don't clear) agi unlinked lists... " - process known inodes and perform inode discovery... " - agno = 0 "data fork in regular inode 134 claims CoW block 376 "correcting nextents for inode 134 "bad data fork in inode 134 "would have cleared inode 134" Hou Tao dissected the log contents of exactly such a crash: "According to the implementation of xfs_defer_finish(), these ops should be completed in the following sequence: "Have been done: "(1) CUI: Oper (160) "(2) BUI: Oper (161) "(3) CUD: Oper (194), for CUI Oper (160) "(4) RUI A: Oper (197), free rmap [0x155, 2, -9] "Should be done: "(5) BUD: for BUI Oper (161) "(6) RUI B: add rmap [0x155, 2, 137] "(7) RUD: for RUI A "(8) RUD: for RUI B "Actually be done by xlog_recover_process_intents() "(5) BUD: for BUI Oper (161) "(6) RUI B: add rmap [0x155, 2, 137] "(7) RUD: for RUI B "(8) RUD: for RUI A "So the rmap entry [0x155, 2, -9] for COW should be freed firstly, then a new rmap entry [0x155, 2, 137] will be added. However, as we can see from the log record in post_mount.log (generated after umount) and the trace print, the new rmap entry [0x155, 2, 137] are added firstly, then the rmap entry [0x155, 2, -9] are freed." When reconstructing the internal log state from the log items found on disk, it's required that deferred ops replay in exactly the same order that they would have had the filesystem not gone down. However, replaying unfinished deferred ops can create /more/ deferred ops. These new deferred ops are finished in the wrong order. This causes fs corruption and replay crashes, so let's create a single defer_ops to handle the subsequent ops created during replay, then use one single transaction at the end of log recovery to ensure that everything is replayed in the same order as they're supposed to be. Reported-by: Amir Goldstein <amir73il@gmail.com> Analyzed-by: Hou Tao <houtao1@huawei.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Tested-by: Amir Goldstein <amir73il@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Diffstat (limited to 'fs/xfs/xfs_refcount_item.c')
-rw-r--r--fs/xfs/xfs_refcount_item.c21
1 files changed, 7 insertions, 14 deletions
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 8f2e2fac4255..3a55d6fc271b 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -393,7 +393,8 @@ xfs_cud_init(
int
xfs_cui_recover(
struct xfs_mount *mp,
- struct xfs_cui_log_item *cuip)
+ struct xfs_cui_log_item *cuip,
+ struct xfs_defer_ops *dfops)
{
int i;
int error = 0;
@@ -405,11 +406,9 @@ xfs_cui_recover(
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
enum xfs_refcount_intent_type type;
- xfs_fsblock_t firstfsb;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
struct xfs_bmbt_irec irec;
- struct xfs_defer_ops dfops;
bool requeue_only = false;
ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
@@ -465,7 +464,6 @@ xfs_cui_recover(
return error;
cudp = xfs_trans_get_cud(tp, cuip);
- xfs_defer_init(&dfops, &firstfsb);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
refc = &cuip->cui_format.cui_extents[i];
refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
@@ -485,7 +483,7 @@ xfs_cui_recover(
new_len = refc->pe_len;
} else
error = xfs_trans_log_finish_refcount_update(tp, cudp,
- &dfops, type, refc->pe_startblock, refc->pe_len,
+ dfops, type, refc->pe_startblock, refc->pe_len,
&new_fsb, &new_len, &rcur);
if (error)
goto abort_error;
@@ -497,21 +495,21 @@ xfs_cui_recover(
switch (type) {
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_increase_extent(
- tp->t_mountp, &dfops, &irec);
+ tp->t_mountp, dfops, &irec);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_decrease_extent(
- tp->t_mountp, &dfops, &irec);
+ tp->t_mountp, dfops, &irec);
break;
case XFS_REFCOUNT_ALLOC_COW:
error = xfs_refcount_alloc_cow_extent(
- tp->t_mountp, &dfops,
+ tp->t_mountp, dfops,
irec.br_startblock,
irec.br_blockcount);
break;
case XFS_REFCOUNT_FREE_COW:
error = xfs_refcount_free_cow_extent(
- tp->t_mountp, &dfops,
+ tp->t_mountp, dfops,
irec.br_startblock,
irec.br_blockcount);
break;
@@ -525,17 +523,12 @@ xfs_cui_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- error = xfs_defer_finish(&tp, &dfops);
- if (error)
- goto abort_defer;
set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
error = xfs_trans_commit(tp);
return error;
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
-abort_defer:
- xfs_defer_cancel(&dfops);
xfs_trans_cancel(tp);
return error;
}