bcachefs: Make check_key_has_snapshot safer

Snapshot deletion v2 added sentinal values for deleted snapshots, so "key for deleted snapshot" - i.e. snapshot deletion missed something - is safe to repair automatically. But if we find a key for a missing snapshot we have no idea what happened, and we shouldn't delete it unless we're very sure that everything else is consistent. So hook it up to the new bch2_require_recovery_pass(), we'll now only delete if snapshots and subvolumes have recenlty been checked. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
author: Kent Overstreet <kent.overstreet@linux.dev> 2025-05-31 13:10:43 -0400
committer: Kent Overstreet <kent.overstreet@linux.dev> 2025-06-02 12:16:36 -0400
commit: e49cf9b54bc8b4c41c7aac8f12adb709f2015470 (patch)
tree: f0d915408c3c253bae8ff184756810fba327bc2e
parent: 0942b852d4070e448231d2e204ac82ad47f5920a (diff)
2 files changed, 37 insertions, 22 deletions
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 92035859e588..5f1174348974 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -822,16 +822,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 	int ret = 0;
 
 	if (k.k->p.snapshot) {
-		/*
-		 * We'll go ERO if we see a key for a missing snapshot, and if
-		 * we're still in recovery we want to give that a chance to
-		 * repair:
-		 */
-		if (unlikely(test_bit(BCH_FS_in_recovery, &c->flags) &&
-			     bch2_snapshot_id_state(c, k.k->p.snapshot) == SNAPSHOT_ID_empty))
-			return bch_err_throw(c, data_update_done_no_snapshot);
-
 		ret = bch2_check_key_has_snapshot(trans, iter, k);
+		if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
+			/* Can't repair yet, waiting on other recovery passes */
+			return bch_err_throw(c, data_update_done_no_snapshot);
+		}
 		if (ret < 0)
 			return ret;
 		if (ret) /* key was deleted */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 0649bdd5e4d2..35dff323bfdb 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1045,19 +1045,39 @@ int __bch2_check_key_has_snapshot(struct btree_trans *trans,
 		ret = bch2_btree_delete_at(trans, iter,
 					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
 
-	/*
-	 * Snapshot missing: we should have caught this with btree_lost_data and
-	 * kicked off reconstruct_snapshots, so if we end up here we have no
-	 * idea what happened:
-	 */
-	if (fsck_err_on(state == SNAPSHOT_ID_empty,
-			trans, bkey_in_missing_snapshot,
-			"key in missing snapshot %s, delete?",
-			(bch2_btree_id_to_text(&buf, iter->btree_id),
-			 prt_char(&buf, ' '),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-		ret = bch2_btree_delete_at(trans, iter,
-					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
+	if (state == SNAPSHOT_ID_empty) {
+		/*
+		 * Snapshot missing: we should have caught this with btree_lost_data and
+		 * kicked off reconstruct_snapshots, so if we end up here we have no
+		 * idea what happened.
+		 *
+		 * Do not delete unless we know that subvolumes and snapshots
+		 * are consistent:
+		 *
+		 * XXX:
+		 *
+		 * We could be smarter here, and instead of using the generic
+		 * recovery pass ratelimiting, track if there have been any
+		 * changes to the snapshots or inodes btrees since those passes
+		 * last ran.
+		 */
+		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
+		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
+
+		if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
+			ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
+
+		unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
+
+		if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
+			     "key in missing snapshot %s, delete?",
+			     (bch2_btree_id_to_text(&buf, iter->btree_id),
+			      prt_char(&buf, ' '),
+			      bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+			ret = bch2_btree_delete_at(trans, iter,
+						   BTREE_UPDATE_internal_snapshot_node) ?: 1;
+		}
+	}
 fsck_err:
 	printbuf_exit(&buf);
 	return ret;
author	Kent Overstreet <kent.overstreet@linux.dev>	2025-05-31 13:10:43 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2025-06-02 12:16:36 -0400
commit	e49cf9b54bc8b4c41c7aac8f12adb709f2015470 (patch)
tree	f0d915408c3c253bae8ff184756810fba327bc2e
parent	0942b852d4070e448231d2e204ac82ad47f5920a (diff)