Diffstat (limited to 'fs/bcachefs/btree_node_scan.c')
-rw-r--r--  fs/bcachefs/btree_node_scan.c  260
1 file changed, 174 insertions(+), 86 deletions(-)
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 866bd278439f..5a97a6b8a757 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -12,6 +12,8 @@
#include "recovery_passes.h"
#include <linux/kthread.h>
+#include <linux/min_heap.h>
+#include <linux/sched/sysctl.h>
#include <linux/sort.h>
struct find_btree_nodes_worker {
@@ -22,15 +24,15 @@ struct find_btree_nodes_worker {
static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
- prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
+ bch2_btree_id_level_to_text(out, n->btree_id, n->level);
+ prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
+ n->seq, n->journal_seq, n->cookie);
bch2_bpos_to_text(out, n->min_key);
prt_str(out, "-");
bch2_bpos_to_text(out, n->max_key);
if (n->range_updated)
prt_str(out, " range updated");
- if (n->overwritten)
- prt_str(out, " overwritten");
for (unsigned i = 0; i < n->nr_ptrs; i++) {
prt_char(out, ' ');
@@ -57,22 +59,44 @@ static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_n
bp->v.seq = cpu_to_le64(f->cookie);
bp->v.sectors_written = 0;
bp->v.flags = 0;
+ bp->v.sectors_written = cpu_to_le16(f->sectors_written);
bp->v.min_key = f->min_key;
SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
}
+static inline u64 bkey_journal_seq(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
+ default:
+ return 0;
+ }
+}
+
static bool found_btree_node_is_readable(struct btree_trans *trans,
- const struct found_btree_node *f)
+ struct found_btree_node *f)
{
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
- found_btree_node_to_key(&k.k, f);
+ found_btree_node_to_key(&tmp.k, f);
- struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+ struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
bool ret = !IS_ERR_OR_NULL(b);
- if (ret)
- six_unlock_read(&b->c.lock);
+ if (!ret)
+ return ret;
+
+ f->sectors_written = b->written;
+ f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
+
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
+ f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
+
+ six_unlock_read(&b->c.lock);
/*
* We might update this node's range; if that happens, we need the node
@@ -79,8 +103,8 @@ static bool found_btree_node_is_readable(struct btree_trans *trans,
* to be re-read so the read path can trim keys that are no longer in
* this node
*/
if (b != btree_node_root(trans->c, b))
- bch2_btree_node_evict(trans, &k.k);
+ bch2_btree_node_evict(trans, &tmp.k);
return ret;
}
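The readability probe now doubles as a metadata harvest: while the node is still read-locked, the new code records b->written as sectors_written and takes the maximum of the header's journal_seq and any per-key journal sequence (only inode_v3 keys carry one here, via bi_journal_seq). The key walk uses the standard node iteration helper; a minimal usage sketch of that helper, counting keys instead of harvesting sequence numbers (illustrative only):

        struct btree_node_iter iter;
        struct bkey unpacked;
        struct bkey_s_c k;
        unsigned nr_keys = 0;

        /* visits every key in node b, unpacking packed bkeys into `unpacked` */
        for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
                nr_keys++;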
@@ -101,7 +125,8 @@ static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
static int found_btree_node_cmp_time(const struct found_btree_node *l,
const struct found_btree_node *r)
{
- return cmp_int(l->seq, r->seq);
+ return cmp_int(l->seq, r->seq) ?:
+ cmp_int(l->journal_seq, r->journal_seq);
}
static int found_btree_node_cmp_pos(const void *_l, const void *_r)
@@ -115,6 +140,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r)
-found_btree_node_cmp_time(l, r);
}
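For reference, the position comparator whose closing brace appears above orders nodes by btree ID, then level (descending), then min_key, and breaks exact position ties with the negated time comparison, so of two identically placed nodes the newest sorts first. Its shape, reconstructed from the surrounding context (the unchanged middle of the function is elided by the hunks):

        static int found_btree_node_cmp_pos(const void *_l, const void *_r)
        {
                const struct found_btree_node *l = _l, *r = _r;

                return  cmp_int(l->btree_id, r->btree_id) ?:
                       -cmp_int(l->level, r->level) ?:
                        bpos_cmp(l->min_key, r->min_key) ?:
                       -found_btree_node_cmp_time(l, r);      /* newest first */
        }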
+static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
+{
+ return found_btree_node_cmp_pos(l, r) < 0;
+}
+
+static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
+{
+ struct found_btree_node *l = _l;
+ struct found_btree_node *r = _r;
+
+ swap(*l, *r);
+}
+
+static const struct min_heap_callbacks found_btree_node_heap_cbs = {
+ .less = found_btree_node_cmp_pos_less,
+ .swp = found_btree_node_swap,
+};
+
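These callbacks plug found_btree_node arrays into the kernel's generic binary heap helpers from <linux/min_heap.h>. For orientation, a self-contained sketch of the same API pattern on a plain int heap (all names in this sketch are invented for the example):

        #include <linux/min_heap.h>
        #include <linux/minmax.h>       /* swap() */

        DEFINE_MIN_HEAP(int, int_heap); /* struct int_heap { size_t nr, size; int *data; ... }; */

        static bool int_less(const void *l, const void *r, void *args)
        {
                return *(const int *) l < *(const int *) r;
        }

        static void int_swap(void *l, void *r, void *args)
        {
                swap(*(int *) l, *(int *) r);
        }

        static const struct min_heap_callbacks int_heap_cbs = {
                .less   = int_less,
                .swp    = int_swap,
        };

        /* heapify an already-filled array in O(n), then drain smallest-first */
        static void drain_sorted(struct int_heap *heap)
        {
                min_heapify_all(heap, &int_heap_cbs, NULL);
                while (min_heap_peek(heap)) {
                        /* ... consume *min_heap_peek(heap) ... */
                        min_heap_pop(heap, &int_heap_cbs, NULL);
                }
        }

This heapify-then-drain shape is exactly what bch2_scan_for_btree_nodes() adopts further down.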
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
struct bio *bio, struct btree_node *bn, u64 offset)
{
@@ -124,16 +167,25 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, bn, PAGE_SIZE);
+ u64 submit_time = local_clock();
submit_bio_wait(bio);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status)))
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status));
return;
+ }
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ if (!c->chacha20_key_set)
+ return;
+
struct nonce nonce = btree_nonce(&bn->keys, 0);
unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
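The new chacha20_key_set check makes the scanner skip encrypted nodes when no encryption key has been unlocked yet, instead of attempting a decrypt with an unset key. The guarded decrypt then proceeds as before; condensed, with the final call reconstructed from the lines this hunk elides (an assumption, not shown in the diff):

        if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
                if (!c->chacha20_key_set)
                        return;         /* encrypted node, no key available */

                struct nonce nonce = btree_nonce(&bn->keys, 0);
                unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

                bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
        }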
@@ -146,6 +198,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
return;
+ if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+ return;
+
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
@@ -158,7 +213,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
.ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
.ptrs[0].offset = offset,
.ptrs[0].dev = ca->dev_idx,
- .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
+ .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
};
rcu_read_unlock();
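The last pointer-initialization line fixes a latent NULL dereference: bucket_gen() returns a pointer into the device's bucket gens array, and returns NULL when the scanned offset maps past it. bucket_gen_get() is the checked accessor; roughly this shape (an assumption based on surrounding bcachefs code, not part of this diff):

        static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
        {
                rcu_read_lock();
                u8 *gen = bucket_gen(ca, b);    /* NULL if b is out of range */
                int ret = gen ? *gen : -1;
                rcu_read_unlock();
                return ret;
        }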
@@ -216,7 +271,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_get(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
closure_put(w->cl);
kfree(w);
return 0;
@@ -230,88 +285,83 @@ static int read_btree_nodes(struct find_btree_nodes *f)
closure_init_stack(&cl);
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- struct task_struct *t;
-
if (!w) {
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
ret = -ENOMEM;
goto err;
}
- percpu_ref_get(&ca->io_ref);
- closure_get(&cl);
w->cl = &cl;
w->f = f;
w->ca = ca;
- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
- ret = IS_ERR_OR_NULL(t);
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ ret = PTR_ERR_OR_ZERO(t);
if (ret) {
- percpu_ref_put(&ca->io_ref);
- closure_put(&cl);
- f->ret = ret;
- bch_err(c, "error starting kthread: %i", ret);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ kfree(w);
+ bch_err_msg(c, ret, "starting kthread");
break;
}
+
+ closure_get(&cl);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ wake_up_process(t);
}
err:
- closure_sync(&cl);
+ while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
+ ;
return f->ret ?: ret;
}
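Two startup changes here are worth spelling out. First, kthread_run() could let a worker run to completion, dropping its closure and io refs, before the parent had even taken them; kthread_create() + wake_up_process() makes the ordering explicit: nothing the worker releases is acquired until the thread is known to exist, and only then may it run. Second, the plain closure_sync() became a closure_sync_timeout() loop that re-arms at half the hung-task timeout, so scanning a large device does not trip the hung task detector. The startup pattern, condensed from the loop above (error handling elided):

        struct task_struct *t = kthread_create(read_btree_nodes_worker, w,
                                               "read_btree_nodes/%s", ca->name);
        if (!IS_ERR(t)) {
                /* take everything the worker will put, then let it run */
                closure_get(&cl);
                enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
                wake_up_process(t);
        }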
-static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+static bool nodes_overlap(const struct found_btree_node *l,
+ const struct found_btree_node *r)
{
- while (n + 1 < end &&
- found_btree_node_cmp_pos(n, n + 1) > 0) {
- swap(n[0], n[1]);
- n++;
- }
+ return (l->btree_id == r->btree_id &&
+ l->level == r->level &&
+ bpos_gt(l->max_key, r->min_key));
}
static int handle_overwrites(struct bch_fs *c,
- struct found_btree_node *start,
- struct found_btree_node *end)
+ struct found_btree_node *l,
+ found_btree_nodes *nodes_heap)
{
- struct found_btree_node *n;
-again:
- for (n = start + 1;
- n < end &&
- n->btree_id == start->btree_id &&
- n->level == start->level &&
- bpos_lt(n->min_key, start->max_key);
- n++) {
- int cmp = found_btree_node_cmp_time(start, n);
+ struct found_btree_node *r;
+
+ while ((r = min_heap_peek(nodes_heap)) &&
+ nodes_overlap(l, r)) {
+ int cmp = found_btree_node_cmp_time(l, r);
if (cmp > 0) {
- if (bpos_cmp(start->max_key, n->max_key) >= 0)
- n->overwritten = true;
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
else {
- n->range_updated = true;
- n->min_key = bpos_successor(start->max_key);
- n->range_updated = true;
- bubble_up(n, end);
- goto again;
+ r->range_updated = true;
+ r->min_key = bpos_successor(l->max_key);
+ r->range_updated = true;
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
}
} else if (cmp < 0) {
- BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+ BUG_ON(bpos_eq(l->min_key, r->min_key));
- start->max_key = bpos_predecessor(n->min_key);
- start->range_updated = true;
+ l->max_key = bpos_predecessor(r->min_key);
+ l->range_updated = true;
+ } else if (r->level) {
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
} else {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
- found_btree_node_to_text(&buf, c, start);
- prt_str(&buf, "\n ");
- found_btree_node_to_text(&buf, c, n);
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_fsck_repair_unimplemented;
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
+ else {
+ r->range_updated = true;
+ r->min_key = bpos_successor(l->max_key);
+ r->range_updated = true;
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
+ }
}
}
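handle_overwrites() thus turns from an in-place array scan, which flagged losers as ->overwritten and re-sorted with bubble_up(), into a heap consumer: while the heap minimum r overlaps the most recently accepted node l, it drops or trims whichever side the seq/journal_seq comparison says is stale. Overlapping nodes with equal time, which previously aborted with fsck_repair_unimplemented, are now resolved too: interior nodes are simply dropped, and leaves fall through to the same trim/drop rules as the newer-l case. A worked illustration of the three main cases (ranges written as plain integers rather than real struct bpos values):

        /*
         * l = last accepted node, r = current heap minimum, same btree & level
         *
         * 1) l newer, partial overlap:
         *      l: [100,200] seq 12     r: [150,250] seq 10
         *      -> r trimmed: r->min_key = bpos_successor(l->max_key), r = [201,250]
         *
         * 2) l newer, full overlap:
         *      l: [100,300] seq 12     r: [150,250] seq 10
         *      -> r popped from the heap and dropped
         *
         * 3) r newer:
         *      l: [100,200] seq 10     r: [150,250] seq 12
         *      -> l trimmed: l->max_key = bpos_predecessor(r->min_key), l = [100,149]
         */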
@@ -322,6 +372,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
struct find_btree_nodes *f = &c->found_btree_nodes;
struct printbuf buf = PRINTBUF;
+ found_btree_nodes nodes_heap = {};
size_t dst;
int ret = 0;
@@ -344,10 +395,10 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes found:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
}
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
dst = 0;
darray_for_each(f->nodes, i) {
@@ -367,38 +418,66 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
}
f->nodes.nr = dst;
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
if (0 && c->opts.verbose) {
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
}
- dst = 0;
- darray_for_each(f->nodes, i) {
- if (i->overwritten)
- continue;
+ swap(nodes_heap, f->nodes);
- ret = handle_overwrites(c, i, &darray_top(f->nodes));
+ {
+ /* darray must have same layout as a heap */
+ min_heap_char real_heap;
+ BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
+ BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
+ BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
+ BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
+ }
+
+ min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
+
+ if (nodes_heap.nr) {
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
if (ret)
goto err;
- BUG_ON(i->overwritten);
- f->nodes.data[dst++] = *i;
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
}
- f->nodes.nr = dst;
- if (c->opts.verbose) {
+ while (true) {
+ ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
+ if (ret)
+ goto err;
+
+ if (!nodes_heap.nr)
+ break;
+
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
+ if (ret)
+ goto err;
+
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
+ }
+
+ for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
+ BUG_ON(nodes_overlap(n, n + 1));
+
+ if (0 && c->opts.verbose) {
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
+ } else {
+ bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
}
eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
+ darray_exit(&nodes_heap);
printbuf_exit(&buf);
return ret;
}
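One subtlety in the function above: swap(nodes_heap, f->nodes) hands the darray's buffer to a variable that is then used directly as a min-heap, and min_heapify_all() builds the heap in O(n) rather than via n individual O(log n) pushes. The reinterpretation is only legal because both container headers lead with the same nr/size pair followed by the data pointer, which is precisely what the BUILD_BUG_ON block pins down. The two shapes, roughly (reconstructed from fs/bcachefs/darray.h and <linux/min_heap.h>, illustrative):

        /* bcachefs darray */
        struct {
                size_t  nr, size;
                type    *data;
        };

        /* min_heap_char, i.e. DEFINE_MIN_HEAP(char, min_heap_char) */
        struct min_heap_char {
                size_t  nr;
                size_t  size;
                char    *data;
                char    preallocated[0];
        };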
@@ -462,14 +541,16 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
struct find_btree_nodes *f = &c->found_btree_nodes;
- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
if (ret)
return ret;
if (c->opts.verbose) {
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+ prt_str(&buf, "recovery ");
+ bch2_btree_id_level_to_text(&buf, btree, level);
+ prt_str(&buf, " ");
bch2_bpos_to_text(&buf, node_min);
prt_str(&buf, " - ");
bch2_bpos_to_text(&buf, node_max);
@@ -498,12 +579,19 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
found_btree_node_to_key(&tmp.k, &n);
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
- bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
- printbuf_exit(&buf);
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
- BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = level + 1,
+ .btree = btree,
+ }));
ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
if (ret)