diff options
Diffstat (limited to 'fs/bcachefs/btree_update_interior.c')
-rw-r--r-- | fs/bcachefs/btree_update_interior.c | 187 |
1 files changed, 123 insertions, 64 deletions
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 00307356d7c8..e77584607f0d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -14,6 +14,7 @@ #include "btree_locking.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -56,8 +57,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) struct bkey_buf prev; int ret = 0; - printbuf_indent_add_nextline(&buf, 2); - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); @@ -68,20 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { - ret = __bch2_topology_error(c, &buf); - + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect min_key: "); bch2_bpos_to_text(&buf, b->data->min_key); - log_fsck_err(trans, btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); + goto err; } if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - ret = __bch2_topology_error(c, &buf); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect max_key: "); bch2_bpos_to_text(&buf, b->data->max_key); - log_fsck_err(trans, btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); + goto err; } } @@ -99,19 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : bpos_successor(prev.k->k.p); if (!bpos_eq(expected_min, bp.v->min_key)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "end of prev node doesn't match start of next node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "end of prev node doesn't match start of next node"); prt_str(&buf, "\nprev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); prt_str(&buf, "\nnext "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto out; + bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); + goto err; } bch2_bkey_buf_reassemble(&prev, c, k); @@ -119,32 +117,34 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (bkey_deleted(&prev.k->k)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "empty interior node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - ret = __bch2_topology_error(c, &buf); + prt_printf(&buf, "empty interior node\n"); + bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); + goto err; + } - prt_str(&buf, "last child node doesn't end at end of parent node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\nlast key "); + if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); + goto err; } out: -fsck_err: bch2_btree_and_journal_iter_exit(&iter); bch2_bkey_buf_exit(&prev, c); printbuf_exit(&buf); return ret; +err: + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_newline(&buf); + + ret = __bch2_topology_error(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + BUG_ON(!ret); + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -284,6 +284,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, bool interior_node, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -317,6 +318,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, + target ?: c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -325,7 +327,9 @@ retry: res->nr_replicas, min(res->nr_replicas, c->opts.metadata_replicas_required), - watermark, 0, cl, &wp); + watermark, + target ? BCH_WRITE_only_specified_devs : 0, + cl, &wp); if (unlikely(ret)) goto err; @@ -505,6 +509,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * static int bch2_btree_reserve_get(struct btree_trans *trans, struct btree_update *as, unsigned nr_nodes[2], + unsigned target, unsigned flags, struct closure *cl) { @@ -527,7 +532,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, while (p->nr < nr_nodes[interior]) { b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, flags); + interior, target, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -679,12 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as) /* * Wait for any in flight writes to finish before we free the old nodes - * on disk: + * on disk. But we haven't pinned those old nodes in the btree cache, + * they might have already been evicted. + * + * The update we're completing deleted references to those nodes from the + * btree, so we know if they've been evicted they can't be pulled back in. + * We just have to check if the nodes we have pointers to are still those + * old nodes, and haven't been reused. + * + * This can't be done locklessly because the data buffer might have been + * vmalloc allocated, and they're not RCU freed. We also need the + * __no_kmsan_checks annotation because even with the btree node read + * lock, nothing tells us that the data buffer has been initialized (if + * the btree node has been reused for a different node, and the data + * buffer swapped for a new data buffer). */ for (i = 0; i < as->nr_old_nodes; i++) { b = as->old_nodes[i]; - if (btree_node_seq_matches(b, as->old_nodes_seq[i])) + bch2_trans_begin(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); + six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); + + if (seq_matches) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -1114,9 +1138,17 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * start_time); } +static const char * const btree_node_reawrite_reason_strs[] = { +#define x(n) #n, + BTREE_NODE_REWRITE_REASON() +#undef x + NULL, +}; + static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, unsigned flags) + unsigned level_start, bool split, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1207,6 +1239,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); + struct btree *b = btree_path_node(path, path->level); + as->node_start = b->data->min_key; + as->node_end = b->data->max_key; + as->node_needed_rewrite = btree_node_rewrite_reason(b); + as->node_written = b->written; + as->node_sectors = btree_buf_bytes(b) >> 9; + as->node_remaining = __bch2_btree_u64s_remaining(b, + btree_bkey_last(b, bset_tree_last(b))); + /* * We don't want to allocate if we're in an error state, that can cause * deadlock on emergency shutdown due to open buckets getting stuck in @@ -1226,7 +1267,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); if (bch2_err_matches(ret, ENOSPC) || bch2_err_matches(ret, ENOMEM)) { struct closure cl; @@ -1238,14 +1279,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto err; } closure_init_stack(&cl); do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); @@ -1806,10 +1847,10 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t __func__, b->c.level); bch2_btree_update_to_text(&buf, as); bch2_btree_path_to_text(&buf, trans, path_idx); + bch2_fs_emergency_read_only2(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - bch2_fs_emergency_read_only(c); return -EIO; } @@ -1878,7 +1919,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, as = bch2_btree_update_start(trans, trans->paths + path, trans->paths[path].level, - true, flags); + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1948,7 +1989,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, return bch2_btree_split_leaf(trans, path, flags); struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2077,11 +2119,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(trans->paths + path, b); as = bch2_btree_update_start(trans, trans->paths + path, level, false, - BCH_TRANS_COMMIT_no_enospc|flags); + 0, BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; + as->node_start = prev->data->min_key; + as->node_end = next->data->max_key; + trace_and_count(c, btree_node_merge, trans, b); n = bch2_btree_node_alloc(as, trans, b->c.level); @@ -2170,7 +2215,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); - ret = -BCH_ERR_btree_node_dying; + ret = bch_err_throw(trans->c, btree_node_dying); goto err; } @@ -2184,6 +2229,7 @@ err: int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -2196,7 +2242,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, iter); parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, false, flags); + as = bch2_btree_update_start(trans, path, b->c.level, + false, target, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -2261,7 +2308,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, flags) + ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) : -ENOENT; out: bch2_trans_iter_exit(trans, &iter); @@ -2270,7 +2317,9 @@ out: int bch2_btree_node_rewrite_pos(struct btree_trans *trans, enum btree_id btree, unsigned level, - struct bpos pos, unsigned flags) + struct bpos pos, + unsigned target, + unsigned flags) { BUG_ON(!level); @@ -2282,7 +2331,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans, if (ret) goto err; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -2296,7 +2345,7 @@ int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, if (ret) return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2330,7 +2379,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) closure_wake_up(&c->btree_node_rewrites_wait); bch2_bkey_buf_exit(&a->key, c); - bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2351,8 +2400,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && - bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && + enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { @@ -2391,7 +2440,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) if (!a) break; - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } } @@ -2651,9 +2700,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_str(out, " "); bch2_btree_id_to_text(out, as->btree_id); - prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, " l=%u-%u ", as->update_level_start, - as->update_level_end, + as->update_level_end); + bch2_bpos_to_text(out, as->node_start); + prt_char(out, ' '); + bch2_bpos_to_text(out, as->node_end); + prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", + as->node_written, + as->node_sectors, + as->node_remaining, + btree_node_reawrite_reason_strs[as->node_needed_rewrite]); + + prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), @@ -2780,16 +2839,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); c->btree_node_rewrite_worker = alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); if (!c->btree_node_rewrite_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) - return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); return 0; } |