diff options
Diffstat (limited to 'fs/bcachefs/btree_trans_commit.c')
-rw-r--r-- | fs/bcachefs/btree_trans_commit.c | 510 |
1 files changed, 241 insertions, 269 deletions
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index bbec91e8e650..1c03c965d836 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -9,6 +10,8 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "disk_accounting.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -18,6 +21,27 @@ #include "snapshot.h" #include <linux/prefetch.h> +#include <linux/string_helpers.h> + +static const char * const trans_commit_flags_strs[] = { +#define x(n, ...) #n, + BCH_TRANS_COMMIT_FLAGS() +#undef x + NULL +}; + +void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) +{ + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + + prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); + + flags >>= BCH_WATERMARK_BITS; + if (flags) { + prt_char(out, ' '); + bch2_prt_bitflags(out, trans_commit_flags_strs, flags); + } +} static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { @@ -111,11 +135,12 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) return 0; } -static inline void bch2_trans_unlock_write(struct btree_trans *trans) +static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) + if (btree_node_locked_type(trans->paths + i->path, i->level) == + BTREE_NODE_WRITE_LOCKED) bch2_btree_node_unlock_write_inlined(trans, trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; @@ -141,6 +166,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -191,7 +217,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + bch2_bset_insert(b, k, insert, clobber_u64s); new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) @@ -207,14 +233,14 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); struct btree_trans *trans = bch2_trans_get(c); - unsigned long old, new, v; + unsigned long old, new; unsigned idx = w - b->writes; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; if (!(old & (1 << BTREE_NODE_dirty)) || !!(old & (1 << BTREE_NODE_write_idx)) != idx || @@ -224,9 +250,9 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new &= ~BTREE_WRITE_TYPE_MASK; new |= BTREE_WRITE_journal_reclaim; new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); - btree_node_write_if_need(c, b, SIX_LOCK_read); + btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); bch2_trans_put(trans); @@ -313,10 +339,11 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->cached != path->cached); BUG_ON(i->level != path->level); BUG_ON(i->btree_id != path->btree_id); + BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + !(i->flags & BTREE_UPDATE_internal_snapshot_node) && + test_bit(JOURNAL_replay_done, &trans->c->journal.flags) && i->k->k.p.snapshot && bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } @@ -325,7 +352,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, flags, trans); } #define JSET_ENTRY_LOG_U64s 4 @@ -341,7 +368,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans) struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); + memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), + trans->fn, strlen(trans->fn), 0); } static inline int btree_key_can_insert(struct btree_trans *trans, @@ -361,7 +389,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct bkey_i *new_k; int ret; - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); bch2_trans_unlock(trans); new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); @@ -435,34 +463,35 @@ static int run_one_mem_trigger(struct btree_trans *trans, struct btree_insert_entry *i, unsigned flags) { + verify_update_old_key(trans, i); + + if (unlikely(flags & BTREE_TRIGGER_norun)) + return 0; + struct bkey_s_c old = { &i->old_k, i->old_v }; struct bkey_i *new = i->k; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - int ret; - verify_update_old_key(trans, i); - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (old_ops->trigger == new_ops->trigger) { - ret = bch2_key_trigger(trans, i->btree_id, i->level, + if (old_ops->trigger == new_ops->trigger) + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_key_trigger_new(trans, i->btree_id, i->level, + BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); + else + return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(new), flags) ?: - bch2_key_trigger_old(trans, i->btree_id, i->level, + bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags); - } - - return ret; } -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) { + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_norun) || + !btree_node_type_has_trans_triggers(i->bkey_type)) + return 0; + /* * Transactional triggers create new btree_insert_entries, so we can't * pass them a pointer to a btree_insert_entry, that memory is going to @@ -472,13 +501,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; - - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; + unsigned flags = i->flags|BTREE_TRIGGER_transactional; if (!i->insert_trigger_run && !i->overwrite_trigger_run && @@ -486,12 +509,12 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ i->overwrite_trigger_run = true; i->insert_trigger_run = true; return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE|flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { + BTREE_TRIGGER_insert| + BTREE_TRIGGER_overwrite|flags) ?: 1; + } else if (!i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { + } else if (!i->insert_trigger_run) { i->insert_trigger_run = true; return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { @@ -499,81 +522,49 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } } -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned btree_id_start) +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - bool trans_trigger_run; - int ret, overwrite; + unsigned sort_id_start = 0; - for (overwrite = 1; overwrite >= 0; --overwrite) { + while (sort_id_start < trans->nr_updates) { + unsigned i, sort_id = trans->updates[sort_id_start].sort_order; + bool trans_trigger_run; /* - * Running triggers will append more updates to the list of updates as - * we're walking it: + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being + * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop + * references before they are re-added. + * + * Running triggers will append more updates to the list of + * updates as we're walking it: */ do { trans_trigger_run = false; - for (unsigned i = btree_id_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; + for (i = sort_id_start; + i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; i++) { - if (trans->updates[i].btree_id != btree_id) + if (trans->updates[i].sort_order < sort_id) { + sort_id_start = i; continue; + } - ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); + int ret = run_one_trans_trigger(trans, trans->updates + i); if (ret < 0) return ret; if (ret) trans_trigger_run = true; } } while (trans_trigger_run); - } - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - unsigned btree_id = 0, btree_id_start = 0; - int ret = 0; - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; - - while (btree_id_start < trans->nr_updates && - trans->updates[btree_id_start].btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); - if (ret) - return ret; - } - - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); - if (ret) - return ret; - break; - } + sort_id_start = i; } #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && + btree_node_type_has_trans_triggers(i->bkey_type) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); #endif return 0; @@ -581,20 +572,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - trans_for_each_update(trans, i) { - /* - * XXX: synchronization of cached update triggers with gc - * XXX: synchronization of interior node updates with gc - */ - BUG_ON(i->cached || i->level); - - if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && - gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + trans_for_each_update(trans, i) + if (btree_node_type_has_triggers(i->bkey_type) && + gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; } - } return 0; } @@ -607,11 +591,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; struct btree_trans_commit_hook *h; unsigned u64s = 0; - int ret; + int ret = 0; + + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } /* @@ -661,32 +647,41 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (bch2_journal_seq_verify) + if (static_branch_unlikely(&bch2_journal_seq_verify)) trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) + i->k->k.bversion.lo = trans->journal_res.seq; + else if (static_branch_unlikely(&bch2_inject_invalid_keys)) trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; + i->k->k.bversion = MAX_VERSION; } - if (trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return -BCH_ERR_btree_insert_need_mark_replicas; - - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - h = trans->hooks; while (h) { ret = h->fn(trans, h); if (ret) - goto revert_fs_usage; + return ret; h = h->next; } + struct bkey_i *accounting; + + percpu_down_read(&c->mark_lock); + for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); + accounting != btree_trans_subbuf_top(trans, &trans->accounting); + accounting = bkey_next(accounting)) { + ret = bch2_accounting_trans_commit_hook(trans, + bkey_i_to_accounting(accounting), flags); + if (ret) + goto revert_fs_usage; + } + percpu_up_read(&c->mark_lock); + + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); + trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); + if (btree_node_type_has_atomic_triggers(i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); if (ret) goto fatal_err; } @@ -697,6 +692,37 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } + struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; + + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); + i = vstruct_next(i)) { + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, validate_context); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); + goto fatal_err; + } + } + + trans_for_each_update(trans, i) { + validate_context.level = i->level; + validate_context.btree = i->btree_id; + + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); + goto fatal_err; + } + btree_insert_entry_checks(trans, i); + } + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -705,7 +731,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; verify_update_old_key(trans, i); @@ -727,11 +753,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->journal_entries, - trans->journal_entries_u64s); + btree_trans_journal_entries_start(trans), + trans->journal_entries.u64s); + + trans->journal_res.offset += trans->journal_entries.u64s; + trans->journal_res.u64s -= trans->journal_entries.u64s; - trans->journal_res.offset += trans->journal_entries_u64s; - trans->journal_res.u64s -= trans->journal_entries_u64s; + memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_write_buffer_keys, + BTREE_ID_accounting, 0, + trans->accounting.u64s)->_data, + btree_trans_subbuf_base(trans, &trans->accounting), + trans->accounting.u64s); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -740,75 +773,38 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; - if (!i->cached) { + if (!i->cached) bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - } else if (!i->key_cache_already_flushed) + else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); - else { + else bch2_btree_key_cache_drop(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - } } return 0; fatal_err: - bch2_fatal_error(c); + bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); + percpu_down_read(&c->mark_lock); revert_fs_usage: - if (trans->fs_usage_deltas) - bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != accounting; + i = bkey_next(i)) + bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); + percpu_up_read(&c->mark_lock); return ret; } static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { + /* + * Accounting keys aren't deduped in the journal: we have to compare + * each individual update against what's in the btree to see if it has + * been applied yet, and accounting updates also don't overwrite, + * they're deltas that accumulate. + */ trans_for_each_update(trans, i) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -} - -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bkey_invalid_flags flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", - trans->fn, (void *) i->ip_allocated); - prt_newline(err); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - -static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, - struct jset_entry *i) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_journal_entry_to_text(&buf, c, i); - prt_newline(&buf); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; + if (i->k->k.type != KEY_TYPE_accounting) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } static int bch2_trans_commit_journal_pin_flush(struct journal *j, @@ -856,7 +852,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, @@ -876,7 +872,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags static int journal_reclaim_wait_done(struct bch_fs *c) { int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); + bch2_btree_key_cache_wait_done(c); if (!ret) journal_reclaim_kick(&c->journal); @@ -891,18 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, - trace_ip, trans->paths + i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); - break; - case -BCH_ERR_journal_res_get_blocked: + if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { /* * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK * flag @@ -910,21 +895,38 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; + goto out; } ret = drop_locks_do(trans, bch2_trans_journal_res_get(trans, (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_CHECK)); + goto out; + } + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_accounting_update_sb(trans)); break; case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); + + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); + if (ret < 0) break; @@ -934,7 +936,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, BUG_ON(ret >= 0); break; } - +out: BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && @@ -944,24 +946,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, return ret; } -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || - test_bit(BCH_FS_started, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - /* * This is for updates done in the early part of fsck - btree_gc - before we've * gone RW. we only add the new key to the list of keys for journal replay to @@ -971,15 +955,34 @@ static noinline int do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; - int ret = 0; + + BUG_ON(current != c->recovery_task); trans_for_each_update(trans, i) { - ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) - break; + return ret; } - return ret; + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); + i = vstruct_next(i)) + if (i->type == BCH_JSET_ENTRY_btree_keys || + i->type == BCH_JSET_ENTRY_write_buffer_keys) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); + if (ret) + return ret; + } + + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); + if (ret) + return ret; + } + + return 0; } int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) @@ -988,65 +991,33 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; - if (!trans->nr_updates && - !trans->journal_entries_u64s) + bch2_trans_verify_not_unlocked_or_in_restart(trans); + + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) goto out_reset; - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + if (!trans->nr_updates && + !trans->journal_entries.u64s && + !trans->accounting.u64s) + goto out_reset; ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; - trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) - return ret; - } - - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) { - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags))) - ret = bch2_trans_commit_journal_entry_invalid(trans, i); - - if (ret) - return ret; - } - - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; + unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) + ret = do_bch2_trans_commit_to_journal_replay(trans); + else + ret = -BCH_ERR_erofs_trans_commit; + goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries_u64s; + trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1065,7 +1036,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; /* we're going to journal the key being updated: */ @@ -1086,9 +1057,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) } retry: errored_at = NULL; - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); @@ -1101,7 +1073,7 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); out_reset: if (!ret) bch2_trans_downgrade(trans); |