Diffstat (limited to 'fs/bcachefs/btree_journal_iter.c')
-rw-r--r-- | fs/bcachefs/btree_journal_iter.c | 434
1 file changed, 357 insertions, 77 deletions
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 50e04356d72c..ea839560a136 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -16,19 +16,15 @@
  * operations for the regular btree iter code to use:
  */
 
-static int __journal_key_cmp(enum btree_id l_btree_id,
-                             unsigned l_level,
-                             struct bpos l_pos,
-                             const struct journal_key *r)
+static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
 {
-        return (cmp_int(l_btree_id, r->btree_id) ?:
-                cmp_int(l_level, r->level) ?:
-                bpos_cmp(l_pos, r->k->k.p));
-}
+        size_t gap_size = keys->size - keys->nr;
 
-static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
-        return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+        BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
+
+        if (pos >= keys->gap)
+                pos -= gap_size;
+        return pos;
 }
 
 static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
@@ -76,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
 }
 
 /* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
                                            unsigned level, struct bpos pos,
                                            struct bpos end_pos, size_t *idx)
 {
@@ -99,27 +95,102 @@ search:
                 }
         }
 
+        struct bkey_i *ret = NULL;
+        rcu_read_lock(); /* for overwritten_ranges */
+
         while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
                 if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
-                        return NULL;
+                        break;
 
                 if (k->overwritten) {
-                        (*idx)++;
+                        if (k->overwritten_range)
+                                *idx = rcu_dereference(k->overwritten_range)->end;
+                        else
+                                *idx += 1;
                         continue;
                 }
 
-                if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
-                        return k->k;
+                if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
+                        ret = k->k;
+                        break;
+                }
 
                 (*idx)++;
                 iters++;
                 if (iters == 10) {
                         *idx = 0;
+                        rcu_read_unlock();
                         goto search;
                 }
         }
 
-        return NULL;
+        rcu_read_unlock();
+        return ret;
+}
+
+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
+                                               unsigned level, struct bpos pos,
+                                               struct bpos end_pos, size_t *idx)
+{
+        struct journal_keys *keys = &c->journal_keys;
+        unsigned iters = 0;
+        struct journal_key *k;
+
+        BUG_ON(*idx > keys->nr);
+
+        if (!keys->nr)
+                return NULL;
+search:
+        if (!*idx)
+                *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+        while (*idx < keys->nr &&
+               __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
+                (*idx)++;
+                iters++;
+                if (iters == 10) {
+                        *idx = 0;
+                        goto search;
+                }
+        }
+
+        if (*idx == keys->nr)
+                --(*idx);
+
+        struct bkey_i *ret = NULL;
+        rcu_read_lock(); /* for overwritten_ranges */
+
+        while (true) {
+                k = idx_to_key(keys, *idx);
+                if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
+                        break;
+
+                if (k->overwritten) {
+                        if (k->overwritten_range)
+                                *idx = rcu_dereference(k->overwritten_range)->start;
+                        if (!*idx)
+                                break;
+                        --(*idx);
+                        continue;
+                }
+
+                if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
+                        ret = k->k;
+                        break;
+                }
+
+                if (!*idx)
+                        break;
+                --(*idx);
+                iters++;
+                if (iters == 10) {
+                        *idx = 0;
+                        goto search;
+                }
+        }
+
+        rcu_read_unlock();
+        return ret;
 }
 
 struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
@@ -127,7 +198,25 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
 {
         size_t idx = 0;
 
-        return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+        return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iter_verify(struct journal_iter *iter)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+        struct journal_keys *keys = iter->keys;
+        size_t gap_size = keys->size - keys->nr;
+
+        BUG_ON(iter->idx >= keys->gap &&
+               iter->idx <  keys->gap + gap_size);
+
+        if (iter->idx < keys->size) {
+                struct journal_key *k = keys->data + iter->idx;
+
+                int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+                BUG_ON(cmp > 0);
+        }
+#endif
 }
 
 static void journal_iters_fix(struct bch_fs *c)
@@ -135,7 +224,8 @@ static void journal_iters_fix(struct bch_fs *c)
         struct journal_keys *keys = &c->journal_keys;
         /* The key we just inserted is immediately before the gap: */
         size_t gap_end = keys->gap + (keys->size - keys->nr);
-        struct btree_and_journal_iter *iter;
+        struct journal_key *new_key = &keys->data[keys->gap - 1];
+        struct journal_iter *iter;
 
         /*
          * If an iterator points one after the key we just inserted, decrement
@@ -143,9 +233,14 @@ static void journal_iters_fix(struct bch_fs *c)
          * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
          * handle that:
          */
-        list_for_each_entry(iter, &c->journal_iters, journal.list)
-                if (iter->journal.idx == gap_end)
-                        iter->journal.idx = keys->gap - 1;
+        list_for_each_entry(iter, &c->journal_iters, list) {
+                journal_iter_verify(iter);
+                if (iter->idx == gap_end &&
+                    new_key->btree_id == iter->btree_id &&
+                    new_key->level == iter->level)
+                        iter->idx = keys->gap - 1;
+                journal_iter_verify(iter);
+        }
 }
 
 static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -174,7 +269,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                  * Ensure these keys are done last by journal replay, to unblock
                  * journal reclaim:
                  */
-                .journal_seq = U32_MAX,
+                .journal_seq = U64_MAX,
         };
         struct journal_keys *keys = &c->journal_keys;
         size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
@@ -192,17 +287,22 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
         if (idx > keys->gap)
                 idx -= keys->size - keys->nr;
 
+        size_t old_gap = keys->gap;
+
         if (keys->nr == keys->size) {
+                journal_iters_move_gap(c, old_gap, keys->size);
+                old_gap = keys->size;
+
                 struct journal_keys new_keys = {
                         .nr   = keys->nr,
                         .size = max_t(size_t, keys->size, 8) * 2,
                 };
 
-                new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+                new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL);
                 if (!new_keys.data) {
                         bch_err(c, "%s: error allocating new key array (size %zu)",
                                 __func__, new_keys.size);
-                        return -BCH_ERR_ENOMEM_journal_key_insert;
+                        return bch_err_throw(c, ENOMEM_journal_key_insert);
                 }
 
                 /* Since @keys was full, there was no gap: */
@@ -216,7 +316,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                 keys->gap = keys->nr;
         }
 
-        journal_iters_move_gap(c, keys->gap, idx);
+        journal_iters_move_gap(c, old_gap, idx);
 
         move_gap(keys, idx);
 
@@ -241,7 +341,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
 
         n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
         if (!n)
-                return -BCH_ERR_ENOMEM_journal_key_insert;
+                return bch_err_throw(c, ENOMEM_journal_key_insert);
 
         bkey_copy(n, k);
         ret = bch2_journal_key_insert_take(c, id, level, n);
@@ -261,6 +361,84 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
         return bch2_journal_key_insert(c, id, level, &whiteout);
 }
 
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+                                 unsigned level, struct bpos pos)
+{
+        struct journal_keys *keys = &trans->c->journal_keys;
+        size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+        if (!trans->journal_replay_not_finished)
+                return false;
+
+        return (idx < keys->size &&
+                keys->data[idx].btree_id == btree &&
+                keys->data[idx].level == level &&
+                bpos_eq(keys->data[idx].k->k.p, pos) &&
+                bkey_deleted(&keys->data[idx].k->k));
+}
+
+static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
+{
+        struct journal_key *k = keys->data + pos;
+        size_t idx = pos_to_idx(keys, pos);
+
+        k->overwritten = true;
+
+        struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
+        struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
+
+        bool prev_overwritten = prev && prev->overwritten;
+        bool next_overwritten = next && next->overwritten;
+
+        struct journal_key_range_overwritten *prev_range =
+                prev_overwritten ? prev->overwritten_range : NULL;
+        struct journal_key_range_overwritten *next_range =
+                next_overwritten ? next->overwritten_range : NULL;
+
+        BUG_ON(prev_range && prev_range->end != idx);
+        BUG_ON(next_range && next_range->start != idx + 1);
+
+        if (prev_range && next_range) {
+                prev_range->end = next_range->end;
+
+                keys->data[pos].overwritten_range = prev_range;
+                for (size_t i = next_range->start; i < next_range->end; i++) {
+                        struct journal_key *ip = keys->data + idx_to_pos(keys, i);
+                        BUG_ON(ip->overwritten_range != next_range);
+                        ip->overwritten_range = prev_range;
+                }
+
+                kfree_rcu_mightsleep(next_range);
+        } else if (prev_range) {
+                prev_range->end++;
+                k->overwritten_range = prev_range;
+                if (next_overwritten) {
+                        prev_range->end++;
+                        next->overwritten_range = prev_range;
+                }
+        } else if (next_range) {
+                next_range->start--;
+                k->overwritten_range = next_range;
+                if (prev_overwritten) {
+                        next_range->start--;
+                        prev->overwritten_range = next_range;
+                }
+        } else if (prev_overwritten || next_overwritten) {
+                struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
+                if (!r)
+                        return;
+
+                r->start = idx - (size_t) prev_overwritten;
+                r->end = idx + 1 + (size_t) next_overwritten;
+
+                rcu_assign_pointer(k->overwritten_range, r);
+                if (prev_overwritten)
+                        prev->overwritten_range = r;
+                if (next_overwritten)
+                        next->overwritten_range = r;
+        }
+}
+
 void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
                                   unsigned level, struct bpos pos)
 {
@@ -270,8 +448,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
         if (idx < keys->size &&
             keys->data[idx].btree_id == btree &&
             keys->data[idx].level == level &&
-            bpos_eq(keys->data[idx].k->k.p, pos))
-                keys->data[idx].overwritten = true;
+            bpos_eq(keys->data[idx].k->k.p, pos) &&
+            !keys->data[idx].overwritten) {
+                mutex_lock(&keys->overwrite_lock);
+                __bch2_journal_key_overwritten(keys, idx);
+                mutex_unlock(&keys->overwrite_lock);
+        }
 }
 
 static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -285,16 +467,24 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
 
 static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
 {
-        struct journal_key *k = iter->keys->data + iter->idx;
+        journal_iter_verify(iter);
+
+        guard(rcu)();
+        while (iter->idx < iter->keys->size) {
+                struct journal_key *k = iter->keys->data + iter->idx;
+
+                int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+                if (cmp < 0)
+                        break;
+                BUG_ON(cmp);
 
-        while (k < iter->keys->data + iter->keys->size &&
-               k->btree_id == iter->btree_id &&
-               k->level == iter->level) {
                 if (!k->overwritten)
                         return bkey_i_to_s_c(k->k);
 
-                bch2_journal_iter_advance(iter);
-                k = iter->keys->data + iter->idx;
+                if (k->overwritten_range)
+                        iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
+                else
+                        bch2_journal_iter_advance(iter);
         }
 
         return bkey_s_c_null;
@@ -314,6 +504,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
         iter->level = level;
         iter->keys = &c->journal_keys;
         iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+        journal_iter_verify(iter);
 }
 
 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -346,6 +538,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
                 : (level > 1 ? 1 : 16);
 
         iter.prefetch = false;
+        iter.fail_if_too_many_whiteouts = true;
         bch2_bkey_buf_init(&tmp);
 
         while (nr--) {
@@ -363,7 +556,8 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
 
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
 {
-        struct bkey_s_c btree_k, journal_k, ret;
+        struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
+        size_t iters = 0;
 
         if (iter->prefetch && iter->journal.level)
                 btree_and_journal_iter_prefetch(iter);
@@ -371,13 +565,19 @@ again:
         if (iter->at_end)
                 return bkey_s_c_null;
 
+        iters++;
+
+        if (iters > 20 && iter->fail_if_too_many_whiteouts)
+                return bkey_s_c_null;
+
         while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
                bpos_lt(btree_k.k->p, iter->pos))
                 bch2_journal_iter_advance_btree(iter);
 
-        while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
-               bpos_lt(journal_k.k->p, iter->pos))
-                bch2_journal_iter_advance(&iter->journal);
+        if (iter->trans->journal_replay_not_finished)
+                while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+                       bpos_lt(journal_k.k->p, iter->pos))
+                        bch2_journal_iter_advance(&iter->journal);
 
         ret = journal_k.k &&
                 (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
@@ -417,10 +617,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
         iter->trans = trans;
         iter->b = b;
         iter->node_iter = node_iter;
-        bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
-        INIT_LIST_HEAD(&iter->journal.list);
         iter->pos = b->data->min_key;
         iter->at_end = false;
+        INIT_LIST_HEAD(&iter->journal.list);
+
+        if (trans->journal_replay_not_finished) {
+                bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+                if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+                        list_add(&iter->journal.list, &trans->c->journal_iters);
+        }
 }
 
 /*
@@ -435,21 +640,10 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
 
         bch2_btree_node_iter_init_from_start(&node_iter, b);
         __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
-        list_add(&iter->journal.list, &trans->c->journal_iters);
 }
 
 /* sort and dedup all keys in the journal: */
 
-void bch2_journal_entries_free(struct bch_fs *c)
-{
-        struct journal_replay **i;
-        struct genradix_iter iter;
-
-        genradix_for_each(&c->journal_entries, iter, i)
-                kvfree(*i);
-        genradix_free(&c->journal_entries);
-}
-
 /*
  * When keys compare equal, oldest compares first:
  */
@@ -457,10 +651,11 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 {
         const struct journal_key *l = _l;
         const struct journal_key *r = _r;
+        int rewind = l->rewind && r->rewind ? -1 : 1;
 
         return  journal_key_cmp(l, r) ?:
-                cmp_int(l->journal_seq, r->journal_seq) ?:
-                cmp_int(l->journal_offset, r->journal_offset);
+                ((cmp_int(l->journal_seq, r->journal_seq) ?:
+                  cmp_int(l->journal_offset, r->journal_offset)) * rewind);
 }
 
 void bch2_journal_keys_put(struct bch_fs *c)
@@ -474,25 +669,45 @@ void bch2_journal_keys_put(struct bch_fs *c)
 
         move_gap(keys, keys->nr);
 
-        darray_for_each(*keys, i)
+        darray_for_each(*keys, i) {
+                if (i->overwritten_range &&
+                    (i == &darray_last(*keys) ||
+                     i->overwritten_range != i[1].overwritten_range))
+                        kfree(i->overwritten_range);
+
                 if (i->allocated)
                         kfree(i->k);
+        }
 
         kvfree(keys->data);
         keys->data = NULL;
         keys->nr = keys->gap = keys->size = 0;
 
-        bch2_journal_entries_free(c);
+        struct journal_replay **i;
+        struct genradix_iter iter;
+
+        genradix_for_each(&c->journal_entries, iter, i)
+                kvfree(*i);
+        genradix_free(&c->journal_entries);
 }
 
 static void __journal_keys_sort(struct journal_keys *keys)
 {
-        sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
+        sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
+                       journal_sort_key_cmp, NULL);
+
+        cond_resched();
 
         struct journal_key *dst = keys->data;
 
         darray_for_each(*keys, src) {
-                if (src + 1 < &darray_top(*keys) &&
+                /*
+                 * We don't accumulate accounting keys here because we have to
+                 * compare each individual accounting key against the version in
+                 * the btree during replay:
+                 */
+                if (src->k->k.type != KEY_TYPE_accounting &&
+                    src + 1 < &darray_top(*keys) &&
                     !journal_key_cmp(src, src + 1))
                         continue;
 
@@ -509,6 +724,8 @@ int bch2_journal_keys_sort(struct bch_fs *c)
         struct journal_keys *keys = &c->journal_keys;
         size_t nr_read = 0;
 
+        u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX;
+
         genradix_for_each(&c->journal_entries, iter, _i) {
                 i = *_i;
 
@@ -517,28 +734,43 @@ int bch2_journal_keys_sort(struct bch_fs *c)
 
                 cond_resched();
 
-                for_each_jset_key(k, entry, &i->j) {
-                        struct journal_key n = (struct journal_key) {
-                                .btree_id       = entry->btree_id,
-                                .level          = entry->level,
-                                .k              = k,
-                                .journal_seq    = le64_to_cpu(i->j.seq),
-                                .journal_offset = k->_data - i->j._data,
-                        };
-
-                        if (darray_push(keys, n)) {
-                                __journal_keys_sort(keys);
-
-                                if (keys->nr * 8 > keys->size * 7) {
-                                        bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
-                                                keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
-                                        return -BCH_ERR_ENOMEM_journal_keys_sort;
+                vstruct_for_each(&i->j, entry) {
+                        bool rewind = !entry->level &&
+                                !btree_id_is_alloc(entry->btree_id) &&
+                                le64_to_cpu(i->j.seq) >= rewind_seq;
+
+                        if (entry->type != (rewind
+                                            ? BCH_JSET_ENTRY_overwrite
+                                            : BCH_JSET_ENTRY_btree_keys))
+                                continue;
+
+                        if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start)
+                                continue;
+
+                        jset_entry_for_each_key(entry, k) {
+                                struct journal_key n = (struct journal_key) {
+                                        .btree_id       = entry->btree_id,
+                                        .level          = entry->level,
+                                        .rewind         = rewind,
+                                        .k              = k,
+                                        .journal_seq    = le64_to_cpu(i->j.seq),
+                                        .journal_offset = k->_data - i->j._data,
+                                };
+
+                                if (darray_push(keys, n)) {
+                                        __journal_keys_sort(keys);
+
+                                        if (keys->nr * 8 > keys->size * 7) {
+                                                bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+                                                        keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+                                                return bch_err_throw(c, ENOMEM_journal_keys_sort);
+                                        }
+
+                                        BUG_ON(darray_push(keys, n));
                                 }
 
-                                BUG_ON(darray_push(keys, n));
+                                nr_read++;
                         }
-
-                        nr_read++;
                 }
         }
 
@@ -548,3 +780,51 @@ int bch2_journal_keys_sort(struct bch_fs *c)
         bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
         return 0;
 }
+
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+                                  unsigned level_min, unsigned level_max,
+                                  struct bpos start, struct bpos end)
+{
+        struct journal_keys *keys = &c->journal_keys;
+        size_t dst = 0;
+
+        move_gap(keys, keys->nr);
+
+        darray_for_each(*keys, i)
+                if (!(i->btree_id == btree &&
+                      i->level >= level_min &&
+                      i->level <= level_max &&
+                      bpos_ge(i->k->k.p, start) &&
+                      bpos_le(i->k->k.p, end)))
+                        keys->data[dst++] = *i;
+        keys->nr = keys->gap = dst;
+}
+
+void bch2_journal_keys_dump(struct bch_fs *c)
+{
+        struct journal_keys *keys = &c->journal_keys;
+        struct printbuf buf = PRINTBUF;
+
+        pr_info("%zu keys:", keys->nr);
+
+        move_gap(keys, keys->nr);
+
+        darray_for_each(*keys, i) {
+                printbuf_reset(&buf);
+                prt_printf(&buf, "btree=");
+                bch2_btree_id_to_text(&buf, i->btree_id);
+                prt_printf(&buf, " l=%u ", i->level);
+                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+                pr_err("%s", buf.buf);
+        }
+        printbuf_exit(&buf);
+}
+
+void bch2_fs_journal_keys_init(struct bch_fs *c)
+{
+        struct journal_keys *keys = &c->journal_keys;
+
+        atomic_set(&keys->ref, 1);
+        keys->initial_ref_held = true;
+        mutex_init(&keys->overwrite_lock);
+}
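
The pos_to_idx()/idx_to_pos() pair in this patch are the two directions of the gap-buffer mapping that struct journal_keys uses: the nr live keys sit in an array of size slots, with the size - nr free slots parked contiguously at offset gap, so inserts near the gap stay cheap. Below is a minimal userspace sketch of just that mapping; struct gap_buf is a simplified hypothetical stand-in for struct journal_keys, and assert() stands in for BUG_ON().

#include <assert.h>
#include <stddef.h>

/* nr live entries in an array of `size` slots; the free slots start at `gap` */
struct gap_buf {
	size_t nr;
	size_t size;
	size_t gap;
};

/* logical index among the live entries -> physical slot */
static inline size_t idx_to_pos(const struct gap_buf *b, size_t idx)
{
	/* entries at or after the gap are shifted up by the gap size */
	return idx + (idx >= b->gap ? b->size - b->nr : 0);
}

/* physical slot -> logical index; slots inside the gap map to nothing */
static inline size_t pos_to_idx(const struct gap_buf *b, size_t pos)
{
	size_t gap_size = b->size - b->nr;

	assert(!(pos >= b->gap && pos < b->gap + gap_size));

	return pos >= b->gap ? pos - gap_size : pos;
}

int main(void)
{
	/* 6 live keys in 8 slots; slots 4 and 5 are the gap */
	struct gap_buf b = { .nr = 6, .size = 8, .gap = 4 };

	assert(idx_to_pos(&b, 3) == 3);	/* before the gap: identity */
	assert(idx_to_pos(&b, 4) == 6);	/* after it: shifted by gap size */
	assert(pos_to_idx(&b, 6) == 4);	/* and back again */
	return 0;
}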
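__bch2_journal_key_overwritten() coalesces adjacent overwritten keys into a shared journal_key_range_overwritten so the peek paths can jump over a whole run in one step (the overwritten_range checks above) instead of walking it key by key. The sketch below reproduces the same four merge cases (bridge two runs, grow left, grow right, start a new run from lone flags) with the gap buffer, RCU, and the overwrite_lock stripped out; struct range and mark_overwritten() are simplified stand-ins, not the kernel types.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* [start, end) run of overwritten keys, shared by every key in the run */
struct range { size_t start, end; };

struct key {
	bool overwritten;
	struct range *r;	/* NULL for an isolated overwritten key */
};

static void mark_overwritten(struct key *keys, size_t nr, size_t idx)
{
	struct key *k = &keys[idx];
	struct key *prev = idx > 0 ? &keys[idx - 1] : NULL;
	struct key *next = idx + 1 < nr ? &keys[idx + 1] : NULL;
	bool prev_ow = prev && prev->overwritten;
	bool next_ow = next && next->overwritten;
	struct range *pr = prev_ow ? prev->r : NULL;
	struct range *nx = next_ow ? next->r : NULL;

	k->overwritten = true;

	if (pr && nx) {			/* bridge two existing runs */
		pr->end = nx->end;
		for (size_t i = nx->start; i < nx->end; i++)
			keys[i].r = pr;
		k->r = pr;
		free(nx);
	} else if (pr) {		/* grow the run on the left */
		pr->end += 1 + next_ow;	/* absorb a lone right neighbour too */
		k->r = pr;
		if (next_ow)
			next->r = pr;
	} else if (nx) {		/* grow the run on the right */
		nx->start -= 1 + prev_ow;
		k->r = nx;
		if (prev_ow)
			prev->r = nx;
	} else if (prev_ow || next_ow) {	/* neighbours are lone flags */
		struct range *r = malloc(sizeof(*r));
		if (!r)
			return;		/* lone flags still work, just slower */
		r->start = idx - prev_ow;
		r->end   = idx + 1 + next_ow;
		k->r = r;
		if (prev_ow)
			prev->r = r;
		if (next_ow)
			next->r = r;
	}
	/* else: an isolated overwritten key keeps r == NULL */
}

int main(void)
{
	struct key keys[5] = {0};

	mark_overwritten(keys, 5, 1);
	mark_overwritten(keys, 5, 3);
	mark_overwritten(keys, 5, 2);	/* bridges indexes 1 and 3 */

	printf("run covers [%zu, %zu)\n", keys[2].r->start, keys[2].r->end);
	return 0;			/* prints: run covers [1, 4) */
}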
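The rewind factor added to journal_sort_key_cmp() plays against the dedup pass in __journal_keys_sort(), which keeps the last of each run of equal keys. Normally that means the newest journal_seq wins; multiplying the tiebreaker by -1 when both keys were collected for a journal rewind makes the oldest version sort last, so the pre-rewind value is the one kept. A minimal sketch of that ordering trick, with a flat hypothetical struct jk and a local cmp_int() standing in for struct journal_key and the kernel helpers:

#include <stdio.h>
#include <stdlib.h>

struct jk {
	int id;			/* stands in for btree_id/level/pos */
	long long seq;
	int rewind;
};

static int cmp_int(long long l, long long r)
{
	return (l > r) - (l < r);
}

static int sort_cmp(const void *_l, const void *_r)
{
	const struct jk *l = _l, *r = _r;
	int rewind = l->rewind && r->rewind ? -1 : 1;
	int cmp = cmp_int(l->id, r->id);	/* journal_key_cmp() stand-in */

	/* among equal keys, flip "oldest first" when both sides were rewound */
	return cmp ? cmp : cmp_int(l->seq, r->seq) * rewind;
}

int main(void)
{
	struct jk keys[] = {
		{ .id = 1, .seq = 10, .rewind = 1 },
		{ .id = 1, .seq = 7,  .rewind = 1 },
	};

	qsort(keys, 2, sizeof(keys[0]), sort_cmp);

	/* the dedup pass keeps the last of a run of equal keys: here seq 7,
	 * i.e. the oldest version, which is what a rewind wants to restore */
	printf("kept: seq %lld\n", keys[1].seq);
	return 0;
}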