Diffstat (limited to 'fs/bcachefs/btree_key_cache.c')
-rw-r--r-- | fs/bcachefs/btree_key_cache.c | 915
1 file changed, 364 insertions(+), 551 deletions(-)
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 88a3582a3275..9da950e7eb7d 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -32,12 +32,22 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .automatic_shrinking = true, }; +static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, + struct bkey_cached *ck, + enum btree_node_locked_type lock_held) +{ + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; + mark_btree_node_locked(trans, path, 0, lock_held); +} + __flatten inline struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) @@ -69,226 +79,108 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) return true; } -static void bkey_cached_evict(struct btree_key_cache *c, +static bool bkey_cached_evict(struct btree_key_cache *c, struct bkey_cached *ck) { - BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)); - memset(&ck->key, ~0, sizeof(ck->key)); + bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params); + if (ret) { + memset(&ck->key, ~0, sizeof(ck->key)); + atomic_long_dec(&c->nr_keys); + } - atomic_long_dec(&c->nr_keys); + return ret; } -static void bkey_cached_free(struct btree_key_cache *bc, - struct bkey_cached *ck) +static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); + struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - if (ck->c.lock.readers) { - list_move_tail(&ck->list, &bc->freed_pcpu); - bc->nr_freed_pcpu++; - } else { - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - } - atomic_long_inc(&bc->nr_freed); + this_cpu_dec(*c->btree_key_cache.nr_pending); + kmem_cache_free(bch2_key_cache, ck); +} +static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ kfree(ck->k); ck->k = NULL; ck->u64s = 0; six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); -} -#ifdef __KERNEL__ -static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - struct bkey_cached *pos; - - bc->nr_freed_nonpcpu++; - - list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { - if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, - pos->btree_trans_barrier_seq)) { - list_move(&ck->list, &pos->list); - return; - } - } - - list_move(&ck->list, &bc->freed_nonpcpu); + bool pcpu_readers = ck->c.lock.readers != NULL; + rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); + this_cpu_inc(*bc->nr_pending); } -#endif -static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, - struct bkey_cached *ck) +static void 
bkey_cached_free(struct btree_trans *trans, + struct btree_key_cache *bc, + struct bkey_cached *ck) { - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - - if (!ck->c.lock.readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - bool freed = false; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - if (f->nr < ARRAY_SIZE(f->objs)) { - f->objs[f->nr++] = ck; - freed = true; - } - preempt_enable(); - - if (!freed) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (f->nr > ARRAY_SIZE(f->objs) / 2) { - struct bkey_cached *ck2 = f->objs[--f->nr]; - - __bkey_cached_move_to_freelist_ordered(bc, ck2); - } - preempt_enable(); + /* + * we'll hit strange issues in the SRCU code if we aren't holding an + * SRCU read lock... + */ + EBUG_ON(!trans->srcu_held); - __bkey_cached_move_to_freelist_ordered(bc, ck); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_pcpu); - bc->nr_freed_pcpu++; - mutex_unlock(&bc->lock); - } + bkey_cached_free_noassert(bc, ck); } -static void bkey_cached_free_fast(struct btree_key_cache *bc, - struct bkey_cached *ck) +static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - list_del_init(&ck->list); - atomic_long_inc(&bc->nr_freed); - - kfree(ck->k); - ck->k = NULL; - ck->u64s = 0; - - bkey_cached_move_to_freelist(bc, ck); + gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); + struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); + if (unlikely(!ck)) + return NULL; + ck->k = kmalloc(key_u64s * sizeof(u64), gfp); + if (unlikely(!ck->k)) { + kmem_cache_free(bch2_key_cache, ck); + return NULL; + } + ck->u64s = key_u64s; + return ck; } static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, - bool *was_new) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck = NULL; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); int ret; - if (!pcpu_readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - if (f->nr) - ck = f->objs[--f->nr]; - preempt_enable(); - - if (!ck) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (!list_empty(&bc->freed_nonpcpu) && - f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - f->objs[f->nr++] = ck; - } - - ck = f->nr ? 
f->objs[--f->nr] : NULL; - preempt_enable(); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_nonpcpu)) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - } - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_pcpu)) { - ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_pcpu--; - } - mutex_unlock(&bc->lock); - } - - if (ck) { - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); - if (unlikely(ret)) { - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - path->l[0].b = (void *) ck; - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); - - ret = bch2_btree_node_lock_write(trans, path, &ck->c); - if (unlikely(ret)) { - btree_node_unlock(trans, path, 0); - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - return ck; - } + struct bkey_cached *ck = container_of_or_null( + rcu_pending_dequeue(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; ck = allocate_dropping_locks(trans, ret, - kmem_cache_zalloc(bch2_key_cache, _gfp)); + __bkey_cached_alloc(key_u64s, _gfp)); if (ret) { + if (ck) + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); return ERR_PTR(ret); } - if (!ck) - return NULL; - - INIT_LIST_HEAD(&ck->list); - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); - - ck->c.cached = true; - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; + if (ck) { + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); + ck->c.cached = true; + goto lock; + } + + ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; +lock: + six_lock_intent(&ck->c.lock, NULL, NULL); + six_lock_write(&ck->c.lock, NULL, NULL); return ck; } @@ -300,312 +192,230 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; - mutex_lock(&c->lock); rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); - goto out; + if (bkey_cached_evict(c, ck)) + goto out; + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } } ck = NULL; out: rcu_read_unlock(); - mutex_unlock(&c->lock); return ck; } -static struct bkey_cached * -btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) +static int btree_key_cache_create(struct btree_trans *trans, + struct btree_path *path, + struct btree_path *ck_path, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck; - bool was_new = false; - ck = bkey_cached_alloc(trans, path, &was_new); - if (IS_ERR(ck)) - return ck; + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + unsigned key_u64s = k.k->u64s + 1; + + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + key_u64s = min(256U, (key_u64s * 3) / 2); + key_u64s = roundup_pow_of_two(key_u64s); + + struct 
bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); + int ret = PTR_ERR_OR_ZERO(ck); + if (ret) + return ret; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(path->btree_id)); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + bch2_btree_id_str(ck_path->btree_id)); + return -BCH_ERR_ENOMEM_btree_key_cache_create; } - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; - ck->c.btree_id = path->btree_id; - ck->key.btree_id = path->btree_id; - ck->key.pos = path->pos; - ck->valid = false; + ck->c.btree_id = ck_path->btree_id; + ck->key.btree_id = ck_path->btree_id; + ck->key.pos = ck_path->pos; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (unlikely(rhashtable_lookup_insert_fast(&bc->table, - &ck->hash, - bch2_btree_key_cache_params))) { - /* We raced with another fill: */ - - if (likely(was_new)) { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - kfree(ck); - } else { - bkey_cached_free_fast(bc, ck); + if (unlikely(key_u64s > ck->u64s)) { + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); + + struct bkey_i *new_k = allocate_dropping_locks(trans, ret, + kmalloc(key_u64s * sizeof(u64), _gfp)); + if (unlikely(!new_k)) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), key_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + } else if (ret) { + kfree(new_k); + goto err; } - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - return NULL; + kfree(ck->k); + ck->k = new_k; + ck->u64s = key_u64s; } - atomic_long_inc(&bc->nr_keys); + bkey_reassemble(ck->k, k); + + ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); + if (unlikely(ret)) + goto err; + + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + + bch2_btree_node_unlock_write(trans, path, path_l(path)->b); + if (unlikely(ret)) /* raced with another fill? 
*/ + goto err; + + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - return ck; + enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); + if (lock_want == SIX_LOCK_read) + six_lock_downgrade(&ck->c.lock); + btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); + ck_path->uptodate = BTREE_ITER_UPTODATE; + return 0; +err: + bkey_cached_free(trans, bc, ck); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); + + return ret; +} + +static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); } -static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_cached *ck) +static noinline int btree_key_cache_fill(struct btree_trans *trans, + btree_path_idx_t ck_path_idx, + unsigned flags) { + struct btree_path *ck_path = trans->paths + ck_path_idx; + + if (flags & BTREE_ITER_cached_nofill) { + ck_path->l[0].b = NULL; + return 0; + } + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - unsigned new_u64s = 0; - struct bkey_i *new_k = NULL; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); - iter.flags &= ~BTREE_ITER_WITH_JOURNAL; - k = bch2_btree_iter_peek_slot(&iter); + bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, + BTREE_ITER_intent| + BTREE_ITER_key_cache_fill| + BTREE_ITER_cached_nofill); + iter.flags &= ~BTREE_ITER_with_journal; + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - new_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - new_u64s = min(256U, (new_u64s * 3) / 2); - - if (new_u64s > ck->u64s) { - new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (!new_k) { - bch2_trans_unlock(trans); - - new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); - if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(ck->key.btree_id), new_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; - goto err; - } - - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_k); - goto err; - } - } - } + /* Recheck after btree lookup, before allocating: */ + ck_path = trans->paths + ck_path_idx; + ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? 
-EEXIST : 0; + if (unlikely(ret)) + goto out; - ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); - if (ret) { - kfree(new_k); + ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); + if (ret) goto err; - } - - if (new_k) { - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - } - - bkey_reassemble(ck->k, k); - ck->valid = true; - bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + if (trace_key_cache_fill_enabled()) + do_trace_key_cache_fill(trans, ck_path, k); +out: /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); err: bch2_trans_iter_exit(trans, &iter); return ret; } -static noinline int -bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, + btree_path_idx_t path_idx) { struct bch_fs *c = trans->c; struct bkey_cached *ck; - int ret = 0; - - BUG_ON(path->level); - - path->l[1].b = NULL; - - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; - } + struct btree_path *path = trans->paths + path_idx; retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - ck = btree_key_cache_create(trans, path); - ret = PTR_ERR_OR_ZERO(ck); - if (ret) - goto err; - if (!ck) - goto retry; - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); - path->locks_want = 1; - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - BUG_ON(ret); - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } - - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); - } - - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; -fill: - path->uptodate = BTREE_ITER_UPTODATE; - - if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { - /* - * Using the underscore version because we haven't set - * path->uptodate yet: - */ - if (!path->locks_want && - !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { - trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); - goto err; - } + if (!ck) + return -ENOENT; - ret = btree_key_cache_fill(trans, path, ck); - if (ret) - goto err; + enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = bch2_btree_path_relock(trans, path, _THIS_IP_); - if (ret) - goto err; + int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); + if (ret) + return ret; - path->uptodate = BTREE_ITER_UPTODATE; + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - BUG_ON(path->uptodate); - - return ret; -err: - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } - return ret; + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) 
lock_want); + path->uptodate = BTREE_ITER_UPTODATE; + return 0; } -int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, +int bch2_btree_path_traverse_cached(struct btree_trans *trans, + btree_path_idx_t path_idx, unsigned flags) { - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - int ret = 0; - - EBUG_ON(path->level); + EBUG_ON(trans->paths[path_idx].level); - path->l[1].b = NULL; - - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; - } -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + int ret; + do { + ret = btree_path_traverse_cached_fast(trans, path_idx); + if (unlikely(ret == -ENOENT)) + ret = btree_key_cache_fill(trans, path_idx, flags); + } while (ret == -EEXIST); - if (ret) - return ret; + struct btree_path *path = trans->paths + path_idx; - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; + if (unlikely(ret)) { + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); } - - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); + } else { + BUG_ON(path->uptodate); + BUG_ON(!path->nodes_locked); } - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; -fill: - if (!ck->valid) - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - path->uptodate = BTREE_ITER_UPTODATE; - EBUG_ON(!ck->valid); - EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - return ret; } @@ -622,15 +432,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_slots| + BTREE_ITER_intent| + BTREE_ITER_all_snapshots); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + BTREE_ITER_cached| + BTREE_ITER_intent); + b_iter.flags &= ~BTREE_ITER_with_key_cache; - ret = bch2_btree_iter_traverse(&c_iter); + ret = bch2_btree_iter_traverse(trans, &c_iter); if (ret) goto out; @@ -644,8 +454,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; } - BUG_ON(!ck->valid); - if (journal_seq && ck->journal.seq != journal_seq) goto out; @@ -661,19 +469,26 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) + !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - ret = bch2_btree_iter_traverse(&b_iter) ?: - bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_KEY_CACHE_RECLAIM| - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - BTREE_TRIGGER_NORUN) ?: + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); + ret = bkey_err(btree_k); + if (ret) + goto 
err; + + /* * Check that we're not violating cache coherency rules: */ + BUG_ON(bkey_deleted(btree_k.k)); + + ret = bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_key_cache_reclaim| + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| commit_flags); - +err: bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && @@ -708,8 +523,12 @@ evict: } mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - bkey_cached_evict(&c->btree_key_cache, ck); - bkey_cached_free_fast(&c->btree_key_cache, ck); + if (bkey_cached_evict(&c->btree_key_cache, ck)) { + bkey_cached_free(trans, &c->btree_key_cache, ck); + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + } } out: bch2_trans_iter_exit(trans, &b_iter); @@ -767,7 +586,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); bkey_copy(ck->k, insert); - ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); @@ -790,7 +608,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, * flushing. The flush callback will not proceed unless ->seq matches * the latest pin, so make sure it starts with a consistent value. */ - if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || !journal_pin_active(&ck->journal)) { ck->seq = trans->journal_res.seq; } @@ -806,10 +624,9 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = (void *) path->l[0].b; - BUG_ON(!ck->valid); - /* * We just did an update to the btree, bypassing the key cache: the key * cache key is now stale and must be dropped, even if dirty: @@ -820,7 +637,29 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bch2_journal_pin_drop(&c->journal, &ck->journal); } - ck->valid = false; + bkey_cached_evict(bc, ck); + bkey_cached_free(trans, bc, ck); + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + + struct btree_path *path2; + unsigned i; + trans_for_each_path(trans, path2, i) + if (path2->l[0].b == (void *) ck) { + /* + * It's safe to clear should_be_locked here because + * we're evicting from the key cache, and we still have + * the underlying btree locked: filling into the key + * cache would require taking a write lock on the btree + * node + */ + path2->should_be_locked = false; + __bch2_btree_path_unlock(trans, path2); + path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); + btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); + } + + bch2_trans_verify_locks(trans); } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, @@ -829,97 +668,75 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct bch_fs *c = shrink->private_data; struct btree_key_cache *bc = &c->btree_key_cache; struct bucket_table *tbl; - struct bkey_cached *ck, *t; + struct bkey_cached *ck; size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned start, flags; + unsigned iter, start; int srcu_idx; - mutex_lock(&bc->lock); srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - flags = memalloc_nofs_save(); + rcu_read_lock(); + + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); /* - * Newest freed entries 
are at the end of the list - once we hit one - * that's too new to be freed, we can bail out: + * Scanning is expensive while a rehash is in progress - most elements + * will be on the new hashtable, if it's in progress + * + * A rehash could still start while we're scanning - that's ok, we'll + * still see most elements. */ - scanned += bc->nr_freed_nonpcpu; - - list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - freed++; - bc->nr_freed_nonpcpu--; - } - - if (scanned >= nr) - goto out; - - scanned += bc->nr_freed_pcpu; - - list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - freed++; - bc->nr_freed_pcpu--; + if (unlikely(tbl->nest)) { + rcu_read_unlock(); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + return SHRINK_STOP; } - if (scanned >= nr) - goto out; - - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - start = bc->shrink_iter; + iter = bc->shrink_iter; + if (iter >= tbl->size) + iter = 0; + start = iter; do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + pos = rht_ptr_rcu(&tbl->buckets[iter]); while (!rht_is_a_nulls(pos)) { - next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + next = rht_dereference_bucket_rcu(pos->next, tbl, iter); ck = container_of(pos, struct bkey_cached, hash); - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) - goto next; - - if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + bc->skipped_dirty++; + } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - else if (bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); + bc->skipped_accessed++; + } else if (!bkey_cached_lock_for_evict(ck)) { + bc->skipped_lock_fail++; + } else if (bkey_cached_evict(bc, ck)) { + bkey_cached_free_noassert(bc, ck); + bc->freed++; + freed++; + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } scanned++; if (scanned >= nr) - break; -next: + goto out; + pos = next; } - bc->shrink_iter++; - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - } while (scanned < nr && bc->shrink_iter != start); + iter++; + if (iter >= tbl->size) + iter = 0; + } while (scanned < nr && iter != start); +out: + bc->shrink_iter = iter; rcu_read_unlock(); -out: - memalloc_nofs_restore(flags); srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - mutex_unlock(&bc->lock); return freed; } @@ -932,6 +749,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, long nr = atomic_long_read(&bc->nr_keys) - atomic_long_read(&bc->nr_dirty); + /* + * Avoid hammering our shrinker too much if it's nearly empty - the + * shrinker code doesn't take into account how big our cache is, if it's + * mostly empty but the system is under memory pressure it causes nasty + * lock contention: + */ + nr -= 128; + return max(0L, nr); } @@ -939,60 +764,36 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = 
container_of(bc, struct bch_fs, btree_key_cache); struct bucket_table *tbl; - struct bkey_cached *ck, *n; + struct bkey_cached *ck; struct rhash_head *pos; LIST_HEAD(items); unsigned i; -#ifdef __KERNEL__ - int cpu; -#endif shrinker_free(bc->shrink); - mutex_lock(&bc->lock); - /* * The loop is needed to guard against racing with rehash: */ while (atomic_long_read(&bc->nr_keys)) { rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) + if (tbl) { + if (tbl->nest) { + /* wait for in progress rehash */ + rcu_read_unlock(); + mutex_lock(&bc->table.mutex); + mutex_unlock(&bc->table.mutex); + continue; + } for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &items); + while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { + ck = container_of(pos, struct bkey_cached, hash); + BUG_ON(!bkey_cached_evict(bc, ck)); + kfree(ck->k); + kmem_cache_free(bch2_key_cache, ck); } - rcu_read_unlock(); - } - -#ifdef __KERNEL__ - for_each_possible_cpu(cpu) { - struct btree_key_cache_freelist *f = - per_cpu_ptr(bc->pcpu_freed, cpu); - - for (i = 0; i < f->nr; i++) { - ck = f->objs[i]; - list_add(&ck->list, &items); } - } -#endif - - BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); - BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); - - list_splice(&bc->freed_pcpu, &items); - list_splice(&bc->freed_nonpcpu, &items); - - mutex_unlock(&bc->lock); - - list_for_each_entry_safe(ck, n, &items, list) { - cond_resched(); - - list_del(&ck->list); - kfree(ck->k); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); + rcu_read_unlock(); } if (atomic_long_read(&bc->nr_dirty) && @@ -1008,14 +809,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (bc->table_init_done) rhashtable_destroy(&bc->table); - free_percpu(bc->pcpu_freed); + rcu_pending_exit(&bc->pending[0]); + rcu_pending_exit(&bc->pending[1]); + + free_percpu(bc->nr_pending); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { - mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed_pcpu); - INIT_LIST_HEAD(&c->freed_nonpcpu); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) @@ -1023,11 +824,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct shrinker *shrink; -#ifdef __KERNEL__ - bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); - if (!bc->pcpu_freed) + bc->nr_pending = alloc_percpu(size_t); + if (!bc->nr_pending) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + + if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || + rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; -#endif if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; @@ -1038,22 +841,32 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) if (!shrink) return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->shrink = shrink; - shrink->seeks = 0; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->batch = 1 << 14; + shrink->seeks = 0; shrink->private_data = c; shrinker_register(shrink); return 0; } -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) { - 
prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); - prt_newline(out); - prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); + printbuf_tabstop_push(out, 24); + printbuf_tabstop_push(out, 12); + + prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); + prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); + prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); prt_newline(out); - prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); + prt_printf(out, "shrinker:\n"); + prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); + prt_printf(out, "freed:\t%lu\r\n", bc->freed); + prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); + prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); + prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); prt_newline(out); + prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); } void bch2_btree_key_cache_exit(void) |
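The core structural change in this patch is the split of bch2_btree_path_traverse_cached() into a fast path (btree_path_traverse_cached_fast(), which returns -ENOENT when the key is not yet in the hash table) and a fill path (btree_key_cache_fill() / btree_key_cache_create(), which reports -EEXIST when the rhashtable insert loses a race with another filler), with the caller looping until neither error comes back. Below is a minimal stand-alone sketch of that lookup-or-fill retry loop, not bcachefs code: toy_cache, toy_lookup_fast and toy_fill are made-up names used purely for illustration.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_cache {
	bool	present;	/* entry already hashed into the cache? */
	int	value;
};

/* Fast path: only succeeds if the entry is already cached. */
static int toy_lookup_fast(struct toy_cache *c, int *value_out)
{
	if (!c->present)
		return -ENOENT;
	*value_out = c->value;
	return 0;
}

/*
 * Slow path: create the entry from the backing "btree" value. Returns
 * -EEXIST if another filler got there first, in which case the caller
 * retries the fast path - mirroring how btree_key_cache_create() treats
 * a failed rhashtable_lookup_insert_fast() as a racing fill.
 */
static int toy_fill(struct toy_cache *c, int backing_value)
{
	if (c->present)
		return -EEXIST;
	c->value = backing_value;
	c->present = true;
	return 0;
}

static int toy_traverse_cached(struct toy_cache *c, int backing_value, int *value_out)
{
	int ret;

	do {
		ret = toy_lookup_fast(c, value_out);
		if (ret == -ENOENT)
			ret = toy_fill(c, backing_value);
		/* -EEXIST: raced with another fill, go around again. */
	} while (ret == -EEXIST);

	if (ret)
		return ret;

	/* After a successful fill, fetch the value we just inserted. */
	return toy_lookup_fast(c, value_out);
}

int main(void)
{
	struct toy_cache c = { .present = false };
	int v;

	if (!toy_traverse_cached(&c, 42, &v))
		printf("cached value: %d\n", v);
	return 0;
}

Resolving the race at the hashtable insert, rather than serializing fillers up front, keeps the common lookup path free of extra locking; the sketch above only models the control flow, not the locking or SRCU-deferred freeing that the real code layers on top.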