Diffstat (limited to 'fs/bcachefs/btree_cache.c')
-rw-r--r--  fs/bcachefs/btree_cache.c | 712
1 file changed, 500 insertions, 212 deletions
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 562561a9a510..91e0aa796e6b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -15,9 +15,13 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#include <linux/swap.h> const char * const bch2_btree_node_flags[] = { -#define x(f) #f, + "typebit", + "typebit", + "typebit", +#define x(f) [BTREE_NODE_##f] = #f, BTREE_FLAGS() #undef x NULL @@ -25,38 +29,77 @@ const char * const bch2_btree_node_flags[] = { void bch2_recalc_btree_reserve(struct bch_fs *c) { - unsigned i, reserve = 16; + unsigned reserve = 16; if (!c->btree_roots_known[0].b) reserve += 8; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) reserve += min_t(unsigned, 1, r->b->c.level) * 8; } - c->btree_cache.reserve = reserve; + c->btree_cache.nr_reserve = reserve; } -static inline unsigned btree_cache_can_free(struct btree_cache *bc) +static inline size_t btree_cache_can_free(struct btree_cache_list *list) { - return max_t(int, 0, bc->used - bc->reserve); + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + size_t can_free = list->nr; + if (!list->idx) + can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); + return can_free; } static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); + if (b->c.lock.readers) - list_move(&b->list, &bc->freed_pcpu); + list_add(&b->list, &bc->freed_pcpu); else - list_move(&b->list, &bc->freed_nonpcpu); + list_add(&b->list, &bc->freed_nonpcpu); +} + +static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); + BUG_ON(!b->data); + + bc->nr_freeable++; + list_add(&b->list, &bc->freeable); } -static void btree_node_data_free(struct bch_fs *c, struct btree *b) +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; + mutex_lock(&bc->lock); + __bch2_btree_node_to_freelist(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + +static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); + BUG_ON(btree_node_hashed(b)); + + /* + * This should really be done in slub/vmalloc, but we're using the + * kmalloc_large() path, so we're working around a slub bug by doing + * this here: + */ + if (b->data) + mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE); + if (b->aux_data) + mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE); + EBUG_ON(btree_node_write_in_flight(b)); clear_btree_node_just_written(b); @@ -70,11 +113,17 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) #endif b->aux_data = NULL; - bc->used--; - btree_node_to_freedlist(bc, b); } +static void btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(list_empty(&b->list)); + list_del_init(&b->list); + --bc->nr_freeable; + __btree_node_data_free(bc, b); +} + static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -85,19 +134,22 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, + .head_offset = 
offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, + .automatic_shrinking = true, }; static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); + gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); #ifdef __KERNEL__ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else @@ -110,7 +162,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->aux_data) { kvfree(b->data); b->data = NULL; - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); } return 0; @@ -145,51 +197,144 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } - bch2_btree_lock_init(&b->c, 0); + bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - bc->used++; - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); return b; } +static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) +{ + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = bc->pinned_nodes_mask[!!b->c.level]; + + return ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); +} + +void bch2_node_pin(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + mutex_lock(&bc->lock); + if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { + set_btree_node_pinned(b); + list_move(&b->list, &bc->live[1].list); + bc->live[0].nr--; + bc->live[1].nr++; + } + mutex_unlock(&bc->lock); +} + +void bch2_btree_cache_unpin(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *n; + + mutex_lock(&bc->lock); + c->btree_cache.pinned_nodes_mask[0] = 0; + c->btree_cache.pinned_nodes_mask[1] = 0; + + list_for_each_entry_safe(b, n, &bc->live[1].list, list) { + clear_btree_node_pinned(b); + list_move(&b->list, &bc->live[0].list); + bc->live[0].nr++; + bc->live[1].nr--; + } + + mutex_unlock(&bc->lock); +} + /* Btree in memory cache - hash table */ -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { - int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + lockdep_assert_held(&bc->lock); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); BUG_ON(ret); /* Cause future lookups for this node to fail: */ b->hash_val = 0; + + if (b->c.btree_id < BTREE_ID_NR) + --bc->nr_by_btree[b->c.btree_id]; + --bc->live[btree_node_pinned(b)].nr; + list_del_init(&b->list); +} + +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + __bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_to_freelist(bc, b); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); + if (ret) + return ret; + + if (b->c.btree_id < BTREE_ID_NR) + bc->nr_by_btree[b->c.btree_id]++; - return rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); + bool p = __btree_node_pinned(bc, b); + mod_bit(BTREE_NODE_pinned, &b->flags, p); + 
+ list_add_tail(&b->list, &bc->live[p].list); + bc->live[p].nr++; + return 0; } int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, unsigned level, enum btree_id id) { - int ret; - b->c.level = level; b->c.btree_id = id; mutex_lock(&bc->lock); - ret = __bch2_btree_node_hash_insert(bc, b); - if (!ret) - list_add_tail(&b->list, &bc->live); + int ret = __bch2_btree_node_hash_insert(bc, b); mutex_unlock(&bc->lock); return ret; } +void bch2_btree_node_update_key_early(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) @@ -199,91 +344,108 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, + bool flush, bool locked) { struct btree_cache *bc = &c->btree_cache; - int ret = 0; lockdep_assert_held(&bc->lock); - struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + if (btree_node_noevict(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + if (btree_node_write_blocked(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + if (btree_node_will_make_reachable(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } - u64 mask = b->c.level - ? 
bc->pinned_nodes_interior_mask - : bc->pinned_nodes_leaf_mask; + if (btree_node_dirty(b)) { + if (!flush) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } - if ((mask & BIT_ULL(b->c.btree_id)) && - bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && - bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) - return -BCH_ERR_ENOMEM_btree_node_reclaim; + if (locked) { + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); + else + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); + } + } -wait_on_io: - if (b->flags & ((1U << BTREE_NODE_dirty)| - (1U << BTREE_NODE_read_in_flight)| + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) - return -BCH_ERR_ENOMEM_btree_node_reclaim; + if (!flush) { + if (btree_node_read_in_flight(b)) + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; + else if (btree_node_write_in_flight(b)) + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + + if (locked) + return -EINTR; /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return 0; +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; - if (!six_trylock_write(&b->c.lock)) - goto out_unlock_intent; + lockdep_assert_held(&bc->lock); +retry_unlocked: + ret = __btree_node_reclaim_checks(c, b, flush, false); + if (ret) + return ret; - /* recheck under lock */ - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) - goto out_unlock; - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - goto wait_on_io; + if (!six_trylock_intent(&b->c.lock)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } - if (btree_node_noevict(b) || - btree_node_write_blocked(b) || - btree_node_will_make_reachable(b)) - goto out_unlock; - - if (btree_node_dirty(b)) { - if (!flush) - goto out_unlock; - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); + if (!six_trylock_write(&b->c.lock)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; + six_unlock_intent(&b->c.lock); + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + /* recheck under lock */ + ret = __btree_node_reclaim_checks(c, b, flush, true); + if (ret) { six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - goto wait_on_io; + if (ret == -EINTR) + goto retry_unlocked; + return ret; } -out: + if (b->hash_val && !ret) trace_and_count(c, 
btree_cache_reap, c, b); - return ret; -out_unlock: - six_unlock_write(&b->c.lock); -out_unlock_intent: - six_unlock_intent(&b->c.lock); - ret = -BCH_ERR_ENOMEM_btree_node_reclaim; - goto out; + return 0; } static int btree_node_reclaim(struct bch_fs *c, struct btree *b) @@ -299,8 +461,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); struct btree *b, *t; unsigned long nr = sc->nr_to_scan; unsigned long can_free = 0; @@ -308,10 +471,9 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned i, flags; unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_read(&bc->dirty) + nr >= - bc->used * 3 / 4; + bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return SHRINK_STOP; mutex_lock(&bc->lock); @@ -324,8 +486,11 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress: */ - can_free = btree_cache_can_free(bc); - nr = min_t(unsigned long, nr, can_free); + can_free = btree_cache_can_free(list); + if (nr > can_free) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; + nr = can_free; + } i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { @@ -342,23 +507,28 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, goto out; if (!btree_node_reclaim(c, b)) { - btree_node_data_free(c, b); + btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; + bc->nr_freed++; } } restart: - list_for_each_entry_safe(b, t, &bc->live, list) { + list_for_each_entry_safe(b, t, &list->list, list) { touched++; if (btree_node_accessed(b)) { clear_btree_node_accessed(b); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; + --touched;; } else if (!btree_node_reclaim(c, b)) { + __bch2_btree_node_hash_remove(bc, b); + __btree_node_data_free(bc, b); + freed++; - btree_node_data_free(c, b); + bc->nr_freed++; - bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -369,7 +539,7 @@ restart: !btree_node_will_make_reachable(b) && !btree_node_write_blocked(b) && six_trylock_read(&b->c.lock)) { - list_move(&bc->live, &b->list); + list_move(&list->list, &b->list); mutex_unlock(&bc->lock); __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_read(&b->c.lock); @@ -383,8 +553,8 @@ restart: break; } out_rotate: - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); + if (&t->list != &list->list) + list_move_tail(&list->list, &t->list); out: mutex_unlock(&bc->lock); out_nounlock: @@ -397,57 +567,58 @@ out_nounlock: static unsigned long bch2_btree_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) 
return 0; - return btree_cache_can_free(bc); + return btree_cache_can_free(list); } void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b; - unsigned i, flags; + struct btree *b, *t; + unsigned long flags; - shrinker_free(bc->shrink); + shrinker_free(bc->live[1].shrink); + shrinker_free(bc->live[0].shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); mutex_lock(&bc->lock); if (c->verify_data) - list_move(&c->verify_data->list, &bc->live); + list_move(&c->verify_data->list, &bc->live[0].list); kvfree(c->verify_ondisk); - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) - list_add(&r->b->list, &bc->live); + list_add(&r->b->list, &bc->live[0].list); } - list_splice(&bc->freeable, &bc->live); - - while (!list_empty(&bc->live)) { - b = list_first_entry(&bc->live, struct btree, list); + list_for_each_entry_safe(b, t, &bc->live[1].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->live[0].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->freeable, list) { BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); + cond_resched(); } BUG_ON(!bch2_journal_error(&c->journal) && - atomic_read(&c->btree_cache.dirty)); + atomic_long_read(&c->btree_cache.nr_dirty)); list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); - while (!list_empty(&bc->freed_nonpcpu)) { - b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) { list_del(&b->list); six_lock_exit(&b->c.lock); kfree(b); @@ -456,6 +627,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) mutex_unlock(&bc->lock); memalloc_nofs_restore(flags); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) + BUG_ON(bc->nr_by_btree[i]); + BUG_ON(bc->live[0].nr); + BUG_ON(bc->live[1].nr); + BUG_ON(bc->nr_freeable); + if (bc->table_init_done) rhashtable_destroy(&bc->table); } @@ -475,33 +652,46 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); - for (i = 0; i < bc->reserve; i++) + for (i = 0; i < bc->nr_reserve; i++) if (!__bch2_btree_node_mem_alloc(c)) goto err; - list_splice_init(&bc->live, &bc->freeable); + list_splice_init(&bc->live[0].list, &bc->freeable); mutex_init(&c->verify_lock); shrink = shrinker_alloc(0, "%s-btree_cache", c->name); if (!shrink) goto err; - bc->shrink = shrink; + bc->live[0].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 4; - shrink->private_data = c; + shrink->seeks = 2; + shrink->private_data = &bc->live[0]; + shrinker_register(shrink); + + shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); + if (!shrink) + goto err; + bc->live[1].shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; + shrink->seeks = 8; + shrink->private_data = &bc->live[1]; shrinker_register(shrink); return 0; err: - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) { mutex_init(&bc->lock); - INIT_LIST_HEAD(&bc->live); + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { + bc->live[i].idx = i; + INIT_LIST_HEAD(&bc->live[i].list); + } INIT_LIST_HEAD(&bc->freeable); 
INIT_LIST_HEAD(&bc->freed_pcpu); INIT_LIST_HEAD(&bc->freed_nonpcpu); @@ -531,27 +721,27 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure struct btree_cache *bc = &c->btree_cache; struct task_struct *old; - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) goto success; if (!cl) { trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; + return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); } closure_wait(&bc->alloc_wait, cl); /* Try again, after adding ourselves to waitlist */ - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) { + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { /* We raced */ closure_wake_up(&bc->alloc_wait); goto success; } trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_btree_cache_cannibalize_lock_blocked; + return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); success: trace_and_count(c, btree_cache_cannibalize_lock, trans); @@ -563,14 +753,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b; - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_reclaim(c, b)) + return b; while (1) { - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; /* * Rare case: all nodes were intent-locked. @@ -590,9 +782,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea : &bc->freed_nonpcpu; struct btree *b, *b2; u64 start_time = local_clock(); - unsigned flags; - flags = memalloc_nofs_save(); mutex_lock(&bc->lock); /* @@ -606,21 +796,22 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea } b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (!b) { + if (b) { + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); + } else { mutex_unlock(&bc->lock); bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); mutex_lock(&bc->lock); } - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); -got_node: +got_node: /* * btree_free() doesn't free memory; it sticks the node on the end of * the list. 
Check if there's any freed nodes there: @@ -629,7 +820,12 @@ got_node: if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); + + list_del_init(&b2->list); + --bc->nr_freeable; btree_node_to_freedlist(bc, b2); + mutex_unlock(&bc->lock); + six_unlock_write(&b2->c.lock); six_unlock_intent(&b2->c.lock); goto got_mem; @@ -643,11 +839,8 @@ got_node: goto err; } - mutex_lock(&bc->lock); - bc->used++; got_mem: - mutex_unlock(&bc->lock); - + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); @@ -659,12 +852,16 @@ out: b->sib_u64s[1] = 0; b->whiteout_u64s = 0; bch2_btree_keys_init(b); - set_btree_node_accessed(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); - memalloc_nofs_restore(flags); + int ret = bch2_trans_relock(trans); + if (unlikely(ret)) { + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); + } + return b; err: mutex_lock(&bc->lock); @@ -673,7 +870,7 @@ err: if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); clear_btree_node_just_written(b2); - bch2_btree_node_hash_remove(bc, b2); + __bch2_btree_node_hash_remove(bc, b2); if (b) { swap(b->data, b2->data); @@ -683,9 +880,9 @@ err: six_unlock_intent(&b2->c.lock); } else { b = b2; - list_del_init(&b->list); } + BUG_ON(!list_empty(&b->list)); mutex_unlock(&bc->lock); trace_and_count(c, btree_cache_cannibalize, trans); @@ -693,7 +890,6 @@ err: } mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); } @@ -709,9 +905,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - u32 seq; - BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + if (unlikely(level >= BTREE_MAX_DEPTH)) { + int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", + level, BTREE_MAX_DEPTH); + return ERR_PTR(ret); + } + + if (unlikely(!bkey_is_btree_ptr(&k->k))) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + + if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + /* * Parent node must be locked, else we could read in a btree node that's * been freed: @@ -743,7 +961,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b->hash_val = 0; mutex_lock(&bc->lock); - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); mutex_unlock(&bc->lock); six_unlock_write(&b->c.lock); @@ -752,34 +970,30 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - seq = six_lock_seq(&b->c.lock); - six_unlock_intent(&b->c.lock); - /* Unlock before doing IO: */ - if (path && sync) - bch2_trans_unlock_noassert(trans); + if (path) { + u32 seq = six_lock_seq(&b->c.lock); - bch2_btree_node_read(trans, b, sync); + /* Unlock before doing IO: */ + six_unlock_intent(&b->c.lock); + bch2_trans_unlock(trans); - if (!sync) - return NULL; + bch2_btree_node_read(trans, b, sync); - if 
(path) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); + int ret = bch2_trans_relock(trans); + if (ret) return ERR_PTR(ret); - } - } - if (!six_relock_type(&b->c.lock, lock_type, seq)) { - BUG_ON(!path); + if (!sync) + return NULL; - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + if (!six_relock_type(&b->c.lock, lock_type, seq)) + b = NULL; + } else { + bch2_btree_node_read(trans, b, sync); + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); } return b; @@ -789,26 +1003,25 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, - "btree node header doesn't match ptr\n" - "btree %s level %u\n" - "ptr: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + "btree node header doesn't match ptr: "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, "\nptr: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_printf(&buf, "\nheader: btree %s level %llu\n" - "min ", - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nheader: "); + bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nmin "); bch2_bpos_to_text(&buf, b->data->min_key); prt_printf(&buf, "\nmax "); bch2_bpos_to_text(&buf, b->data->max_key); - bch2_fs_inconsistent(c, "%s", buf.buf); + bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); } @@ -831,7 +1044,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; bool need_relock = false; int ret; @@ -889,6 +1101,10 @@ retry: bch2_btree_node_wait_on_read(b); + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + /* * should_be_locked is not set on this path yet, so we need to * relock it specifically: @@ -918,7 +1134,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -951,7 +1167,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * { struct bch_fs *c = trans->c; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1009,7 +1224,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1028,7 +1243,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1071,6 +1285,10 @@ lock_node: six_unlock_read(&b->c.lock); goto retry; } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } /* XXX: waiting on IO with btree locks 
held: */ @@ -1086,13 +1304,9 @@ lock_node: prefetch(p + L1_CACHE_BYTES * 2); } - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_error); + b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } @@ -1111,18 +1325,22 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; - struct btree *b; BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_cache_find(bc, k); + struct btree *b = btree_cache_find(bc, k); if (b) return 0; b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - return PTR_ERR_OR_ZERO(b); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + if (b) + six_unlock_read(&b->c.lock); + return 0; } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) @@ -1134,6 +1352,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; + + BUG_ON(b == btree_node_root(trans->c, b)); wait_on_io: /* not allowed to wait on io with btree locks held: */ @@ -1145,6 +1365,8 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + if (unlikely(b->hash_val != btree_ptr_hash_val(k))) + goto out; if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); @@ -1156,10 +1378,10 @@ wait_on_io: BUG_ON(btree_node_dirty(b)); mutex_lock(&bc->lock); - btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); + btree_node_data_free(bc, b); mutex_unlock(&bc->lock); - +out: six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -1169,13 +1391,39 @@ const char *bch2_btree_id_str(enum btree_id btree) return btree < BTREE_ID_NR ? 
__bch2_btree_ids[btree] : "(unknown)"; } +void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) +{ + if (btree < BTREE_ID_NR) + prt_str(out, __bch2_btree_ids[btree]); + else + prt_printf(out, "(unknown btree %u)", btree); +} + +void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) +{ + prt_str(out, "btree="); + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level=%u", level); +} + +void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + enum btree_id btree, unsigned level, struct bkey_s_c k) +{ + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level %u/", level); + struct btree_root *r = bch2_btree_id_root(c, btree); + if (r) + prt_printf(out, "%u", r->level); + else + prt_printf(out, "(unknown)"); + prt_newline(out); + + bch2_bkey_val_to_text(out, c, k); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - prt_printf(out, "%s level %u/%u\n ", - bch2_btree_id_str(b->c.btree_id), - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) @@ -1220,9 +1468,49 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) +static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, + const char *label, size_t nr) { - prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); - prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + prt_printf(out, "%s\t", label); + prt_human_readable_u64(out, nr * c->opts.btree_node_size); + prt_printf(out, " (%zu)\n", nr); +} + +static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { +#define x(n) #n, + BCH_BTREE_CACHE_NOT_FREED_REASONS() +#undef x + NULL +}; + +void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_btree_cache_line(out, c, "live:", bc->live[0].nr); + prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); + prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); + prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); + prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); + prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held"); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { + bch2_btree_id_to_text(out, i); + prt_printf(out, "\t"); + prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); + prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); + } + + prt_newline(out); + prt_printf(out, "counters since mount:\n"); + prt_printf(out, "freed:\t%zu\n", bc->nr_freed); + prt_printf(out, "not freed:\n"); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) + prt_printf(out, " %s\t%llu\n", + bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); } |
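
The shrinker wiring in this diff registers one shrinker per bc->live[] list and stores a pointer to the list, not to the filesystem, in private_data; the callbacks then recover the enclosing cache with container_of(list, struct btree_cache, live[list->idx]). Below is a minimal standalone userspace sketch of that pattern and of the reserve accounting in btree_cache_can_free(); all names are illustrative, not bcachefs code.

/*
 * Standalone illustration (not bcachefs code) of the pattern above:
 * the per-list callback recovers the enclosing cache with container_of()
 * via the idx stored in the list element itself.
 */
#include <stddef.h>
#include <stdio.h>

/* same shape as the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cache_list {
	unsigned idx;	/* 0 = ordinary LRU, 1 = pinned nodes */
	size_t nr;	/* nodes currently on this list */
};

struct cache {
	size_t nr_reserve;	/* floor that list 0 never shrinks below */
	struct cache_list live[2];
};

/* mirrors btree_cache_can_free(): only list 0 holds back the reserve */
static size_t can_free(struct cache_list *list)
{
	/*
	 * A variable array index inside offsetof() is a GCC/clang
	 * extension; btree_cache_can_free() relies on the same thing.
	 */
	struct cache *c = container_of(list, struct cache, live[list->idx]);

	if (list->idx)
		return list->nr;
	return list->nr > c->nr_reserve ? list->nr - c->nr_reserve : 0;
}

int main(void)
{
	struct cache c = {
		.nr_reserve = 16,
		.live = { { .idx = 0, .nr = 40 }, { .idx = 1, .nr = 5 } },
	};

	printf("list 0 can free %zu nodes\n", can_free(&c.live[0]));	/* 24 */
	printf("list 1 can free %zu nodes\n", can_free(&c.live[1]));	/* 5 */
	return 0;
}

Storing idx in the list element is what lets a single count_objects/scan_objects pair serve both the ordinary and the pinned list, while only the former holds back nr_reserve nodes from the shrinker.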