Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r-- | drivers/md/bcache/btree.c | 447
1 file changed, 258 insertions, 189 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 39de94edd73a..3ed39c823826 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -36,6 +36,7 @@
 #include <linux/sched/clock.h>
 #include <linux/rculist.h>
 #include <linux/delay.h>
+#include <linux/sort.h>
 #include <trace/events/bcache.h>
 
 /*
@@ -88,10 +89,9 @@
  * Test module load/unload
  */
 
-#define MAX_NEED_GC		64
-#define MAX_SAVE_PRIO		72
-#define MAX_GC_TIMES		100
-#define MIN_GC_NODES		100
+#define MAX_GC_TIMES_SHIFT	7	/* 128 loops */
+#define GC_NODES_MIN		10
+#define GC_SLEEP_MS_MIN		10
 #define GC_SLEEP_MS		100
 
 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
@@ -99,12 +99,14 @@
 #define PTR_HASH(c, k) \
 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 
+static struct workqueue_struct *btree_io_wq;
+
 #define insert_lock(s, b)	((b)->level <= (s)->lock)
 
 static inline struct bset *write_block(struct btree *b)
 {
-	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache);
 }
 
 static void bch_btree_init_next(struct btree *b)
@@ -117,7 +119,7 @@ static void bch_btree_init_next(struct btree *b)
 
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));
 
 }
 
@@ -139,7 +141,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 	uint64_t crc = b->key.ptr[0];
 	void *data = (void *) i + 8, *end = bset_bkey_last(i);
 
-	crc = bch_crc64_update(crc, data, end - data);
+	crc = crc64_be(crc, data, end - data);
 	return crc ^ 0xffffffffffffffffULL;
 }
 
@@ -155,7 +157,7 @@ void bch_btree_node_read_done(struct btree *b)
 	 * See the comment arount cache_set->fill_iter.
 	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
-	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
+	iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
 	iter->used = 0;
 
 #ifdef CONFIG_BCACHE_DEBUG
@@ -173,12 +175,12 @@ void bch_btree_node_read_done(struct btree *b)
 			goto err;
 
 		err = "bad btree header";
-		if (b->written + set_blocks(i, block_bytes(b->c)) >
+		if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
 		    btree_blocks(b))
 			goto err;
 
 		err = "bad magic";
-		if (i->magic != bset_magic(&b->c->sb))
+		if (i->magic != bset_magic(&b->c->cache->sb))
 			goto err;
 
 		err = "bad checksum";
@@ -199,13 +201,13 @@ void bch_btree_node_read_done(struct btree *b)
 
 		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
 
-		b->written += set_blocks(i, block_bytes(b->c));
+		b->written += set_blocks(i, block_bytes(b->c->cache));
 	}
 
 	err = "corrupted btree";
 	for (i = write_block(b);
 	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
-	     i = ((void *) i) + block_bytes(b->c))
+	     i = ((void *) i) + block_bytes(b->c->cache))
 		if (i->seq == b->keys.set[0].data->seq)
 			goto err;
 
@@ -219,7 +221,7 @@ void bch_btree_node_read_done(struct btree *b)
 
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));
 out:
 	mempool_free(iter, &b->c->fill_iter);
 	return;
@@ -291,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
 	w->journal = NULL;
 }
 
-static void btree_node_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_unlock)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);
 
 	up(&b->io_mutex);
 }
 
-static void __btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(__btree_node_write_done)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);
 	struct btree_write *w = btree_prev_write(b);
 
 	bch_bbio_free(b->bio, b->c);
@@ -308,17 +310,17 @@ static void __btree_node_write_done(struct closure *cl)
 	btree_complete_write(b, w);
 
 	if (btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }
 
-static void btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_done)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);
 
 	bio_free_pages(b->bio);
-	__btree_node_write_done(cl);
+	__btree_node_write_done(&cl->work);
 }
 
 static void btree_node_write_endio(struct bio *bio)
@@ -347,7 +349,7 @@ static void do_btree_node_write(struct btree *b)
 	b->bio->bi_end_io = btree_node_write_endio;
 	b->bio->bi_private = cl;
-	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
+	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache));
 	b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA;
 	bch_bio_map(b->bio, i);
 
@@ -370,7 +372,7 @@ static void do_btree_node_write(struct btree *b)
 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
 		       bset_sector_offset(&b->keys, i));
 
-	if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+	if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) {
 		struct bio_vec *bv;
 		void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
 		struct bvec_iter_all iter_all;
@@ -423,10 +425,10 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
 
 	do_btree_node_write(b);
 
-	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
-			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
+	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
+			&b->c->cache->btree_sectors_written);
 
-	b->written += set_blocks(i, block_bytes(b->c));
+	b->written += set_blocks(i, block_bytes(b->c->cache));
 }
 
 void bch_btree_node_write(struct btree *b, struct closure *parent)
@@ -481,7 +483,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 	BUG_ON(!i->keys);
 
 	if (!btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
 	set_btree_node_dirty(b);
 
@@ -514,7 +516,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
  * mca -> memory cache
  */
 
-#define mca_reserve(c)	(((c->root && c->root->level) \
+#define mca_reserve(c)	(((!IS_ERR_OR_NULL(c->root) && c->root->level) \
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c) \
 	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
@@ -557,6 +559,25 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 	}
 }
 
+#ifdef CONFIG_PROVE_LOCKING
+static int btree_lock_cmp_fn(const struct lockdep_map *_a,
+			     const struct lockdep_map *_b)
+{
+	const struct btree *a = container_of(_a, struct btree, lock.dep_map);
+	const struct btree *b = container_of(_b, struct btree, lock.dep_map);
+
+	return -cmp_int(a->level, b->level) ?: bkey_cmp(&a->key, &b->key);
+}
+
+static void btree_lock_print_fn(const struct lockdep_map *map)
+{
+	const struct btree *b = container_of(map, struct btree, lock.dep_map);
+
+	printk(KERN_CONT " l=%u %llu:%llu", b->level,
+	       KEY_INODE(&b->key), KEY_OFFSET(&b->key));
+}
+#endif
+
 static struct btree *mca_bucket_alloc(struct cache_set *c,
 				      struct bkey *k, gfp_t gfp)
 {
@@ -570,7 +591,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 		return NULL;
 
 	init_rwsem(&b->lock);
-	lockdep_set_novalidate_class(&b->lock);
+	lock_set_cmp_fn(&b->lock, btree_lock_cmp_fn, btree_lock_print_fn);
 	mutex_init(&b->write_lock);
 	lockdep_set_novalidate_class(&b->write_lock);
 	INIT_LIST_HEAD(&b->list);
@@ -644,7 +665,7 @@ out_unlock:
 static unsigned long bch_mca_scan(struct shrinker *shrink,
 				  struct shrink_control *sc)
 {
-	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+	struct cache_set *c = shrink->private_data;
 	struct btree *b, *t;
 	unsigned long i, nr = sc->nr_to_scan;
 	unsigned long freed = 0;
@@ -711,7 +732,7 @@ out:
 static unsigned long bch_mca_count(struct shrinker *shrink,
 				   struct shrink_control *sc)
 {
-	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+	struct cache_set *c = shrink->private_data;
 
 	if (c->shrinker_disabled)
 		return 0;
@@ -729,8 +750,8 @@ void bch_btree_cache_free(struct cache_set *c)
 
 	closure_init_stack(&cl);
 
-	if (c->shrink.list.next)
-		unregister_shrinker(&c->shrink);
+	if (c->shrink)
+		shrinker_free(c->shrink);
 
 	mutex_lock(&c->bucket_lock);
 
@@ -738,7 +759,7 @@ void bch_btree_cache_free(struct cache_set *c)
 	if (c->verify_data)
 		list_move(&c->verify_data->list, &c->btree_cache);
 
-	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
+	free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
 #endif
 
 	list_splice(&c->btree_cache_freeable,
@@ -785,7 +806,16 @@ int bch_btree_cache_alloc(struct cache_set *c)
 	mutex_init(&c->verify_lock);
 
 	c->verify_ondisk = (void *)
-		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+		__get_free_pages(GFP_KERNEL|__GFP_COMP,
+				 ilog2(meta_bucket_pages(&c->cache->sb)));
+	if (!c->verify_ondisk) {
+		/*
+		 * Don't worry about the mca_rereserve buckets
+		 * allocated in previous for-loop, they will be
+		 * handled properly in bch_cache_set_unregister().
+		 */
+		return -ENOMEM;
+	}
 
 	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
 
@@ -796,14 +826,19 @@ int bch_btree_cache_alloc(struct cache_set *c)
 		c->verify_data = NULL;
 #endif
 
-	c->shrink.count_objects = bch_mca_count;
-	c->shrink.scan_objects = bch_mca_scan;
-	c->shrink.seeks = 4;
-	c->shrink.batch = c->btree_pages * 2;
+	c->shrink = shrinker_alloc(0, "md-bcache:%pU", c->set_uuid);
+	if (!c->shrink) {
+		pr_warn("bcache: %s: could not allocate shrinker\n", __func__);
+		return 0;
+	}
+
+	c->shrink->count_objects = bch_mca_count;
+	c->shrink->scan_objects = bch_mca_scan;
+	c->shrink->seeks = 4;
+	c->shrink->batch = c->btree_pages * 2;
+	c->shrink->private_data = c;
 
-	if (register_shrinker(&c->shrink))
-		pr_warn("bcache: %s: could not register shrinker\n",
-			__func__);
+	shrinker_register(c->shrink);
 
 	return 0;
 }
@@ -874,7 +909,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
-static void bch_cannibalize_unlock(struct cache_set *c)
+void bch_cannibalize_unlock(struct cache_set *c)
 {
	spin_lock(&c->btree_cannibalize_lock);
	if (c->btree_cache_alloc_lock == current) {
@@ -959,10 +994,13 @@ err:
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
- * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
+ *
+ * Note: Only error code or btree pointer will be returned, it is unncessary
+ * for callers to check NULL pointer.
 */
 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write,
@@ -1074,16 +1112,22 @@ retry:
	mutex_unlock(&b->c->bucket_lock);
 }
 
+/*
+ * Only error code or btree pointer will be returned, it is unncessary for
+ * callers to check NULL pointer.
+ */
 struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
				     int level, bool wait,
				     struct btree *parent)
 {
	BKEY_PADDED(key) k;
-	struct btree *b = ERR_PTR(-EAGAIN);
+	struct btree *b;
 
	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+	/* return ERR_PTR(-EAGAIN) when it fails */
+	b = ERR_PTR(-EAGAIN);
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
		goto err;
 
	bkey_put(c, &k.key);
@@ -1100,7 +1144,7 @@ retry:
	}
 
	b->parent = parent;
-	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
+	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb));
 
	mutex_unlock(&c->bucket_lock);
 
@@ -1127,7 +1171,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b,
 {
	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
 
-	if (!IS_ERR_OR_NULL(n)) {
+	if (!IS_ERR(n)) {
		mutex_lock(&n->write_lock);
		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
		bkey_copy_key(&n->key, &b->key);
@@ -1150,7 +1194,7 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
 
	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_GEN(k, i,
-			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+			    bch_inc_gen(b->c->cache,
					PTR_BUCKET(b->c, &b->key, i)));
 
	mutex_unlock(&b->c->bucket_lock);
@@ -1159,19 +1203,18 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
	struct cache_set *c = b->c;
-	struct cache *ca;
-	unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
+	struct cache *ca = c->cache;
+	unsigned int reserve = (c->root->level - b->level) * 2 + 1;
 
	mutex_lock(&c->bucket_lock);
 
-	for_each_cache(ca, c, i)
-		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
-			if (op)
-				prepare_to_wait(&c->btree_cache_wait, &op->wait,
-						TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&c->bucket_lock);
-			return -EINTR;
-		}
+	if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&c->bucket_lock);
+		return -EINTR;
+	}
 
	mutex_unlock(&c->bucket_lock);
 
@@ -1264,7 +1307,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
	uint8_t stale = 0;
	unsigned int keys = 0, good_keys = 0;
	struct bkey *k;
-	struct btree_iter iter;
+	struct btree_iter_stack iter;
	struct bset_tree *t;
 
	gc->nodes++;
@@ -1337,12 +1380,12 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
	if (nodes < 2 ||
	    __set_blocks(b->keys.set[0].data, keys,
-			 block_bytes(b->c)) > blocks * (nodes - 1))
+			 block_bytes(b->c->cache)) > blocks * (nodes - 1))
		return 0;
 
	for (i = 0; i < nodes; i++) {
		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
-		if (IS_ERR_OR_NULL(new_nodes[i]))
+		if (IS_ERR(new_nodes[i]))
			goto out_nocoalesce;
	}
@@ -1371,7 +1414,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys + bkey_u64s(k),
-						 block_bytes(b->c)) > blocks)
+						 block_bytes(b->c->cache)) > blocks)
					break;
 
				last = k;
@@ -1387,16 +1430,16 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
			 * though)
			 */
			if (__set_blocks(n1, n1->keys + n2->keys,
-					 block_bytes(b->c)) >
+					 block_bytes(b->c->cache)) >
			    btree_blocks(new_nodes[i]))
-				goto out_nocoalesce;
+				goto out_unlock_nocoalesce;
 
			keys = n2->keys;
			/* Take the key of the node we're getting rid of */
			last = &r->b->key;
		}
 
-		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
		       btree_blocks(new_nodes[i]));
 
		if (last)
@@ -1418,7 +1461,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
		if (__bch_keylist_realloc(&keylist,
					  bkey_u64s(&new_nodes[i]->key)))
-			goto out_nocoalesce;
+			goto out_unlock_nocoalesce;
 
		bch_btree_node_write(new_nodes[i], &cl);
		bch_keylist_add(&keylist, &new_nodes[i]->key);
@@ -1464,6 +1507,10 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
	/* Invalidated our iterator */
	return -EINTR;
 
+out_unlock_nocoalesce:
+	for (i = 0; i < nodes; i++)
+		mutex_unlock(&new_nodes[i]->write_lock);
+
 out_nocoalesce:
	closure_sync(&cl);
 
@@ -1490,6 +1537,8 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
		return 0;
 
	n = btree_node_alloc_replacement(replace, NULL);
+	if (IS_ERR(n))
+		return 0;
 
	/* recheck reserve after allocating replacement node */
	if (btree_check_reserve(b, NULL)) {
@@ -1519,7 +1568,7 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
 static unsigned int btree_gc_count_keys(struct btree *b)
 {
	struct bkey *k;
-	struct btree_iter iter;
+	struct btree_iter_stack iter;
	unsigned int ret = 0;
 
	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
@@ -1530,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b)
 
 static size_t btree_gc_min_nodes(struct cache_set *c)
 {
-	size_t min_nodes;
+	size_t min_nodes = GC_NODES_MIN;
 
-	/*
-	 * Since incremental GC would stop 100ms when front
-	 * side I/O comes, so when there are many btree nodes,
-	 * if GC only processes constant (100) nodes each time,
-	 * GC would last a long time, and the front side I/Os
-	 * would run out of the buckets (since no new bucket
-	 * can be allocated during GC), and be blocked again.
-	 * So GC should not process constant nodes, but varied
-	 * nodes according to the number of btree nodes, which
-	 * realized by dividing GC into constant(100) times,
-	 * so when there are many btree nodes, GC can process
-	 * more nodes each time, otherwise, GC will process less
-	 * nodes each time (but no less than MIN_GC_NODES)
-	 */
-	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
-	if (min_nodes < MIN_GC_NODES)
-		min_nodes = MIN_GC_NODES;
+	if (atomic_read(&c->search_inflight) == 0) {
+		size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT;
+
+		if (min_nodes < n)
+			min_nodes = n;
+	}
 
	return min_nodes;
 }
 
+static uint64_t btree_gc_sleep_ms(struct cache_set *c)
+{
+	uint64_t sleep_ms;
+
+	if (atomic_read(&c->bucket_wait_cnt) > 0)
+		sleep_ms = GC_SLEEP_MS_MIN;
+	else
+		sleep_ms = GC_SLEEP_MS;
+
+	return sleep_ms;
+}
 
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
@@ -1560,17 +1609,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
	int ret = 0;
	bool should_rewrite;
	struct bkey *k;
-	struct btree_iter iter;
+	struct btree_iter_stack iter;
	struct gc_merge_info r[GC_MERGE_NODES];
	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
 
-	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
+	bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done);
 
	for (i = r; i < r + ARRAY_SIZE(r); i++)
		i->b = ERR_PTR(-EINTR);
 
	while (1) {
-		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
+		k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
					       bch_ptr_bad);
		if (k) {
			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
						  true, b);
@@ -1619,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
		r->b = NULL;
 
-		if (atomic_read(&b->c->search_inflight) &&
-		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+		if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) {
			gc->nodes_pre = gc->nodes;
			ret = -EAGAIN;
			break;
		}
@@ -1655,7 +1704,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
	if (should_rewrite) {
		n = btree_node_alloc_replacement(b, NULL);
 
-		if (!IS_ERR_OR_NULL(n)) {
+		if (!IS_ERR(n)) {
			bch_btree_node_write_sync(n);
 
			bch_btree_set_root(n);
@@ -1683,25 +1732,26 @@ static void btree_gc_start(struct cache_set *c)
 {
	struct cache *ca;
	struct bucket *b;
-	unsigned int i;
 
	if (!c->gc_mark_valid)
		return;
 
	mutex_lock(&c->bucket_lock);
 
-	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;
 
-	for_each_cache(ca, c, i)
-		for_each_bucket(b, ca) {
-			b->last_gc = b->gen;
-			if (!atomic_read(&b->pin)) {
-				SET_GC_MARK(b, 0);
-				SET_GC_SECTORS_USED(b, 0);
-			}
+	ca = c->cache;
+	for_each_bucket(b, ca) {
+		b->last_gc = b->gen;
+		if (bch_can_invalidate_bucket(ca, b))
+			b->reclaimable_in_gc = 1;
+		if (!atomic_read(&b->pin)) {
+			SET_GC_MARK(b, 0);
+			SET_GC_SECTORS_USED(b, 0);
		}
+	}
 
+	c->gc_mark_valid = 0;
	mutex_unlock(&c->bucket_lock);
 }
@@ -1709,7 +1759,8 @@ static void bch_btree_gc_finish(struct cache_set *c)
 {
	struct bucket *b;
	struct cache *ca;
-	unsigned int i;
+	unsigned int i, j;
+	uint64_t *k;
 
	mutex_lock(&c->bucket_lock);
 
@@ -1727,7 +1778,6 @@ static void bch_btree_gc_finish(struct cache_set *c)
		struct bcache_device *d = c->devices[i];
		struct cached_dev *dc;
		struct keybuf_key *w, *n;
-		unsigned int j;
 
		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
			continue;
@@ -1744,29 +1794,30 @@ static void bch_btree_gc_finish(struct cache_set *c)
	rcu_read_unlock();
 
	c->avail_nbuckets = 0;
-	for_each_cache(ca, c, i) {
-		uint64_t *i;
-		ca->invalidate_needs_gc = 0;
+	ca = c->cache;
+	ca->invalidate_needs_gc = 0;
 
-		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
 
-		for (i = ca->prio_buckets;
-		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->prio_buckets;
+	     k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
 
-		for_each_bucket(b, ca) {
-			c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+	for_each_bucket(b, ca) {
+		c->need_gc = max(c->need_gc, bucket_gc_gen(b));
 
-			if (atomic_read(&b->pin))
-				continue;
+		if (b->reclaimable_in_gc)
+			b->reclaimable_in_gc = 0;
 
-			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+		if (atomic_read(&b->pin))
+			continue;
 
-			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
-				c->avail_nbuckets++;
-		}
+		BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+
+		if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+			c->avail_nbuckets++;
	}
 
	mutex_unlock(&c->bucket_lock);
@@ -1795,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c)
		cond_resched();
 
		if (ret == -EAGAIN)
-			schedule_timeout_interruptible(msecs_to_jiffies
-						       (GC_SLEEP_MS));
+			schedule_timeout_interruptible(
				msecs_to_jiffies(btree_gc_sleep_ms(c)));
		else if (ret)
			pr_warn("gc failed!\n");
	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -1818,12 +1869,10 @@ static void bch_btree_gc(struct cache_set *c)
 
 static bool gc_should_run(struct cache_set *c)
 {
-	struct cache *ca;
-	unsigned int i;
+	struct cache *ca = c->cache;
 
-	for_each_cache(ca, c, i)
-		if (ca->invalidate_needs_gc)
-			return true;
+	if (ca->invalidate_needs_gc)
+		return true;
 
	if (atomic_read(&c->sectors_to_gc) < 0)
		return true;
@@ -1865,7 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 {
	int ret = 0;
	struct bkey *k, *p = NULL;
-	struct btree_iter iter;
+	struct btree_iter_stack iter;
 
	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
		bch_initial_mark_key(b->c, b->level, k);
@@ -1873,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
	bch_initial_mark_key(b->c, b->level + 1, &b->key);
 
	if (b->level) {
-		bch_btree_iter_init(&b->keys, &iter, NULL);
+		bch_btree_iter_stack_init(&b->keys, &iter, NULL);
 
		do {
-			k = bch_btree_iter_next_filter(&iter, &b->keys,
+			k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
						       bch_ptr_bad);
			if (k) {
				btree_node_prefetch(b, k);
@@ -1904,7 +1953,7 @@ static int bch_btree_check_thread(void *arg)
	struct btree_check_info *info = arg;
	struct btree_check_state *check_state = info->state;
	struct cache_set *c = check_state->c;
-	struct btree_iter iter;
+	struct btree_iter_stack iter;
	struct bkey *k, *p;
	int cur_idx, prev_idx, skip_nr;
 
@@ -1913,8 +1962,8 @@ static int bch_btree_check_thread(void *arg)
	ret = 0;
 
	/* root node keys are checked before thread created */
-	bch_btree_iter_init(&c->root->keys, &iter, NULL);
-	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+	bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+	k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
	BUG_ON(!k);
 
	p = k;
@@ -1932,7 +1981,7 @@ static int bch_btree_check_thread(void *arg)
		skip_nr = cur_idx - prev_idx;
 
		while (skip_nr) {
-			k = bch_btree_iter_next_filter(&iter,
+			k = bch_btree_iter_next_filter(&iter.iter,
						       &c->root->keys,
						       bch_ptr_bad);
			if (k)
@@ -1959,6 +2008,15 @@ static int bch_btree_check_thread(void *arg)
			c->gc_stats.nodes++;
			bch_btree_op_init(&op, 0);
			ret = bcache_btree(check_recurse, p, c->root, &op);
+			/*
+			 * The op may be added to cache_set's btree_cache_wait
+			 * in mca_cannibalize(), must ensure it is removed from
+			 * the list and release btree_cache_alloc_lock before
+			 * free op memory.
+			 * Otherwise, the btree_cache_wait will be damaged.
+			 */
+			bch_cannibalize_unlock(c);
+			finish_wait(&c->btree_cache_wait, &(&op)->wait);
			if (ret)
				goto out;
		}
@@ -1996,9 +2054,8 @@ int bch_btree_check(struct cache_set *c)
	int ret = 0;
	int i;
	struct bkey *k = NULL;
-	struct btree_iter iter;
-	struct btree_check_state *check_state;
-	char name[32];
+	struct btree_iter_stack iter;
+	struct btree_check_state check_state;
 
	/* check and mark root node keys */
	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
@@ -2009,69 +2066,66 @@ int bch_btree_check(struct cache_set *c)
	if (c->root->level == 0)
		return 0;
 
-	check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
-	if (!check_state)
-		return -ENOMEM;
-
-	check_state->c = c;
-	check_state->total_threads = bch_btree_chkthread_nr();
-	check_state->key_idx = 0;
-	spin_lock_init(&check_state->idx_lock);
-	atomic_set(&check_state->started, 0);
-	atomic_set(&check_state->enough, 0);
-	init_waitqueue_head(&check_state->wait);
+	memset(&check_state, 0, sizeof(struct btree_check_state));
+	check_state.c = c;
+	check_state.total_threads = bch_btree_chkthread_nr();
+	check_state.key_idx = 0;
+	spin_lock_init(&check_state.idx_lock);
+	atomic_set(&check_state.started, 0);
+	atomic_set(&check_state.enough, 0);
+	init_waitqueue_head(&check_state.wait);
+	rw_lock(0, c->root, c->root->level);
 
	/*
	 * Run multiple threads to check btree nodes in parallel,
-	 * if check_state->enough is non-zero, it means current
+	 * if check_state.enough is non-zero, it means current
	 * running check threads are enough, unncessary to create
	 * more.
	 */
-	for (i = 0; i < check_state->total_threads; i++) {
-		/* fetch latest check_state->enough earlier */
+	for (i = 0; i < check_state.total_threads; i++) {
+		/* fetch latest check_state.enough earlier */
		smp_mb__before_atomic();
-		if (atomic_read(&check_state->enough))
+		if (atomic_read(&check_state.enough))
			break;
 
-		check_state->infos[i].result = 0;
-		check_state->infos[i].state = check_state;
-		snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
-		atomic_inc(&check_state->started);
+		check_state.infos[i].result = 0;
+		check_state.infos[i].state = &check_state;
 
-		check_state->infos[i].thread =
+		check_state.infos[i].thread =
			kthread_run(bch_btree_check_thread,
-				    &check_state->infos[i],
-				    name);
-		if (IS_ERR(check_state->infos[i].thread)) {
+				    &check_state.infos[i],
+				    "bch_btrchk[%d]", i);
+		if (IS_ERR(check_state.infos[i].thread)) {
			pr_err("fails to run thread bch_btrchk[%d]\n", i);
			for (--i; i >= 0; i--)
-				kthread_stop(check_state->infos[i].thread);
+				kthread_stop(check_state.infos[i].thread);
			ret = -ENOMEM;
			goto out;
		}
+		atomic_inc(&check_state.started);
	}
 
-	wait_event_interruptible(check_state->wait,
-				 atomic_read(&check_state->started) == 0 ||
-				 test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+	/*
+	 * Must wait for all threads to stop.
+	 */
+	wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
 
-	for (i = 0; i < check_state->total_threads; i++) {
-		if (check_state->infos[i].result) {
-			ret = check_state->infos[i].result;
+	for (i = 0; i < check_state.total_threads; i++) {
+		if (check_state.infos[i].result) {
+			ret = check_state.infos[i].result;
			goto out;
		}
	}
 
 out:
-	kfree(check_state);
+	rw_unlock(0, c->root);
	return ret;
 }
 
 void bch_initial_gc_finish(struct cache_set *c)
 {
-	struct cache *ca;
+	struct cache *ca = c->cache;
	struct bucket *b;
-	unsigned int i;
 
	bch_btree_gc_finish(c);
@@ -2086,20 +2140,18 @@ void bch_initial_gc_finish(struct cache_set *c)
	 * This is only safe for buckets that have no live data in them, which
	 * there should always be some of.
	 */
-	for_each_cache(ca, c, i) {
-		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
-			    fifo_full(&ca->free[RESERVE_BTREE]))
-				break;
+	for_each_bucket(b, ca) {
+		if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+		    fifo_full(&ca->free[RESERVE_BTREE]))
+			break;
 
-			if (bch_can_invalidate_bucket(ca, b) &&
-			    !GC_MARK(b)) {
-				__bch_invalidate_one_bucket(ca, b);
-				if (!fifo_push(&ca->free[RESERVE_PRIO],
-					       b - ca->buckets))
-					fifo_push(&ca->free[RESERVE_BTREE],
-						  b - ca->buckets);
-			}
+		if (bch_can_invalidate_bucket(ca, b) &&
+		    !GC_MARK(b)) {
+			__bch_invalidate_one_bucket(ca, b);
+			if (!fifo_push(&ca->free[RESERVE_PRIO],
+				       b - ca->buckets))
+				fifo_push(&ca->free[RESERVE_BTREE],
					  b - ca->buckets);
		}
	}
@@ -2207,7 +2259,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
		goto err;
 
	split = set_blocks(btree_bset_first(n1),
-			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
+			   block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5;
 
	if (split) {
		unsigned int keys = 0;
@@ -2498,11 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
	if (b->level) {
		struct bkey *k;
-		struct btree_iter iter;
+		struct btree_iter_stack iter;
 
-		bch_btree_iter_init(&b->keys, &iter, from);
+		bch_btree_iter_stack_init(&b->keys, &iter, from);
 
-		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
+		while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
						       bch_ptr_bad))) {
			ret = bcache_btree(map_nodes_recurse, k, b,
					   op, from, fn, flags);
@@ -2531,11 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
 {
	int ret = MAP_CONTINUE;
	struct bkey *k;
-	struct btree_iter iter;
+	struct btree_iter_stack iter;
 
-	bch_btree_iter_init(&b->keys, &iter, from);
+	bch_btree_iter_stack_init(&b->keys, &iter, from);
 
-	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
+	while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+					       bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: bcache_btree(map_keys_recurse, k,
@@ -2760,3 +2813,19 @@ void bch_keybuf_init(struct keybuf *buf)
	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
 }
+
+void bch_btree_exit(void)
+{
+	if (btree_io_wq)
+		destroy_workqueue(btree_io_wq);
+}
+
+int __init bch_btree_init(void)
+{
+	btree_io_wq = alloc_workqueue("bch_btree_io",
+				      WQ_MEM_RECLAIM | WQ_PERCPU, 0);
+	if (!btree_io_wq)
+		return -ENOMEM;
+
+	return 0;
+}
