diff options
Diffstat (limited to 'drivers/md/bcache')
26 files changed, 592 insertions, 1194 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 529c9d04e9a4..b2d10063d35f 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -4,6 +4,7 @@ config BCACHE tristate "Block device as cache" select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 + select CLOSURES help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. @@ -19,15 +20,6 @@ config BCACHE_DEBUG Enables extra debugging tools, allows expensive runtime checks to be turned on. -config BCACHE_CLOSURES_DEBUG - bool "Debug closures" - depends on BCACHE - select DEBUG_FS - help - Keeps all active closures in a linked list and provides a debugfs - interface to list them, which makes it possible to see asynchronous - operations that get stuck. - config BCACHE_ASYNC_REGISTRATION bool "Asynchronous device registration" depends on BCACHE diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 5b87e59676b8..054e8a33a7ab 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -2,6 +2,6 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ - io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ +bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ + journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ce13c272c387..7708d92df23e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -24,21 +24,18 @@ * Since the gens and priorities are all stored contiguously on disk, we can * batch this up: We fill up the free_inc list with freshly invalidated buckets, * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. + * free_inc list. * * free_inc isn't the only freelist - if it was, we'd often to sleep while * priorities and gens were being written before we could allocate. c->free is a * smaller freelist, and buckets on that list are always ready to be used. * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * * There is another freelist, because sometimes we have buckets that we know * have nothing pointing into them - these we can reuse without waiting for * priorities to be rewritten. These come from freed btree nodes and buckets * that garbage collection discovered no longer had valid keys pointing into * them (because they were overwritten). That's the unused list - buckets on the - * unused list move to the free list, optionally being discarded in the process. + * unused list move to the free list. * * It's also important to ensure that gens don't wrap around - with respect to * either the oldest gen in the btree or the gen on disk. This is quite @@ -118,8 +115,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) /* * Background allocation thread: scans for buckets to be invalidated, * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. + * then puts them on the various freelists. */ static inline bool can_inc_bucket_gen(struct bucket *b) @@ -129,12 +125,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b) bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(!ca->set->gc_mark_valid); - - return (!GC_MARK(b) || - GC_MARK(b) == GC_MARK_RECLAIMABLE) && - !atomic_read(&b->pin) && - can_inc_bucket_gen(b); + return (ca->set->gc_mark_valid || b->reclaimable_in_gc) && + ((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) && + !atomic_read(&b->pin) && can_inc_bucket_gen(b)); } void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -148,6 +141,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); + b->reclaimable_in_gc = 0; } static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -323,8 +317,7 @@ static int bch_allocator_thread(void *arg) while (1) { /* * First, we pull buckets off of the unused and free_inc lists, - * possibly issue discards to them, then we add the bucket to - * the free list: + * then we add the bucket to the free list: */ while (1) { long bucket; @@ -332,14 +325,6 @@ static int bch_allocator_thread(void *arg) if (!fifo_pop(&ca->free_inc, bucket)) break; - if (ca->discard) { - mutex_unlock(&ca->set->bucket_lock); - blkdev_issue_discard(ca->bdev, - bucket_to_sector(ca->set, bucket), - ca->sb.bucket_size, GFP_KERNEL); - mutex_lock(&ca->set->bucket_lock); - } - allocator_wait(ca, bch_allocator_push(ca, bucket)); wake_up(&ca->set->btree_cache_wait); wake_up(&ca->set->bucket_wait); @@ -352,8 +337,7 @@ static int bch_allocator_thread(void *arg) */ retry_invalidate: - allocator_wait(ca, ca->set->gc_mark_valid && - !ca->invalidate_needs_gc); + allocator_wait(ca, !ca->invalidate_needs_gc); invalidate_buckets(ca); /* @@ -415,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) TASK_UNINTERRUPTIBLE); mutex_unlock(&ca->set->bucket_lock); + + atomic_inc(&ca->set->bucket_wait_cnt); schedule(); + atomic_dec(&ca->set->bucket_wait_cnt); + mutex_lock(&ca->set->bucket_lock); } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && !fifo_pop(&ca->free[reserve], r)); @@ -501,8 +489,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, ca = c->cache; b = bch_bucket_alloc(ca, reserve, wait); - if (b == -1) - goto err; + if (b < 0) + return -1; k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, bucket_to_sector(c, b), @@ -511,10 +499,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, SET_KEY_PTRS(k, 1); return 0; -err: - bch_bucket_free(c, k); - bkey_put(c, k); - return -1; } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index aebb7ef10e63..8ccacba85547 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -179,6 +179,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ #include <linux/bio.h> +#include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mutex.h> @@ -192,7 +193,6 @@ #include "bcache_ondisk.h" #include "bset.h" #include "util.h" -#include "closure.h" struct bucket { atomic_t pin; @@ -200,6 +200,7 @@ struct bucket { uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint16_t gc_mark; /* Bitfield used by GC. See below for field */ + uint16_t reclaimable_in_gc:1; }; /* @@ -265,6 +266,7 @@ struct bcache_device { #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 int nr_stripes; +#define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT) unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; @@ -275,7 +277,7 @@ struct bcache_device { int (*cache_miss)(struct btree *b, struct search *s, struct bio *bio, unsigned int sectors); - int (*ioctl)(struct bcache_device *d, fmode_t mode, + int (*ioctl)(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg); }; @@ -299,6 +301,7 @@ struct cached_dev { struct list_head list; struct bcache_device disk; struct block_device *bdev; + struct file *bdev_file; struct cache_sb sb; struct cache_sb_disk *sb_disk; @@ -421,6 +424,7 @@ struct cache { struct kobject kobj; struct block_device *bdev; + struct file *bdev_file; struct task_struct *alloc_thread; @@ -443,8 +447,7 @@ struct cache { * free_inc: Incoming buckets - these are buckets that currently have * cached data in them, and we can't reuse them until after we write * their new gen to disk. After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) + * gens/prios, they'll be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); @@ -463,8 +466,6 @@ struct cache { */ unsigned int invalidate_needs_gc; - bool discard; /* Get rid of? */ - struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -541,7 +542,7 @@ struct cache_set { struct bio_set bio_split; /* For the btree cache */ - struct shrinker shrink; + struct shrinker *shrink; /* For the btree cache and anything allocation related */ struct mutex bucket_lock; @@ -603,6 +604,7 @@ struct cache_set { */ atomic_t prio_blocked; wait_queue_head_t bucket_wait; + atomic_t bucket_wait_cnt; /* * For any bio we don't skip we subtract the number of sectors from @@ -1004,11 +1006,11 @@ extern struct workqueue_struct *bch_flush_wq; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; -extern struct kobj_type bch_cached_dev_ktype; -extern struct kobj_type bch_flash_dev_ktype; -extern struct kobj_type bch_cache_set_ktype; -extern struct kobj_type bch_cache_set_internal_ktype; -extern struct kobj_type bch_cache_ktype; +extern const struct kobj_type bch_cached_dev_ktype; +extern const struct kobj_type bch_flash_dev_ktype; +extern const struct kobj_type bch_cache_set_ktype; +extern const struct kobj_type bch_cache_set_internal_ktype; +extern const struct kobj_type bch_cache_ktype; void bch_cached_dev_release(struct kobject *kobj); void bch_flash_dev_release(struct kobject *kobj); diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h index f96034e0ba4f..6620a7f8fffc 100644 --- a/drivers/md/bcache/bcache_ondisk.h +++ b/drivers/md/bcache/bcache_ondisk.h @@ -360,8 +360,8 @@ struct jset { __u64 prio_bucket[MAX_CACHES_PER_SET]; union { - struct bkey start[0]; - __u64 d[0]; + DECLARE_FLEX_ARRAY(struct bkey, start); + DECLARE_FLEX_ARRAY(__u64, d); }; }; @@ -425,8 +425,8 @@ struct bset { __u32 keys; union { - struct bkey start[0]; - __u64 d[0]; + DECLARE_FLEX_ARRAY(struct bkey, start); + DECLARE_FLEX_ARRAY(__u64, d); }; }; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 2bba4d6aaaa2..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,7 +54,7 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; if (b->ops->is_extents) @@ -67,7 +67,7 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; for_each_key(b, k, &iter) { @@ -879,7 +879,7 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; @@ -895,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1100,33 +1100,33 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->size = ARRAY_SIZE(iter->data); - iter->used = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, @@ -1293,10 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1307,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1323,11 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_stack_init(b, &iter, NULL); - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index d795c84246b0..6ee2c6a506a2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -321,9 +321,20 @@ struct btree_iter { #endif struct btree_iter_set { struct bkey *k, *end; - } data[MAX_BSETS]; + } data[]; }; +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + /* Must be last as it ends in a flexible-array member. */ + TRAILING_OVERLAP(struct btree_iter, iter, data, + struct btree_iter_set stack_data[MAX_BSETS]; + ); +}; +static_assert(offsetof(struct btree_iter_stack, iter.data) == + offsetof(struct btree_iter_stack, stack_data)); + typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); struct bkey *bch_btree_iter_next(struct btree_iter *iter); @@ -333,9 +344,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -350,13 +361,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 147c493a989a..3ed39c823826 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -36,6 +36,7 @@ #include <linux/sched/clock.h> #include <linux/rculist.h> #include <linux/delay.h> +#include <linux/sort.h> #include <trace/events/bcache.h> /* @@ -88,10 +89,9 @@ * Test module load/unload */ -#define MAX_NEED_GC 64 -#define MAX_SAVE_PRIO 72 -#define MAX_GC_TIMES 100 -#define MIN_GC_NODES 100 +#define MAX_GC_TIMES_SHIFT 7 /* 128 loops */ +#define GC_NODES_MIN 10 +#define GC_SLEEP_MS_MIN 10 #define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) @@ -293,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) w->journal = NULL; } -static void btree_node_write_unlock(struct closure *cl) +static CLOSURE_CALLBACK(btree_node_write_unlock) { - struct btree *b = container_of(cl, struct btree, io); + closure_type(b, struct btree, io); up(&b->io_mutex); } -static void __btree_node_write_done(struct closure *cl) +static CLOSURE_CALLBACK(__btree_node_write_done) { - struct btree *b = container_of(cl, struct btree, io); + closure_type(b, struct btree, io); struct btree_write *w = btree_prev_write(b); bch_bbio_free(b->bio, b->c); @@ -315,12 +315,12 @@ static void __btree_node_write_done(struct closure *cl) closure_return_with_destructor(cl, btree_node_write_unlock); } -static void btree_node_write_done(struct closure *cl) +static CLOSURE_CALLBACK(btree_node_write_done) { - struct btree *b = container_of(cl, struct btree, io); + closure_type(b, struct btree, io); bio_free_pages(b->bio); - __btree_node_write_done(cl); + __btree_node_write_done(&cl->work); } static void btree_node_write_endio(struct bio *bio) @@ -372,7 +372,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) { struct bio_vec *bv; void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); struct bvec_iter_all iter_all; @@ -559,6 +559,25 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) } } +#ifdef CONFIG_PROVE_LOCKING +static int btree_lock_cmp_fn(const struct lockdep_map *_a, + const struct lockdep_map *_b) +{ + const struct btree *a = container_of(_a, struct btree, lock.dep_map); + const struct btree *b = container_of(_b, struct btree, lock.dep_map); + + return -cmp_int(a->level, b->level) ?: bkey_cmp(&a->key, &b->key); +} + +static void btree_lock_print_fn(const struct lockdep_map *map) +{ + const struct btree *b = container_of(map, struct btree, lock.dep_map); + + printk(KERN_CONT " l=%u %llu:%llu", b->level, + KEY_INODE(&b->key), KEY_OFFSET(&b->key)); +} +#endif + static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { @@ -572,7 +591,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return NULL; init_rwsem(&b->lock); - lockdep_set_novalidate_class(&b->lock); + lock_set_cmp_fn(&b->lock, btree_lock_cmp_fn, btree_lock_print_fn); mutex_init(&b->write_lock); lockdep_set_novalidate_class(&b->write_lock); INIT_LIST_HEAD(&b->list); @@ -646,7 +665,7 @@ out_unlock: static unsigned long bch_mca_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; struct btree *b, *t; unsigned long i, nr = sc->nr_to_scan; unsigned long freed = 0; @@ -713,7 +732,7 @@ out: static unsigned long bch_mca_count(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; if (c->shrinker_disabled) return 0; @@ -731,8 +750,8 @@ void bch_btree_cache_free(struct cache_set *c) closure_init_stack(&cl); - if (c->shrink.list.next) - unregister_shrinker(&c->shrink); + if (c->shrink) + shrinker_free(c->shrink); mutex_lock(&c->bucket_lock); @@ -807,14 +826,19 @@ int bch_btree_cache_alloc(struct cache_set *c) c->verify_data = NULL; #endif - c->shrink.count_objects = bch_mca_count; - c->shrink.scan_objects = bch_mca_scan; - c->shrink.seeks = 4; - c->shrink.batch = c->btree_pages * 2; + c->shrink = shrinker_alloc(0, "md-bcache:%pU", c->set_uuid); + if (!c->shrink) { + pr_warn("bcache: %s: could not allocate shrinker\n", __func__); + return 0; + } + + c->shrink->count_objects = bch_mca_count; + c->shrink->scan_objects = bch_mca_scan; + c->shrink->seeks = 4; + c->shrink->batch = c->btree_pages * 2; + c->shrink->private_data = c; - if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) - pr_warn("bcache: %s: could not register shrinker\n", - __func__); + shrinker_register(c->shrink); return 0; } @@ -885,7 +909,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -static void bch_cannibalize_unlock(struct cache_set *c) +void bch_cannibalize_unlock(struct cache_set *c) { spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { @@ -974,6 +998,9 @@ err: * * The btree node will have either a read or a write lock held, depending on * level and op->lock. + * + * Note: Only error code or btree pointer will be returned, it is unncessary + * for callers to check NULL pointer. */ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, struct bkey *k, int level, bool write, @@ -1085,15 +1112,21 @@ retry: mutex_unlock(&b->c->bucket_lock); } +/* + * Only error code or btree pointer will be returned, it is unncessary for + * callers to check NULL pointer. + */ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, int level, bool wait, struct btree *parent) { BKEY_PADDED(key) k; - struct btree *b = ERR_PTR(-EAGAIN); + struct btree *b; mutex_lock(&c->bucket_lock); retry: + /* return ERR_PTR(-EAGAIN) when it fails */ + b = ERR_PTR(-EAGAIN); if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; @@ -1138,7 +1171,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, { struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bkey_copy_key(&n->key, &b->key); @@ -1274,7 +1307,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; gc->nodes++; @@ -1352,7 +1385,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = 0; i < nodes; i++) { new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); - if (IS_ERR_OR_NULL(new_nodes[i])) + if (IS_ERR(new_nodes[i])) goto out_nocoalesce; } @@ -1504,6 +1537,8 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, return 0; n = btree_node_alloc_replacement(replace, NULL); + if (IS_ERR(n)) + return 0; /* recheck reserve after allocating replacement node */ if (btree_check_reserve(b, NULL)) { @@ -1533,7 +1568,7 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) @@ -1544,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b) static size_t btree_gc_min_nodes(struct cache_set *c) { - size_t min_nodes; + size_t min_nodes = GC_NODES_MIN; - /* - * Since incremental GC would stop 100ms when front - * side I/O comes, so when there are many btree nodes, - * if GC only processes constant (100) nodes each time, - * GC would last a long time, and the front side I/Os - * would run out of the buckets (since no new bucket - * can be allocated during GC), and be blocked again. - * So GC should not process constant nodes, but varied - * nodes according to the number of btree nodes, which - * realized by dividing GC into constant(100) times, - * so when there are many btree nodes, GC can process - * more nodes each time, otherwise, GC will process less - * nodes each time (but no less than MIN_GC_NODES) - */ - min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; - if (min_nodes < MIN_GC_NODES) - min_nodes = MIN_GC_NODES; + if (atomic_read(&c->search_inflight) == 0) { + size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT; + + if (min_nodes < n) + min_nodes = n; + } return min_nodes; } +static uint64_t btree_gc_sleep_ms(struct cache_set *c) +{ + uint64_t sleep_ms; + + if (atomic_read(&c->bucket_wait_cnt) > 0) + sleep_ms = GC_SLEEP_MS_MIN; + else + sleep_ms = GC_SLEEP_MS; + + return sleep_ms; +} static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) @@ -1574,17 +1609,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1633,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; - if (atomic_read(&b->c->search_inflight) && - gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) { gc->nodes_pre = gc->nodes; ret = -EAGAIN; break; @@ -1669,7 +1704,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (should_rewrite) { n = btree_node_alloc_replacement(b, NULL); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { bch_btree_node_write_sync(n); bch_btree_set_root(n); @@ -1703,18 +1738,20 @@ static void btree_gc_start(struct cache_set *c) mutex_lock(&c->bucket_lock); - c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; ca = c->cache; for_each_bucket(b, ca) { b->last_gc = b->gen; + if (bch_can_invalidate_bucket(ca, b)) + b->reclaimable_in_gc = 1; if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } + c->gc_mark_valid = 0; mutex_unlock(&c->bucket_lock); } @@ -1771,6 +1808,9 @@ static void bch_btree_gc_finish(struct cache_set *c) for_each_bucket(b, ca) { c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + if (b->reclaimable_in_gc) + b->reclaimable_in_gc = 0; + if (atomic_read(&b->pin)) continue; @@ -1806,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c) cond_resched(); if (ret == -EAGAIN) - schedule_timeout_interruptible(msecs_to_jiffies - (GC_SLEEP_MS)); + schedule_timeout_interruptible( + msecs_to_jiffies(btree_gc_sleep_ms(c))); else if (ret) pr_warn("gc failed!\n"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -1874,7 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1882,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1913,7 +1953,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1922,8 +1962,8 @@ static int bch_btree_check_thread(void *arg) ret = 0; /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1941,7 +1981,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -1968,6 +2008,15 @@ static int bch_btree_check_thread(void *arg) c->gc_stats.nodes++; bch_btree_op_init(&op, 0); ret = bcache_btree(check_recurse, p, c->root, &op); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op)->wait); if (ret) goto out; } @@ -2005,7 +2054,7 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; /* check and mark root node keys */ @@ -2501,11 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2534,11 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : bcache_btree(map_keys_recurse, k, @@ -2772,7 +2822,8 @@ void bch_btree_exit(void) int __init bch_btree_init(void) { - btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); + btree_io_wq = alloc_workqueue("bch_btree_io", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!btree_io_wq) return -ENOMEM; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 1b5fdbc0d83e..45d64b54115a 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -247,8 +247,8 @@ static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) static inline void rw_lock(bool w, struct btree *b, int level) { - w ? down_write_nested(&b->lock, level + 1) - : down_read_nested(&b->lock, level + 1); + w ? down_write(&b->lock) + : down_read(&b->lock); if (w) b->seq++; } @@ -282,6 +282,7 @@ void bch_initial_gc_finish(struct cache_set *c); void bch_moving_gc(struct cache_set *c); int bch_btree_check(struct cache_set *c); void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); +void bch_cannibalize_unlock(struct cache_set *c); static inline void wake_up_gc(struct cache_set *c) { diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c deleted file mode 100644 index d8d9394a6beb..000000000000 --- a/drivers/md/bcache/closure.c +++ /dev/null @@ -1,207 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Asynchronous refcounty things - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/seq_file.h> -#include <linux/sched/debug.h> - -#include "closure.h" - -static inline void closure_put_after_sub(struct closure *cl, int flags) -{ - int r = flags & CLOSURE_REMAINING_MASK; - - BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); - - if (!r) { - if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { - atomic_set(&cl->remaining, - CLOSURE_REMAINING_INITIALIZER); - closure_queue(cl); - } else { - struct closure *parent = cl->parent; - closure_fn *destructor = cl->fn; - - closure_debug_destroy(cl); - - if (destructor) - destructor(cl); - - if (parent) - closure_put(parent); - } - } -} - -/* For clearing flags with the same atomic op as a put */ -void closure_sub(struct closure *cl, int v) -{ - closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); -} - -/* - * closure_put - decrement a closure's refcount - */ -void closure_put(struct closure *cl) -{ - closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); -} - -/* - * closure_wake_up - wake up all closures on a wait list, without memory barrier - */ -void __closure_wake_up(struct closure_waitlist *wait_list) -{ - struct llist_node *list; - struct closure *cl, *t; - struct llist_node *reverse = NULL; - - list = llist_del_all(&wait_list->list); - - /* We first reverse the list to preserve FIFO ordering and fairness */ - reverse = llist_reverse_order(list); - - /* Then do the wakeups */ - llist_for_each_entry_safe(cl, t, reverse, list) { - closure_set_waiting(cl, 0); - closure_sub(cl, CLOSURE_WAITING + 1); - } -} - -/** - * closure_wait - add a closure to a waitlist - * @waitlist: will own a ref on @cl, which will be released when - * closure_wake_up() is called on @waitlist. - * @cl: closure pointer. - * - */ -bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) -{ - if (atomic_read(&cl->remaining) & CLOSURE_WAITING) - return false; - - closure_set_waiting(cl, _RET_IP_); - atomic_add(CLOSURE_WAITING + 1, &cl->remaining); - llist_add(&cl->list, &waitlist->list); - - return true; -} - -struct closure_syncer { - struct task_struct *task; - int done; -}; - -static void closure_sync_fn(struct closure *cl) -{ - struct closure_syncer *s = cl->s; - struct task_struct *p; - - rcu_read_lock(); - p = READ_ONCE(s->task); - s->done = 1; - wake_up_process(p); - rcu_read_unlock(); -} - -void __sched __closure_sync(struct closure *cl) -{ - struct closure_syncer s = { .task = current }; - - cl->s = &s; - continue_at(cl, closure_sync_fn, NULL); - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (s.done) - break; - schedule(); - } - - __set_current_state(TASK_RUNNING); -} - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - -static LIST_HEAD(closure_list); -static DEFINE_SPINLOCK(closure_list_lock); - -void closure_debug_create(struct closure *cl) -{ - unsigned long flags; - - BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); - cl->magic = CLOSURE_MAGIC_ALIVE; - - spin_lock_irqsave(&closure_list_lock, flags); - list_add(&cl->all, &closure_list); - spin_unlock_irqrestore(&closure_list_lock, flags); -} - -void closure_debug_destroy(struct closure *cl) -{ - unsigned long flags; - - BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); - cl->magic = CLOSURE_MAGIC_DEAD; - - spin_lock_irqsave(&closure_list_lock, flags); - list_del(&cl->all); - spin_unlock_irqrestore(&closure_list_lock, flags); -} - -static struct dentry *closure_debug; - -static int debug_show(struct seq_file *f, void *data) -{ - struct closure *cl; - - spin_lock_irq(&closure_list_lock); - - list_for_each_entry(cl, &closure_list, all) { - int r = atomic_read(&cl->remaining); - - seq_printf(f, "%p: %pS -> %pS p %p r %i ", - cl, (void *) cl->ip, cl->fn, cl->parent, - r & CLOSURE_REMAINING_MASK); - - seq_printf(f, "%s%s\n", - test_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(&cl->work)) ? "Q" : "", - r & CLOSURE_RUNNING ? "R" : ""); - - if (r & CLOSURE_WAITING) - seq_printf(f, " W %pS\n", - (void *) cl->waiting_on); - - seq_printf(f, "\n"); - } - - spin_unlock_irq(&closure_list_lock); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(debug); - -void __init closure_debug_init(void) -{ - if (!IS_ERR_OR_NULL(bcache_debug)) - /* - * it is unnecessary to check return value of - * debugfs_create_file(), we should not care - * about this. - */ - closure_debug = debugfs_create_file( - "closures", 0400, bcache_debug, NULL, &debug_fops); -} -#endif - -MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); -MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h deleted file mode 100644 index c88cdc4ae4ec..000000000000 --- a/drivers/md/bcache/closure.h +++ /dev/null @@ -1,378 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CLOSURE_H -#define _LINUX_CLOSURE_H - -#include <linux/llist.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/workqueue.h> - -/* - * Closure is perhaps the most overused and abused term in computer science, but - * since I've been unable to come up with anything better you're stuck with it - * again. - * - * What are closures? - * - * They embed a refcount. The basic idea is they count "things that are in - * progress" - in flight bios, some other thread that's doing something else - - * anything you might want to wait on. - * - * The refcount may be manipulated with closure_get() and closure_put(). - * closure_put() is where many of the interesting things happen, when it causes - * the refcount to go to 0. - * - * Closures can be used to wait on things both synchronously and asynchronously, - * and synchronous and asynchronous use can be mixed without restriction. To - * wait synchronously, use closure_sync() - you will sleep until your closure's - * refcount hits 1. - * - * To wait asynchronously, use - * continue_at(cl, next_function, workqueue); - * - * passing it, as you might expect, the function to run when nothing is pending - * and the workqueue to run that function out of. - * - * continue_at() also, critically, requires a 'return' immediately following the - * location where this macro is referenced, to return to the calling function. - * There's good reason for this. - * - * To use safely closures asynchronously, they must always have a refcount while - * they are running owned by the thread that is running them. Otherwise, suppose - * you submit some bios and wish to have a function run when they all complete: - * - * foo_endio(struct bio *bio) - * { - * closure_put(cl); - * } - * - * closure_init(cl); - * - * do_stuff(); - * closure_get(cl); - * bio1->bi_endio = foo_endio; - * bio_submit(bio1); - * - * do_more_stuff(); - * closure_get(cl); - * bio2->bi_endio = foo_endio; - * bio_submit(bio2); - * - * continue_at(cl, complete_some_read, system_wq); - * - * If closure's refcount started at 0, complete_some_read() could run before the - * second bio was submitted - which is almost always not what you want! More - * importantly, it wouldn't be possible to say whether the original thread or - * complete_some_read()'s thread owned the closure - and whatever state it was - * associated with! - * - * So, closure_init() initializes a closure's refcount to 1 - and when a - * closure_fn is run, the refcount will be reset to 1 first. - * - * Then, the rule is - if you got the refcount with closure_get(), release it - * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount - * on a closure because you called closure_init() or you were run out of a - * closure - _always_ use continue_at(). Doing so consistently will help - * eliminate an entire class of particularly pernicious races. - * - * Lastly, you might have a wait list dedicated to a specific event, and have no - * need for specifying the condition - you just want to wait until someone runs - * closure_wake_up() on the appropriate wait list. In that case, just use - * closure_wait(). It will return either true or false, depending on whether the - * closure was already on a wait list or not - a closure can only be on one wait - * list at a time. - * - * Parents: - * - * closure_init() takes two arguments - it takes the closure to initialize, and - * a (possibly null) parent. - * - * If parent is non null, the new closure will have a refcount for its lifetime; - * a closure is considered to be "finished" when its refcount hits 0 and the - * function to run is null. Hence - * - * continue_at(cl, NULL, NULL); - * - * returns up the (spaghetti) stack of closures, precisely like normal return - * returns up the C stack. continue_at() with non null fn is better thought of - * as doing a tail call. - * - * All this implies that a closure should typically be embedded in a particular - * struct (which its refcount will normally control the lifetime of), and that - * struct can very much be thought of as a stack frame. - */ - -struct closure; -struct closure_syncer; -typedef void (closure_fn) (struct closure *); -extern struct dentry *bcache_debug; - -struct closure_waitlist { - struct llist_head list; -}; - -enum closure_state { - /* - * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by - * the thread that owns the closure, and cleared by the thread that's - * waking up the closure. - * - * The rest are for debugging and don't affect behaviour: - * - * CLOSURE_RUNNING: Set when a closure is running (i.e. by - * closure_init() and when closure_put() runs then next function), and - * must be cleared before remaining hits 0. Primarily to help guard - * against incorrect usage and accidentally transferring references. - * continue_at() and closure_return() clear it for you, if you're doing - * something unusual you can use closure_set_dead() which also helps - * annotate where references are being transferred. - */ - - CLOSURE_BITS_START = (1U << 26), - CLOSURE_DESTRUCTOR = (1U << 26), - CLOSURE_WAITING = (1U << 28), - CLOSURE_RUNNING = (1U << 30), -}; - -#define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) - -#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) -#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) - -struct closure { - union { - struct { - struct workqueue_struct *wq; - struct closure_syncer *s; - struct llist_node list; - closure_fn *fn; - }; - struct work_struct work; - }; - - struct closure *parent; - - atomic_t remaining; - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -#define CLOSURE_MAGIC_DEAD 0xc054dead -#define CLOSURE_MAGIC_ALIVE 0xc054a11e - - unsigned int magic; - struct list_head all; - unsigned long ip; - unsigned long waiting_on; -#endif -}; - -void closure_sub(struct closure *cl, int v); -void closure_put(struct closure *cl); -void __closure_wake_up(struct closure_waitlist *list); -bool closure_wait(struct closure_waitlist *list, struct closure *cl); -void __closure_sync(struct closure *cl); - -/** - * closure_sync - sleep until a closure a closure has nothing left to wait on - * - * Sleeps until the refcount hits 1 - the thread that's running the closure owns - * the last refcount. - */ -static inline void closure_sync(struct closure *cl) -{ - if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) - __closure_sync(cl); -} - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - -void closure_debug_init(void); -void closure_debug_create(struct closure *cl); -void closure_debug_destroy(struct closure *cl); - -#else - -static inline void closure_debug_init(void) {} -static inline void closure_debug_create(struct closure *cl) {} -static inline void closure_debug_destroy(struct closure *cl) {} - -#endif - -static inline void closure_set_ip(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->ip = _THIS_IP_; -#endif -} - -static inline void closure_set_ret_ip(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->ip = _RET_IP_; -#endif -} - -static inline void closure_set_waiting(struct closure *cl, unsigned long f) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->waiting_on = f; -#endif -} - -static inline void closure_set_stopped(struct closure *cl) -{ - atomic_sub(CLOSURE_RUNNING, &cl->remaining); -} - -static inline void set_closure_fn(struct closure *cl, closure_fn *fn, - struct workqueue_struct *wq) -{ - closure_set_ip(cl); - cl->fn = fn; - cl->wq = wq; - /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic(); -} - -static inline void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - /** - * Changes made to closure, work_struct, or a couple of other structs - * may cause work.func not pointing to the right location. - */ - BUILD_BUG_ON(offsetof(struct closure, fn) - != offsetof(struct work_struct, func)); - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} - -/** - * closure_get - increment a closure's refcount - */ -static inline void closure_get(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - BUG_ON((atomic_inc_return(&cl->remaining) & - CLOSURE_REMAINING_MASK) <= 1); -#else - atomic_inc(&cl->remaining); -#endif -} - -/** - * closure_init - Initialize a closure, setting the refcount to 1 - * @cl: closure to initialize - * @parent: parent of the new closure. cl will take a refcount on it for its - * lifetime; may be NULL. - */ -static inline void closure_init(struct closure *cl, struct closure *parent) -{ - memset(cl, 0, sizeof(struct closure)); - cl->parent = parent; - if (parent) - closure_get(parent); - - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); - - closure_debug_create(cl); - closure_set_ip(cl); -} - -static inline void closure_init_stack(struct closure *cl) -{ - memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -} - -/** - * closure_wake_up - wake up all closures on a wait list, - * with memory barrier - */ -static inline void closure_wake_up(struct closure_waitlist *list) -{ - /* Memory barrier for the wait list */ - smp_mb(); - __closure_wake_up(list); -} - -/** - * continue_at - jump to another function with barrier - * - * After @cl is no longer waiting on anything (i.e. all outstanding refs have - * been dropped with closure_put()), it will resume execution at @fn running out - * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). - * - * This is because after calling continue_at() you no longer have a ref on @cl, - * and whatever @cl owns may be freed out from under you - a running closure fn - * has a ref on its own closure which continue_at() drops. - * - * Note you are expected to immediately return after using this macro. - */ -#define continue_at(_cl, _fn, _wq) \ -do { \ - set_closure_fn(_cl, _fn, _wq); \ - closure_sub(_cl, CLOSURE_RUNNING + 1); \ -} while (0) - -/** - * closure_return - finish execution of a closure - * - * This is used to indicate that @cl is finished: when all outstanding refs on - * @cl have been dropped @cl's ref on its parent closure (as passed to - * closure_init()) will be dropped, if one was specified - thus this can be - * thought of as returning to the parent closure. - */ -#define closure_return(_cl) continue_at((_cl), NULL, NULL) - -/** - * continue_at_nobarrier - jump to another function without barrier - * - * Causes @fn to be executed out of @cl, in @wq context (or called directly if - * @wq is NULL). - * - * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, - * thus it's not safe to touch anything protected by @cl after a - * continue_at_nobarrier(). - */ -#define continue_at_nobarrier(_cl, _fn, _wq) \ -do { \ - set_closure_fn(_cl, _fn, _wq); \ - closure_queue(_cl); \ -} while (0) - -/** - * closure_return_with_destructor - finish execution of a closure, - * with destructor - * - * Works like closure_return(), except @destructor will be called when all - * outstanding refs on @cl have been dropped; @destructor may be used to safely - * free the memory occupied by @cl, and it is called with the ref on the parent - * closure still held - so @destructor could safely return an item to a - * freelist protected by @cl's parent. - */ -#define closure_return_with_destructor(_cl, _destructor) \ -do { \ - set_closure_fn(_cl, _destructor, NULL); \ - closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ -} while (0) - -/** - * closure_call - execute @fn out of a new, uninitialized closure - * - * Typically used when running out of one closure, and we want to run @fn - * asynchronously out of a new closure - @parent will then wait for @cl to - * finish. - */ -static inline void closure_call(struct closure *cl, closure_fn fn, - struct workqueue_struct *wq, - struct closure *parent) -{ - closure_init(cl, parent); - continue_at_nobarrier(cl, fn, wq); -} - -#endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 7510d1c983a5..f327456fc4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, - REQ_OP_READ); + bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 020712c5203f..2386d08bf4e4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, - meta_bucket_pages(&c->cache->sb), 0); + bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c182c21de2e8..144693b7c46a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -275,8 +275,7 @@ bsearch: * ja->cur_idx */ ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + ja->last_idx = (i + 1) % ca->sb.njournal_buckets; } @@ -336,16 +335,6 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } -static bool is_discard_enabled(struct cache_set *s) -{ - struct cache *ca = s->cache; - - if (ca->discard) - return true; - - return false; -} - int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; @@ -360,15 +349,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) BUG_ON(i->pin && atomic_read(i->pin) != 1); if (n != i->j.seq) { - if (n == start && is_discard_enabled(s)) - pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - else { - pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - ret = -EIO; - goto err; - } + pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; } for (k = i->j.start; @@ -568,65 +552,6 @@ out: #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -static void journal_discard_endio(struct bio *bio) -{ - struct journal_device *ja = - container_of(bio, struct journal_device, discard_bio); - struct cache *ca = container_of(ja, struct cache, journal); - - atomic_set(&ja->discard_in_flight, DISCARD_DONE); - - closure_wake_up(&ca->set->journal.wait); - closure_put(&ca->set->cl); -} - -static void journal_discard_work(struct work_struct *work) -{ - struct journal_device *ja = - container_of(work, struct journal_device, discard_work); - - submit_bio(&ja->discard_bio); -} - -static void do_journal_discard(struct cache *ca) -{ - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->discard_bio; - - if (!ca->discard) { - ja->discard_idx = ja->last_idx; - return; - } - - switch (atomic_read(&ja->discard_in_flight)) { - case DISCARD_IN_FLIGHT: - return; - - case DISCARD_DONE: - ja->discard_idx = (ja->discard_idx + 1) % - ca->sb.njournal_buckets; - - atomic_set(&ja->discard_in_flight, DISCARD_READY); - fallthrough; - - case DISCARD_READY: - if (ja->discard_idx == ja->last_idx) - return; - - atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); - bio->bi_iter.bi_sector = bucket_to_sector(ca->set, - ca->sb.d[ja->discard_idx]); - bio->bi_iter.bi_size = bucket_bytes(ca); - bio->bi_end_io = journal_discard_endio; - - closure_get(&ca->set->cl); - INIT_WORK(&ja->discard_work, journal_discard_work); - queue_work(bch_journal_wq, &ja->discard_work); - } -} - static unsigned int free_journal_buckets(struct cache_set *c) { struct journal *j = &c->journal; @@ -635,10 +560,10 @@ static unsigned int free_journal_buckets(struct cache_set *c) unsigned int n; /* In case njournal_buckets is not power of 2 */ - if (ja->cur_idx >= ja->discard_idx) - n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; + if (ja->cur_idx >= ja->last_idx) + n = ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx; else - n = ja->discard_idx - ja->cur_idx; + n = ja->last_idx - ja->cur_idx; if (n > (1 + j->do_reserve)) return n - (1 + j->do_reserve); @@ -668,8 +593,6 @@ static void journal_reclaim(struct cache_set *c) ja->last_idx = (ja->last_idx + 1) % ca->sb.njournal_buckets; - do_journal_discard(ca); - if (c->journal.blocks_free) goto out; @@ -723,11 +646,11 @@ static void journal_write_endio(struct bio *bio) closure_put(&w->c->journal.io); } -static void journal_write(struct closure *cl); +static CLOSURE_CALLBACK(journal_write); -static void journal_write_done(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_done) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct journal_write *w = (j->cur == j->w) ? &j->w[1] : &j->w[0]; @@ -736,19 +659,19 @@ static void journal_write_done(struct closure *cl) continue_at_nobarrier(cl, journal_write, bch_journal_wq); } -static void journal_write_unlock(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_unlock) __releases(&c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); c->journal.io_in_flight = 0; spin_unlock(&c->journal.lock); } -static void journal_write_unlocked(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_unlocked) __releases(c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); struct cache *ca = c->cache; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; @@ -823,12 +746,12 @@ static void journal_write_unlocked(struct closure *cl) continue_at(cl, journal_write_done, NULL); } -static void journal_write(struct closure *cl) +static CLOSURE_CALLBACK(journal_write) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); spin_lock(&c->journal.lock); - journal_write_unlocked(cl); + journal_write_unlocked(&cl->work); } static void journal_try_write(struct cache_set *c) diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index cd316b4a1e95..9e9d1b3016a5 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -139,19 +139,6 @@ struct journal_device { /* Last journal bucket that still contains an open journal entry */ unsigned int last_idx; - /* Next journal bucket to be discarded */ - unsigned int discard_idx; - -#define DISCARD_READY 0 -#define DISCARD_IN_FLIGHT 1 -#define DISCARD_DONE 2 - /* 1 - discard in flight, -1 - discard completed */ - atomic_t discard_in_flight; - - struct work_struct discard_work; - struct bio discard_bio; - struct bio_vec discard_bv; - /* Bio for journal reads/writes to this device */ struct bio bio; struct bio_vec bv[8]; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 9f32901fdad1..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -35,16 +35,16 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) /* Moving GC - IO loop */ -static void moving_io_destructor(struct closure *cl) +static CLOSURE_CALLBACK(moving_io_destructor) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); kfree(io); } -static void write_moving_finish(struct closure *cl) +static CLOSURE_CALLBACK(write_moving_finish) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); struct bio *bio = &io->bio.bio; bio_free_pages(bio); @@ -79,19 +79,19 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_private = &io->cl; bch_bio_map(bio, NULL); } -static void write_moving(struct closure *cl) +static CLOSURE_CALLBACK(write_moving) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); struct data_insert_op *op = &io->op; if (!op->status) { @@ -113,9 +113,9 @@ static void write_moving(struct closure *cl) continue_at(cl, write_moving_finish, op->wq); } -static void read_moving_submit(struct closure *cl) +static CLOSURE_CALLBACK(read_moving_submit) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + closure_type(io, struct moving_io, cl); struct bio *bio = &io->bio.bio; bch_submit_bbio(bio, io->op.c, &io->w->key, 0); @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 67a2e29e0b40..af345dc6fde1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,7 +25,7 @@ struct kmem_cache *bch_search_cache; -static void bch_data_insert_start(struct closure *cl); +static CLOSURE_CALLBACK(bch_data_insert_start); static unsigned int cache_mode(struct cached_dev *dc) { @@ -55,9 +55,9 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static void bch_data_insert_keys(struct closure *cl) +static CLOSURE_CALLBACK(bch_data_insert_keys) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); atomic_t *journal_ref = NULL; struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; @@ -136,9 +136,9 @@ out: continue_at(cl, bch_data_insert_keys, op->wq); } -static void bch_data_insert_error(struct closure *cl) +static CLOSURE_CALLBACK(bch_data_insert_error) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); /* * Our data write just errored, which means we've got a bunch of keys to @@ -163,7 +163,7 @@ static void bch_data_insert_error(struct closure *cl) op->insert_keys.top = dst; - bch_data_insert_keys(cl); + bch_data_insert_keys(&cl->work); } static void bch_data_insert_endio(struct bio *bio) @@ -184,9 +184,9 @@ static void bch_data_insert_endio(struct bio *bio) bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache"); } -static void bch_data_insert_start(struct closure *cl) +static CLOSURE_CALLBACK(bch_data_insert_start) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); struct bio *bio = op->bio, *n; if (op->bypass) @@ -305,16 +305,16 @@ err: * If op->bypass is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode. */ -void bch_data_insert(struct closure *cl) +CLOSURE_CALLBACK(bch_data_insert) { - struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + closure_type(op, struct data_insert_op, cl); trace_bcache_write(op->c, op->inode, op->bio, op->writeback, op->bypass); bch_keylist_init(&op->insert_keys); bio_get(op->bio); - bch_data_insert_start(cl); + bch_data_insert_start(&cl->work); } /* @@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct io *i; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio_op(bio) == REQ_OP_DISCARD)) goto skip; + if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) { + /* + * If cached buckets are all clean now, 'true' will be + * returned and all requests will bypass the cache device. + * Then c->sectors_to_gc has no chance to be negative, and + * gc thread won't wake up and caching won't work forever. + * Here call force_wake_up_gc() to avoid such aftermath. + */ + if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN && + c->gc_mark_valid) + force_wake_up_gc(c); + + goto skip; + } + if (mode == CACHE_MODE_NONE || (mode == CACHE_MODE_WRITEAROUND && op_is_write(bio_op(bio)))) @@ -575,9 +589,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) return n == bio ? MAP_DONE : MAP_CONTINUE; } -static void cache_lookup(struct closure *cl) +static CLOSURE_CALLBACK(cache_lookup) { - struct search *s = container_of(cl, struct search, iop.cl); + closure_type(s, struct search, iop.cl); struct bio *bio = &s->bio.bio; struct cached_dev *dc; int ret; @@ -698,9 +712,9 @@ static void do_bio_hook(struct search *s, bio_cnt_set(bio, 3); } -static void search_free(struct closure *cl) +static CLOSURE_CALLBACK(search_free) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); atomic_dec(&s->iop.c->search_inflight); @@ -749,20 +763,20 @@ static inline struct search *search_alloc(struct bio *bio, /* Cached devices */ -static void cached_dev_bio_complete(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_bio_complete) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); cached_dev_put(dc); - search_free(cl); + search_free(&cl->work); } /* Process reads */ -static void cached_dev_read_error_done(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_error_done) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); if (s->iop.replace_collision) bch_mark_cache_miss_collision(s->iop.c, s->d); @@ -770,12 +784,12 @@ static void cached_dev_read_error_done(struct closure *cl) if (s->iop.bio) bio_free_pages(s->iop.bio); - cached_dev_bio_complete(cl); + cached_dev_bio_complete(&cl->work); } -static void cached_dev_read_error(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_error) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct bio *bio = &s->bio.bio; /* @@ -801,9 +815,9 @@ static void cached_dev_read_error(struct closure *cl) continue_at(cl, cached_dev_read_error_done, NULL); } -static void cached_dev_cache_miss_done(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_cache_miss_done) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct bcache_device *d = s->d; if (s->iop.replace_collision) @@ -812,13 +826,13 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->iop.bio) bio_free_pages(s->iop.bio); - cached_dev_bio_complete(cl); + cached_dev_bio_complete(&cl->work); closure_put(&d->cl); } -static void cached_dev_read_done(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_done) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); /* @@ -858,9 +872,9 @@ static void cached_dev_read_done(struct closure *cl) continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void cached_dev_read_done_bh(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_read_done_bh) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s->iop.c, s->d, @@ -955,13 +969,13 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) /* Process writes */ -static void cached_dev_write_complete(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_write_complete) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); up_read_non_owner(&dc->writeback_lock); - cached_dev_bio_complete(cl); + cached_dev_bio_complete(&cl->work); } static void cached_dev_write(struct cached_dev *dc, struct search *s) @@ -1048,9 +1062,9 @@ insert_data: continue_at(cl, cached_dev_write_complete, NULL); } -static void cached_dev_nodata(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_nodata) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); struct bio *bio = &s->bio.bio; if (s->iop.flush_journal) @@ -1228,7 +1242,7 @@ void cached_dev_submit_bio(struct bio *bio) detached_dev_do_request(d, bio, orig_bdev, start_time); } -static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, +static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); @@ -1265,9 +1279,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, return MAP_CONTINUE; } -static void flash_dev_nodata(struct closure *cl) +static CLOSURE_CALLBACK(flash_dev_nodata) { - struct search *s = container_of(cl, struct search, cl); + closure_type(s, struct search, cl); if (s->iop.flush_journal) bch_journal_meta(s->iop.c, cl); @@ -1318,7 +1332,7 @@ void flash_dev_submit_bio(struct bio *bio) continue_at(cl, search_free, NULL); } -static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, +static int flash_dev_ioctl(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 38ab4856eaab..46bbef00aebb 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -34,7 +34,7 @@ struct data_insert_op { }; unsigned int bch_get_congested(const struct cache_set *c); -void bch_data_insert(struct closure *cl); +CLOSURE_CALLBACK(bch_data_insert); void bch_cached_dev_request_init(struct cached_dev *dc); void cached_dev_submit_bio(struct bio *bio); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 68b02216033d..0056106495a7 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -123,7 +123,7 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc) kobject_put(&acc->day.kobj); atomic_set(&acc->closing, 1); - if (del_timer_sync(&acc->timer)) + if (timer_delete_sync(&acc->timer)) closure_return(&acc->cl); } @@ -149,7 +149,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) static void scale_accounting(struct timer_list *t) { - struct cache_accounting *acc = from_timer(acc, t, timer); + struct cache_accounting *acc = timer_container_of(acc, t, timer); #define move_stat(name) do { \ unsigned int t = atomic_xchg(&acc->collector.name, 0); \ diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index bd3afc856d53..21b445f8af15 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -18,7 +18,6 @@ struct cache_stats { unsigned long cache_misses; unsigned long cache_bypass_hits; unsigned long cache_bypass_misses; - unsigned long cache_readaheads; unsigned long cache_miss_collisions; unsigned long sectors_bypassed; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ba3909bb6bea..c17d4517af22 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -168,14 +168,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, { const char *err; struct cache_sb_disk *s; - struct page *page; + struct folio *folio; unsigned int i; - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); - if (IS_ERR(page)) + folio = mapping_read_folio_gfp(bdev->bd_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(folio)) return "IO error"; - s = page_address(page) + offset_in_page(SB_OFFSET); + s = folio_address(folio) + offset_in_folio(folio, SB_OFFSET); sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -272,7 +272,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, *res = s; return NULL; err: - put_page(page); + folio_put(folio); return err; } @@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; bio->bi_iter.bi_sector = SB_SECTOR; - __bio_add_page(bio, virt_to_page(out), SB_SIZE, - offset_in_page(out)); + bio_add_virt_nofail(bio, out, SB_SIZE); out->offset = cpu_to_le64(sb->offset); @@ -327,9 +326,9 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, submit_bio(bio); } -static void bch_write_bdev_super_unlock(struct closure *cl) +static CLOSURE_CALLBACK(bch_write_bdev_super_unlock) { - struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); + closure_type(dc, struct cached_dev, sb_write); up(&dc->sb_write_mutex); } @@ -363,9 +362,9 @@ static void write_super_endio(struct bio *bio) closure_put(&ca->set->sb_write); } -static void bcache_write_super_unlock(struct closure *cl) +static CLOSURE_CALLBACK(bcache_write_super_unlock) { - struct cache_set *c = container_of(cl, struct cache_set, sb_write); + closure_type(c, struct cache_set, sb_write); up(&c->sb_write_mutex); } @@ -407,9 +406,9 @@ static void uuid_endio(struct bio *bio) closure_put(cl); } -static void uuid_io_unlock(struct closure *cl) +static CLOSURE_CALLBACK(uuid_io_unlock) { - struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + closure_type(c, struct cache_set, uuid_write); up(&c->uuid_write_mutex); } @@ -546,7 +545,8 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) static struct uuid_entry *uuid_find_empty(struct cache_set *c) { - static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + static const char zero_uuid[16] __nonstring = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; return uuid_find(c, zero_uuid); } @@ -732,9 +732,9 @@ out: /* Bcache device */ -static int open_dev(struct block_device *b, fmode_t mode) +static int open_dev(struct gendisk *disk, blk_mode_t mode) { - struct bcache_device *d = b->bd_disk->private_data; + struct bcache_device *d = disk->private_data; if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; @@ -743,14 +743,14 @@ static int open_dev(struct block_device *b, fmode_t mode) return 0; } -static void release_dev(struct gendisk *b, fmode_t mode) +static void release_dev(struct gendisk *b) { struct bcache_device *d = b->private_data; closure_put(&d->cl); } -static int ioctl_dev(struct block_device *b, fmode_t mode, +static int ioctl_dev(struct block_device *b, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct bcache_device *d = b->bd_disk->private_data; @@ -881,8 +881,8 @@ static void bcache_device_free(struct bcache_device *d) bcache_device_detach(d); if (disk) { - ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(disk->first_minor)); + ida_free(&bcache_device_idx, + first_minor_to_idx(disk->first_minor)); put_disk(disk); } @@ -897,14 +897,30 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, sector_t sectors, struct block_device *cached_bdev, const struct block_device_operations *ops) { - struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); + struct queue_limits lim = { + .max_hw_sectors = UINT_MAX, + .max_sectors = UINT_MAX, + .max_segment_size = UINT_MAX, + .max_segments = BIO_MAX_VECS, + .max_hw_discard_sectors = UINT_MAX, + .io_min = block_size, + .logical_block_size = block_size, + .physical_block_size = block_size, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, + }; uint64_t n; int idx; + if (cached_bdev) { + d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT; + lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev)); + } if (!d->stripe_size) d->stripe_size = 1 << 31; + else if (d->stripe_size < BCH_MIN_STRIPE_SZ) + d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size); n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); if (!n || n > max_stripes) { @@ -924,8 +940,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!d->full_dirty_stripes) goto out_free_stripe_sectors_dirty; - idx = ida_simple_get(&bcache_device_idx, 0, - BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + idx = ida_alloc_max(&bcache_device_idx, BCACHE_DEVICE_IDX_MAX - 1, + GFP_KERNEL); if (idx < 0) goto out_free_full_dirty_stripes; @@ -933,54 +949,37 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto out_ida_remove; - d->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!d->disk) - goto out_bioset_exit; - - set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); - - d->disk->major = bcache_major; - d->disk->first_minor = idx_to_first_minor(idx); - d->disk->minors = BCACHE_MINORS; - d->disk->fops = ops; - d->disk->private_data = d; - - q = d->disk->queue; - q->limits.max_hw_sectors = UINT_MAX; - q->limits.max_sectors = UINT_MAX; - q->limits.max_segment_size = UINT_MAX; - q->limits.max_segments = BIO_MAX_VECS; - blk_queue_max_discard_sectors(q, UINT_MAX); - q->limits.discard_granularity = 512; - q->limits.io_min = block_size; - q->limits.logical_block_size = block_size; - q->limits.physical_block_size = block_size; - - if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) { + if (lim.logical_block_size > PAGE_SIZE && cached_bdev) { /* * This should only happen with BCACHE_SB_VERSION_BDEV. * Block/page size is checked for BCACHE_SB_VERSION_CDEV. */ - pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", - d->disk->disk_name, q->limits.logical_block_size, + pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", + idx, lim.logical_block_size, PAGE_SIZE, bdev_logical_block_size(cached_bdev)); /* This also adjusts physical block size/min io size if needed */ - blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev)); + lim.logical_block_size = bdev_logical_block_size(cached_bdev); } - blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue); + d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(d->disk)) + goto out_bioset_exit; - blk_queue_write_cache(q, true, true); + set_capacity(d->disk, sectors); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); + d->disk->major = bcache_major; + d->disk->first_minor = idx_to_first_minor(idx); + d->disk->minors = BCACHE_MINORS; + d->disk->fops = ops; + d->disk->private_data = d; return 0; out_bioset_exit: bioset_exit(&d->bio_split); out_ida_remove: - ida_simple_remove(&bcache_device_idx, idx); + ida_free(&bcache_device_idx, idx); out_free_full_dirty_stripes: kvfree(d->full_dirty_stripes); out_free_stripe_sectors_dirty: @@ -1343,9 +1342,9 @@ void bch_cached_dev_release(struct kobject *kobj) module_put(THIS_MODULE); } -static void cached_dev_free(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_free) { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + closure_type(dc, struct cached_dev, disk.cl); if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1367,19 +1366,19 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); if (dc->sb_disk) - put_page(virt_to_page(dc->sb_disk)); + folio_put(virt_to_folio(dc->sb_disk)); - if (!IS_ERR_OR_NULL(dc->bdev)) - blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + if (dc->bdev_file) + fput(dc->bdev_file); wake_up(&unregister_wait); kobject_put(&dc->disk.kobj); } -static void cached_dev_flush(struct closure *cl) +static CLOSURE_CALLBACK(cached_dev_flush) { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + closure_type(dc, struct cached_dev, disk.cl); struct bcache_device *d = &dc->disk; mutex_lock(&bch_register_lock); @@ -1389,7 +1388,7 @@ static void cached_dev_flush(struct closure *cl) bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); - continue_at(cl, cached_dev_free, system_wq); + continue_at(cl, cached_dev_free, system_percpu_wq); } static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) @@ -1401,7 +1400,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) __module_get(THIS_MODULE); INIT_LIST_HEAD(&dc->list); closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_percpu_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); INIT_WORK(&dc->detach, cached_dev_detach_finish); sema_init(&dc->sb_write_mutex, 1); @@ -1416,11 +1415,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - dc->disk.stripe_size = q->limits.io_opt >> 9; - - if (dc->disk.stripe_size) - dc->partial_stripes_expensive = - q->limits.raid_partial_stripes_expensive; + if (bdev_io_opt(dc->bdev)) + dc->partial_stripes_expensive = !!(q->limits.features & + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE); ret = bcache_device_init(&dc->disk, block_size, bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, @@ -1428,9 +1425,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) if (ret) return ret; - blk_queue_io_opt(dc->disk.disk->queue, - max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q))); - atomic_set(&dc->io_errors, 0); dc->io_disable = false; dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; @@ -1445,7 +1439,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, - struct block_device *bdev, + struct file *bdev_file, struct cached_dev *dc) { const char *err = "cannot allocate memory"; @@ -1453,15 +1447,15 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, int ret = -ENOMEM; memcpy(&dc->sb, sb, sizeof(struct cache_sb)); - dc->bdev = bdev; - dc->bdev->bd_holder = dc; + dc->bdev_file = bdev_file; + dc->bdev = file_bdev(bdev_file); dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) goto err; err = "error creating kobject"; - if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache")) + if (kobject_add(&dc->disk.kobj, bdev_kobj(dc->bdev), "bcache")) goto err; if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; @@ -1498,9 +1492,9 @@ void bch_flash_dev_release(struct kobject *kobj) kfree(d); } -static void flash_dev_free(struct closure *cl) +static CLOSURE_CALLBACK(flash_dev_free) { - struct bcache_device *d = container_of(cl, struct bcache_device, cl); + closure_type(d, struct bcache_device, cl); mutex_lock(&bch_register_lock); atomic_long_sub(bcache_dev_sectors_dirty(d), @@ -1511,15 +1505,15 @@ static void flash_dev_free(struct closure *cl) kobject_put(&d->kobj); } -static void flash_dev_flush(struct closure *cl) +static CLOSURE_CALLBACK(flash_dev_flush) { - struct bcache_device *d = container_of(cl, struct bcache_device, cl); + closure_type(d, struct bcache_device, cl); mutex_lock(&bch_register_lock); bcache_device_unlink(d); mutex_unlock(&bch_register_lock); kobject_del(&d->kobj); - continue_at(cl, flash_dev_free, system_wq); + continue_at(cl, flash_dev_free, system_percpu_wq); } static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) @@ -1531,7 +1525,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err_ret; closure_init(&d->cl, NULL); - set_closure_fn(&d->cl, flash_dev_flush, system_wq); + set_closure_fn(&d->cl, flash_dev_flush, system_percpu_wq); kobject_init(&d->kobj, &bch_flash_dev_ktype); @@ -1669,9 +1663,9 @@ void bch_cache_set_release(struct kobject *kobj) module_put(THIS_MODULE); } -static void cache_set_free(struct closure *cl) +static CLOSURE_CALLBACK(cache_set_free) { - struct cache_set *c = container_of(cl, struct cache_set, cl); + closure_type(c, struct cache_set, cl); struct cache *ca; debugfs_remove(c->debug); @@ -1710,9 +1704,9 @@ static void cache_set_free(struct closure *cl) kobject_put(&c->kobj); } -static void cache_set_flush(struct closure *cl) +static CLOSURE_CALLBACK(cache_set_flush) { - struct cache_set *c = container_of(cl, struct cache_set, caching); + closure_type(c, struct cache_set, caching); struct cache *ca = c->cache; struct btree *b; @@ -1739,7 +1733,12 @@ static void cache_set_flush(struct closure *cl) mutex_unlock(&b->write_lock); } - if (ca->alloc_thread) + /* + * If the register_cache_set() call to bch_cache_set_alloc() failed, + * ca has not been assigned a value and return error. + * So we need check ca is not NULL during bch_cache_set_unregister(). + */ + if (ca && ca->alloc_thread) kthread_stop(ca->alloc_thread); if (c->journal.cur) { @@ -1807,9 +1806,9 @@ static void conditional_stop_bcache_device(struct cache_set *c, } } -static void __cache_set_unregister(struct closure *cl) +static CLOSURE_CALLBACK(__cache_set_unregister) { - struct cache_set *c = container_of(cl, struct cache_set, caching); + closure_type(c, struct cache_set, caching); struct cached_dev *dc; struct bcache_device *d; size_t i; @@ -1834,7 +1833,7 @@ static void __cache_set_unregister(struct closure *cl) mutex_unlock(&bch_register_lock); - continue_at(cl, cache_set_flush, system_wq); + continue_at(cl, cache_set_flush, system_percpu_wq); } void bch_cache_set_stop(struct cache_set *c) @@ -1864,10 +1863,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) __module_get(THIS_MODULE); closure_init(&c->cl, NULL); - set_closure_fn(&c->cl, cache_set_free, system_wq); + set_closure_fn(&c->cl, cache_set_free, system_percpu_wq); closure_init(&c->caching, &c->cl); - set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + set_closure_fn(&c->caching, __cache_set_unregister, system_percpu_wq); /* Maybe create continue_at_noreturn() and use it here? */ closure_set_stopped(&c->cl); @@ -1913,8 +1912,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * - sizeof(struct btree_iter_set); + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); if (!c->devices) @@ -1939,7 +1939,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (!c->uuids) goto err; - c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + c->moving_gc_wq = alloc_workqueue("bcache_gc", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!c->moving_gc_wq) goto err; @@ -2017,7 +2018,7 @@ static int run_cache_set(struct cache_set *c) c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; list_del_init(&c->root->list); @@ -2088,7 +2089,7 @@ static int run_cache_set(struct cache_set *c) err = "cannot allocate new btree root"; c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; mutex_lock(&c->root->write_lock); @@ -2216,10 +2217,10 @@ void bch_cache_release(struct kobject *kobj) free_fifo(&ca->free[i]); if (ca->sb_disk) - put_page(virt_to_page(ca->sb_disk)); + folio_put(virt_to_folio(ca->sb_disk)); - if (!IS_ERR_OR_NULL(ca->bdev)) - blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + if (ca->bdev_file) + fput(ca->bdev_file); kfree(ca); module_put(THIS_MODULE); @@ -2236,18 +2237,50 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* - * when ca->sb.njournal_buckets is not zero, journal exists, - * and in bch_journal_replay(), tree node may split, - * so bucket of RESERVE_BTREE type is needed, - * the worst situation is all journal buckets are valid journal, - * and all the keys need to replay, - * so the number of RESERVE_BTREE type buckets should be as much - * as journal buckets + * When the cache disk is first registered, ca->sb.njournal_buckets + * is zero, and it is assigned in run_cache_set(). + * + * When ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split. + * The worst situation is all journal buckets are valid journal, + * and all the keys need to replay, so the number of RESERVE_BTREE + * type buckets should be as much as journal buckets. + * + * If the number of RESERVE_BTREE type buckets is too few, the + * bch_allocator_thread() may hang up and unable to allocate + * bucket. The situation is roughly as follows: + * + * 1. In bch_data_insert_keys(), if the operation is not op->replace, + * it will call the bch_journal(), which increments the journal_ref + * counter. This counter is only decremented after bch_btree_insert + * completes. + * + * 2. When calling bch_btree_insert, if the btree needs to split, + * it will call btree_split() and btree_check_reserve() to check + * whether there are enough reserved buckets in the RESERVE_BTREE + * slot. If not enough, bcache_btree_root() will repeatedly retry. + * + * 3. Normally, the bch_allocator_thread is responsible for filling + * the reservation slots from the free_inc bucket list. When the + * free_inc bucket list is exhausted, the bch_allocator_thread + * will call invalidate_buckets() until free_inc is refilled. + * Then bch_allocator_thread calls bch_prio_write() once. and + * bch_prio_write() will call bch_journal_meta() and waits for + * the journal write to complete. + * + * 4. During journal_write, journal_write_unlocked() is be called. + * If journal full occurs, journal_reclaim() and btree_flush_write() + * will be called sequentially, then retry journal_write. + * + * 5. When 2 and 4 occur together, IO will hung up and cannot recover. + * + * Therefore, reserve more RESERVE_BTREE type buckets. */ - btree_buckets = ca->sb.njournal_buckets ?: 8; + btree_buckets = clamp_t(size_t, ca->sb.nbuckets >> 7, + 32, SB_JOURNAL_BUCKETS); free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; if (!free) { ret = -EPERM; @@ -2339,39 +2372,39 @@ err_free: } static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, - struct block_device *bdev, struct cache *ca) + struct file *bdev_file, + struct cache *ca) { const char *err = NULL; /* must be set for any error case */ int ret = 0; memcpy(&ca->sb, sb, sizeof(struct cache_sb)); - ca->bdev = bdev; - ca->bdev->bd_holder = ca; + ca->bdev_file = bdev_file; + ca->bdev = file_bdev(bdev_file); ca->sb_disk = sb_disk; - if (bdev_max_discard_sectors((bdev))) - ca->discard = CACHE_DISCARD(&ca->sb); - ret = cache_alloc(ca); if (ret != 0) { - /* - * If we failed here, it means ca->kobj is not initialized yet, - * kobject_put() won't be called and there is no chance to - * call blkdev_put() to bdev in bch_cache_release(). So we - * explicitly call blkdev_put() here. - */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; else if (ret == -EPERM) err = "cache_alloc(): cache device is too small"; else err = "cache_alloc(): unknown error"; - goto err; + pr_notice("error %pg: %s\n", file_bdev(bdev_file), err); + /* + * If we failed here, it means ca->kobj is not initialized yet, + * kobject_put() won't be called and there is no chance to + * call fput() to bdev in bch_cache_release(). So + * we explicitly call fput() on the block device here. + */ + fput(bdev_file); + return ret; } - if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) { - err = "error calling kobject_add"; + if (kobject_add(&ca->kobj, bdev_kobj(file_bdev(bdev_file)), "bcache")) { + pr_notice("error %pg: error calling kobject_add\n", + file_bdev(bdev_file)); ret = -ENOMEM; goto out; } @@ -2385,15 +2418,10 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto out; } - pr_info("registered cache device %pg\n", ca->bdev); + pr_info("registered cache device %pg\n", file_bdev(ca->bdev_file)); out: kobject_put(&ca->kobj); - -err: - if (err) - pr_notice("error %pg: %s\n", ca->bdev, err); - return ret; } @@ -2448,7 +2476,8 @@ struct async_reg_args { char *path; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct block_device *bdev; + struct file *bdev_file; + void *holder; }; static void register_bdev_worker(struct work_struct *work) @@ -2456,22 +2485,13 @@ static void register_bdev_worker(struct work_struct *work) int fail = false; struct async_reg_args *args = container_of(work, struct async_reg_args, reg_work.work); - struct cached_dev *dc; - - dc = kzalloc(sizeof(*dc), GFP_KERNEL); - if (!dc) { - fail = true; - put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - goto out; - } mutex_lock(&bch_register_lock); - if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0) + if (register_bdev(args->sb, args->sb_disk, args->bdev_file, + args->holder) < 0) fail = true; mutex_unlock(&bch_register_lock); -out: if (fail) pr_info("error %s: fail to register backing device\n", args->path); @@ -2486,21 +2506,12 @@ static void register_cache_worker(struct work_struct *work) int fail = false; struct async_reg_args *args = container_of(work, struct async_reg_args, reg_work.work); - struct cache *ca; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) { - fail = true; - put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - goto out; - } /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0) + if (register_cache(args->sb, args->sb_disk, args->bdev_file, + args->holder)) fail = true; -out: if (fail) pr_info("error %s: fail to register cache device\n", args->path); @@ -2518,7 +2529,14 @@ static void register_device_async(struct async_reg_args *args) INIT_DELAYED_WORK(&args->reg_work, register_cache_worker); /* 10 jiffies is enough for a delay */ - queue_delayed_work(system_wq, &args->reg_work, 10); + queue_delayed_work(system_percpu_wq, &args->reg_work, 10); +} + +static void *alloc_holder_object(struct cache_sb *sb) +{ + if (SB_IS_BDEV(sb)) + return kzalloc(sizeof(struct cached_dev), GFP_KERNEL); + return kzalloc(sizeof(struct cache), GFP_KERNEL); } static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, @@ -2528,9 +2546,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, char *path = NULL; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct block_device *bdev; + struct file *bdev_file, *bdev_file2; + void *holder = NULL; ssize_t ret; bool async_registration = false; + bool quiet = false; #ifdef CONFIG_BCACHE_ASYNC_REGISTRATION async_registration = true; @@ -2559,11 +2579,30 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev = blkdev_get_by_path(strim(path), - FMODE_READ|FMODE_WRITE|FMODE_EXCL, - sb); - if (IS_ERR(bdev)) { - if (bdev == ERR_PTR(-EBUSY)) { + bdev_file = bdev_file_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(bdev_file)) + goto out_free_sb; + + err = read_super(sb, file_bdev(bdev_file), &sb_disk); + if (err) + goto out_blkdev_put; + + holder = alloc_holder_object(sb); + if (!holder) { + ret = -ENOMEM; + err = "cannot allocate memory"; + goto out_put_sb_folio; + } + + /* Now reopen in exclusive mode with proper holder */ + bdev_file2 = bdev_file_open_by_dev(file_bdev(bdev_file)->bd_dev, + BLK_OPEN_READ | BLK_OPEN_WRITE, holder, NULL); + fput(bdev_file); + bdev_file = bdev_file2; + if (IS_ERR(bdev_file)) { + ret = PTR_ERR(bdev_file); + bdev_file = NULL; + if (ret == -EBUSY) { dev_t dev; mutex_lock(&bch_register_lock); @@ -2573,20 +2612,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); - if (attr == &ksysfs_register_quiet) - goto done; + if (attr == &ksysfs_register_quiet) { + quiet = true; + ret = size; + } } - goto out_free_sb; + goto out_free_holder; } - err = "failed to set blocksize"; - if (set_blocksize(bdev, 4096)) - goto out_blkdev_put; - - err = read_super(sb, bdev, &sb_disk); - if (err) - goto out_blkdev_put; - err = "failed to register device"; if (async_registration) { @@ -2597,59 +2630,46 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!args) { ret = -ENOMEM; err = "cannot allocate memory"; - goto out_put_sb_page; + goto out_free_holder; } args->path = path; args->sb = sb; args->sb_disk = sb_disk; - args->bdev = bdev; + args->bdev_file = bdev_file; + args->holder = holder; register_device_async(args); /* No wait and returns to user space */ goto async_done; } if (SB_IS_BDEV(sb)) { - struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); - - if (!dc) { - ret = -ENOMEM; - err = "cannot allocate memory"; - goto out_put_sb_page; - } - mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_disk, bdev, dc); + ret = register_bdev(sb, sb_disk, bdev_file, holder); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) goto out_free_sb; } else { - struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - - if (!ca) { - ret = -ENOMEM; - err = "cannot allocate memory"; - goto out_put_sb_page; - } - /* blkdev_put() will be called in bch_cache_release() */ - ret = register_cache(sb, sb_disk, bdev, ca); + ret = register_cache(sb, sb_disk, bdev_file, holder); if (ret) goto out_free_sb; } -done: kfree(sb); kfree(path); module_put(THIS_MODULE); async_done: return size; -out_put_sb_page: - put_page(virt_to_page(sb_disk)); +out_free_holder: + kfree(holder); +out_put_sb_folio: + folio_put(virt_to_folio(sb_disk)); out_blkdev_put: - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (bdev_file) + fput(bdev_file); out_free_sb: kfree(sb); out_free_path: @@ -2658,7 +2678,8 @@ out_free_path: out_module_put: module_put(THIS_MODULE); out: - pr_info("error %s: %s\n", path?path:"", err); + if (!quiet) + pr_info("error %s: %s\n", path?path:"", err); return ret; } @@ -2882,24 +2903,25 @@ static int __init bcache_init(void) if (bch_btree_init()) goto err; - bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bcache_wq) goto err; /* * Let's not make this `WQ_MEM_RECLAIM` for the following reasons: * - * 1. It used `system_wq` before which also does no memory reclaim. + * 1. It used `system_percpu_wq` before which also does no memory reclaim. * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and * reduced throughput can be observed. * - * We still want to user our own queue to not congest the `system_wq`. + * We still want to user our own queue to not congest the `system_percpu_wq`. */ - bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); + bch_flush_wq = alloc_workqueue("bch_flush", WQ_PERCPU, 0); if (!bch_flush_wq) goto err; - bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); + bch_journal_wq = alloc_workqueue("bch_journal", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bch_journal_wq) goto err; @@ -2912,7 +2934,6 @@ static int __init bcache_init(void) goto err; bch_debug_init(); - closure_debug_init(); bcache_is_reboot = false; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c6f677059214..72f38e5b6f5c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,7 +134,6 @@ read_attribute(partial_stripes_expensive); rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(io_disable); -rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(errors); @@ -660,7 +659,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; + struct btree_iter_stack iter; goto lock_root; @@ -702,13 +701,7 @@ static unsigned int bch_cache_max_chain(struct cache_set *c) for (h = c->bucket_hash; h < c->bucket_hash + (1 << BUCKET_HASH_BITS); h++) { - unsigned int i = 0; - struct hlist_node *p; - - hlist_for_each(p, h) - i++; - - ret = max(ret, i); + ret = max(ret, hlist_count_nodes(h)); } mutex_unlock(&c->bucket_lock); @@ -866,7 +859,8 @@ STORE(__bch_cache_set) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->shrink.scan_objects(&c->shrink, &sc); + if (c->shrink) + c->shrink->scan_objects(c->shrink, &sc); } sysfs_strtoul_clamp(congested_read_threshold_us, @@ -1041,7 +1035,6 @@ SHOW(__bch_cache) sysfs_hprint(bucket_size, bucket_bytes(ca)); sysfs_hprint(block_size, block_bytes(ca)); sysfs_print(nbuckets, ca->sb.nbuckets); - sysfs_print(discard, ca->discard); sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); sysfs_hprint(btree_written, atomic_long_read(&ca->btree_sectors_written) << 9); @@ -1103,7 +1096,7 @@ SHOW(__bch_cache) sum += INITIAL_PRIO - cached[i]; if (n) - do_div(sum, n); + sum = div64_u64(sum, n); for (i = 0; i < ARRAY_SIZE(q); i++) q[i] = INITIAL_PRIO - cached[n * (i + 1) / @@ -1111,26 +1104,25 @@ SHOW(__bch_cache) vfree(p); - ret = scnprintf(buf, PAGE_SIZE, - "Unused: %zu%%\n" - "Clean: %zu%%\n" - "Dirty: %zu%%\n" - "Metadata: %zu%%\n" - "Average: %llu\n" - "Sectors per Q: %zu\n" - "Quantiles: [", - unused * 100 / (size_t) ca->sb.nbuckets, - available * 100 / (size_t) ca->sb.nbuckets, - dirty * 100 / (size_t) ca->sb.nbuckets, - meta * 100 / (size_t) ca->sb.nbuckets, sum, - n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + ret = sysfs_emit(buf, + "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); for (i = 0; i < ARRAY_SIZE(q); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%u ", q[i]); + ret += sysfs_emit_at(buf, ret, "%u ", q[i]); ret--; - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); + ret += sysfs_emit_at(buf, ret, "]\n"); return ret; } @@ -1148,18 +1140,6 @@ STORE(__bch_cache) if (bcache_is_reboot) return -EBUSY; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - if (bdev_max_discard_sectors(ca->bdev)) - ca->discard = v; - - if (v != CACHE_DISCARD(&ca->sb)) { - SET_CACHE_DISCARD(&ca->sb, v); - bcache_write_super(ca->set); - } - } - if (attr == &sysfs_cache_replacement_policy) { v = __sysfs_match_string(cache_replacement_policies, -1, buf); if (v < 0) @@ -1191,7 +1171,6 @@ static struct attribute *bch_cache_attrs[] = { &sysfs_block_size, &sysfs_nbuckets, &sysfs_priority_stats, - &sysfs_discard, &sysfs_written, &sysfs_btree_written, &sysfs_metadata_written, diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index a2ff6447b699..65b8bd975ab1 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -3,7 +3,7 @@ #define _BCACHE_SYSFS_H_ #define KTYPE(type) \ -struct kobj_type type ## _ktype = { \ +const struct kobj_type type ## _ktype = { \ .release = type ## _release, \ .sysfs_ops = &((const struct sysfs_ops) { \ .show = type ## _show, \ diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index ae380bc3992e..410d8cb49e50 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * random utiility code, for bcache but in theory not specific to bcache + * random utility code, for bcache but in theory not specific to bcache * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 6f3cb7c92130..f61ab1bada6c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,6 +4,7 @@ #define _BCACHE_UTIL_H #include <linux/blkdev.h> +#include <linux/closure.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/clock.h> @@ -13,8 +14,6 @@ #include <linux/workqueue.h> #include <linux/crc64.h> -#include "closure.h" - struct closure; #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d4a5fc0650bb..4b237074f453 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -331,26 +331,26 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_private = w; bch_bio_map(bio, NULL); } -static void dirty_io_destructor(struct closure *cl) +static CLOSURE_CALLBACK(dirty_io_destructor) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); kfree(io); } -static void write_dirty_finish(struct closure *cl) +static CLOSURE_CALLBACK(write_dirty_finish) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; @@ -400,9 +400,9 @@ static void dirty_endio(struct bio *bio) closure_put(&io->cl); } -static void write_dirty(struct closure *cl) +static CLOSURE_CALLBACK(write_dirty) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; @@ -462,9 +462,9 @@ static void read_dirty_endio(struct bio *bio) dirty_endio(bio); } -static void read_dirty_submit(struct closure *cl) +static CLOSURE_CALLBACK(read_dirty_submit) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); closure_bio_submit(io->dc->disk.c, &io->bio, cl); @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; @@ -805,8 +805,7 @@ static int bch_writeback_thread(void *arg) * may set BCH_ENABLE_AUTO_GC via sysfs, then when * BCH_DO_AUTO_GC is set, garbage collection thread * will be wake up here. After moving gc, the shrunk - * btree and discarded free buckets SSD space may be - * helpful for following write requests. + * btree may be helpful for following write requests. */ if (c->gc_after_writeback == (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { @@ -890,6 +889,16 @@ static int bch_root_node_dirty_init(struct cache_set *c, if (ret < 0) pr_warn("sectors dirty init failed, ret=%d!\n", ret); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op.op)->wait); + return ret; } @@ -898,15 +907,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; - cur_idx = prev_idx = 0; + prev_idx = 0; - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -920,7 +929,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -967,24 +976,35 @@ static int bch_btre_dirty_init_thread_nr(void) void bch_sectors_dirty_init(struct bcache_device *d) { int i; + struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; +retry_lock: + b = c->root; + rw_lock(0, b, b->level); + if (b != c->root) { + rw_unlock(0, b); + goto retry_lock; + } + /* Just count root keys if no leaf node */ - rw_lock(0, c->root, c->root->level); if (c->root->level == 0) { bch_btree_op_init(&op.op, -1); op.inode = d->id; op.count = 0; for_each_key_filter(&c->root->keys, - k, &iter, bch_ptr_invalid) + k, &iter, bch_ptr_invalid) { + if (KEY_INODE(k) != op.inode) + continue; sectors_dirty_init_fn(&op.op, c->root, k); + } - rw_unlock(0, c->root); + rw_unlock(0, b); return; } @@ -1004,23 +1024,24 @@ void bch_sectors_dirty_init(struct bcache_device *d) if (atomic_read(&state.enough)) break; + atomic_inc(&state.started); state.infos[i].state = &state; state.infos[i].thread = kthread_run(bch_dirty_init_thread, &state.infos[i], "bch_dirtcnt[%d]", i); if (IS_ERR(state.infos[i].thread)) { pr_err("fails to run thread bch_dirty_init[%d]\n", i); + atomic_dec(&state.started); for (--i; i >= 0; i--) kthread_stop(state.infos[i].thread); goto out; } - atomic_inc(&state.started); } out: /* Must wait for all threads to stop. */ wait_event(state.wait, atomic_read(&state.started) == 0); - rw_unlock(0, c->root); + rw_unlock(0, b); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -1054,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!dc->writeback_write_wq) return -ENOMEM; |
