From 6b40bec3b13278d21fa6c1ae7a0bdf2e550eed5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 11 Feb 2020 11:10:04 +0100 Subject: md: check arrays is suspended in mddev_detach before call quiesce operations Don't call quiesce(1) and quiesce(0) if array is already suspended, otherwise in level_store, the array is writable after mddev_detach in below part though the intention is to make array writable after resume. mddev_suspend(mddev); mddev_detach(mddev); ... mddev_resume(mddev); And it also causes calltrace as follows in [1]. [48005.653834] WARNING: CPU: 1 PID: 45380 at kernel/kthread.c:510 kthread_park+0x77/0x90 [...] [48005.653976] CPU: 1 PID: 45380 Comm: mdadm Tainted: G OE 5.4.10-arch1-1 #1 [48005.653979] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./J4105-ITX, BIOS P1.40 08/06/2018 [48005.653984] RIP: 0010:kthread_park+0x77/0x90 [48005.654015] Call Trace: [48005.654039] r5l_quiesce+0x3c/0x70 [raid456] [48005.654052] raid5_quiesce+0x228/0x2e0 [raid456] [48005.654073] mddev_detach+0x30/0x70 [md_mod] [48005.654090] level_store+0x202/0x670 [md_mod] [48005.654099] ? security_capable+0x40/0x60 [48005.654114] md_attr_store+0x7b/0xc0 [md_mod] [48005.654123] kernfs_fop_write+0xce/0x1b0 [48005.654132] vfs_write+0xb6/0x1a0 [48005.654138] ksys_write+0x67/0xe0 [48005.654146] do_syscall_64+0x4e/0x140 [48005.654155] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [48005.654161] RIP: 0033:0x7fa0c8737497 [1]: https://bugzilla.kernel.org/show_bug.cgi?id=206161 Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 469f551863be..0b30ada971c1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6184,7 +6184,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); - if (mddev->pers && mddev->pers->quiesce) { + if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } -- cgit From 253a99d95d5b30377b0193f1f1294f9068849c0b Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 22 Mar 2020 14:02:59 +0800 Subject: bcache: move macro btree() and btree_root() into btree.h In order to accelerate bcache registration speed, the macro btree() and btree_root() will be referenced out of btree.c. This patch moves them from btree.c into btree.h with other relative function declaration in btree.h, for the following changes. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 60 +----------------------------------------- drivers/md/bcache/btree.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 59 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fa872df4e770..99cb201809af 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -101,64 +101,6 @@ #define insert_lock(s, b) ((b)->level <= (s)->lock) -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ - -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ - _w, b); \ - if (!IS_ERR(_child)) { \ - _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ - rw_unlock(_w, _child); \ - } else \ - _r = PTR_ERR(_child); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - } \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c); \ - if (_r == -EINTR) \ - schedule(); \ - } while (_r == -EINTR); \ - \ - finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ - _r; \ -}) static inline struct bset *write_block(struct btree *b) { @@ -2422,7 +2364,7 @@ int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, return btree_root(map_nodes_recurse, c, op, from, fn, flags); } -static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *from, btree_map_keys_fn *fn, int flags) { diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index f4dcca449391..f37153db3f6c 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -260,6 +260,13 @@ void bch_initial_gc_finish(struct cache_set *c); void bch_moving_gc(struct cache_set *c); int bch_btree_check(struct cache_set *c); void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); +typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, + struct bkey *k); +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags); +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags); static inline void wake_up_gc(struct cache_set *c) { @@ -284,6 +291,65 @@ static inline void force_wake_up_gc(struct cache_set *c) wake_up_gc(c); } +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ + if (!IS_ERR(_child)) { \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + #define MAP_DONE 0 #define MAP_CONTINUE 1 -- cgit From feac1a70b806373d076a95b739c4feeceb21e814 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 22 Mar 2020 14:03:00 +0800 Subject: bcache: add bcache_ prefix to btree_root() and btree() macros This patch changes macro btree_root() and btree() to bcache_btree_root() and bcache_btree(), to avoid potential generic name clash in future. NOTE: for product kernel maintainers, this patch can be skipped if you feel the rename stuffs introduce inconvenince to patch backport. Suggested-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 15 ++++++++------- drivers/md/bcache/btree.h | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 99cb201809af..faf152524a16 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1790,7 +1790,7 @@ static void bch_btree_gc(struct cache_set *c) /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ do { - ret = btree_root(gc_root, c, &op, &writes, &stats); + ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); cond_resched(); @@ -1888,7 +1888,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) } if (p) - ret = btree(check_recurse, p, b, op); + ret = bcache_btree(check_recurse, p, b, op); p = k; } while (p && !ret); @@ -1903,7 +1903,7 @@ int bch_btree_check(struct cache_set *c) bch_btree_op_init(&op, SHRT_MAX); - return btree_root(check_recurse, c, &op); + return bcache_btree_root(check_recurse, c, &op); } void bch_initial_gc_finish(struct cache_set *c) @@ -2343,7 +2343,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { - ret = btree(map_nodes_recurse, k, b, + ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); from = NULL; @@ -2361,7 +2361,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn, int flags) { - return btree_root(map_nodes_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); } int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, @@ -2377,7 +2377,8 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) - : btree(map_keys_recurse, k, b, op, from, fn, flags); + : bcache_btree(map_keys_recurse, k, + b, op, from, fn, flags); from = NULL; if (ret != MAP_CONTINUE) @@ -2394,7 +2395,7 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags) { - return btree_root(map_keys_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); } /* Keybuf code */ diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index f37153db3f6c..19e30266070a 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -309,7 +309,7 @@ static inline void force_wake_up_gc(struct cache_set *c) * @b: parent btree node * @op: pointer to struct btree_op */ -#define btree(fn, key, b, op, ...) \ +#define bcache_btree(fn, key, b, op, ...) \ ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ @@ -329,7 +329,7 @@ static inline void force_wake_up_gc(struct cache_set *c) * @c: cache set * @op: pointer to struct btree_op */ -#define btree_root(fn, c, op, ...) \ +#define bcache_btree_root(fn, c, op, ...) \ ({ \ int _r = -EINTR; \ do { \ -- cgit From 8e7102273f597dbb38af43da874f8c123f8e6dbe Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 22 Mar 2020 14:03:01 +0800 Subject: bcache: make bch_btree_check() to be multithreaded When registering a cache device, bch_btree_check() is called to check all btree nodes, to make sure the btree is consistent and not corrupted. bch_btree_check() is recursively executed in a single thread, when there are a lot of data cached and the btree is huge, it may take very long time to check all the btree nodes. In my testing, I observed it took around 50 minutes to finish bch_btree_check(). When checking the bcache btree nodes, the cache set is not running yet, and indeed the whole tree is in read-only state, it is safe to create multiple threads to check the btree in parallel. This patch tries to create multiple threads, and each thread tries to one-by-one check the sub-tree indexed by a key from the btree root node. The parallel thread number depends on how many keys in the btree root node. At most BCH_BTR_CHKTHREAD_MAX (64) threads can be created, but in practice is should be min(cpu-number/2, root-node-keys-number). Signed-off-by: Coly Li Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 169 +++++++++++++++++++++++++++++++++++++++++++++- drivers/md/bcache/btree.h | 22 ++++++ 2 files changed, 188 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index faf152524a16..74d66b641169 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1897,13 +1897,176 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) return ret; } + +static int bch_btree_check_thread(void *arg) +{ + int ret; + struct btree_check_info *info = arg; + struct btree_check_state *check_state = info->state; + struct cache_set *c = check_state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + int i, n; + + k = p = NULL; + i = n = 0; + cur_idx = prev_idx = 0; + ret = 0; + + /* root node keys are checked before thread created */ + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + while (k) { + /* + * Fetch a root node key index, skip the keys which + * should be fetched by other threads, then check the + * sub-tree indexed by the fetched key. + */ + spin_lock(&check_state->idx_lock); + cur_idx = check_state->key_idx; + check_state->key_idx++; + spin_unlock(&check_state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + /* + * No more keys to check in root node, + * current checking threads are enough, + * stop creating more. + */ + atomic_set(&check_state->enough, 1); + /* Update check_state->enough earlier */ + smp_mb(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + struct btree_op op; + + btree_node_prefetch(c->root, p); + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = bcache_btree(check_recurse, p, c->root, &op); + if (ret) + goto out; + } + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + info->result = ret; + /* update check_state->started among all CPUs */ + smp_mb(); + if (atomic_dec_and_test(&check_state->started)) + wake_up(&check_state->wait); + + return ret; +} + + + +static int bch_btree_chkthread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_BTR_CHKTHREAD_MAX) + n = BCH_BTR_CHKTHREAD_MAX; + + return n; +} + int bch_btree_check(struct cache_set *c) { - struct btree_op op; + int ret = 0; + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct btree_check_state *check_state; + char name[32]; - bch_btree_op_init(&op, SHRT_MAX); + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(c, c->root->level, k); + + bch_initial_mark_key(c, c->root->level + 1, &c->root->key); + + if (c->root->level == 0) + return 0; + + check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); + if (!check_state) + return -ENOMEM; + + check_state->c = c; + check_state->total_threads = bch_btree_chkthread_nr(); + check_state->key_idx = 0; + spin_lock_init(&check_state->idx_lock); + atomic_set(&check_state->started, 0); + atomic_set(&check_state->enough, 0); + init_waitqueue_head(&check_state->wait); - return bcache_btree_root(check_recurse, c, &op); + /* + * Run multiple threads to check btree nodes in parallel, + * if check_state->enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. + */ + for (i = 0; i < check_state->total_threads; i++) { + /* fetch latest check_state->enough earlier */ + smp_mb(); + if (atomic_read(&check_state->enough)) + break; + + check_state->infos[i].result = 0; + check_state->infos[i].state = check_state; + snprintf(name, sizeof(name), "bch_btrchk[%u]", i); + atomic_inc(&check_state->started); + + check_state->infos[i].thread = + kthread_run(bch_btree_check_thread, + &check_state->infos[i], + name); + if (IS_ERR(check_state->infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]", i); + for (--i; i >= 0; i--) + kthread_stop(check_state->infos[i].thread); + ret = -ENOMEM; + goto out; + } + } + + wait_event_interruptible(check_state->wait, + atomic_read(&check_state->started) == 0 || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + for (i = 0; i < check_state->total_threads; i++) { + if (check_state->infos[i].result) { + ret = check_state->infos[i].result; + goto out; + } + } + +out: + kfree(check_state); + return ret; } void bch_initial_gc_finish(struct cache_set *c) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 19e30266070a..7c884f278da8 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -145,6 +145,9 @@ struct btree { struct bio *bio; }; + + + #define BTREE_FLAG(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ @@ -216,6 +219,25 @@ struct btree_op { unsigned int insert_collision:1; }; +struct btree_check_state; +struct btree_check_info { + struct btree_check_state *state; + struct task_struct *thread; + int result; +}; + +#define BCH_BTR_CHKTHREAD_MAX 64 +struct btree_check_state { + struct cache_set *c; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX]; +}; + static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) { memset(op, 0, sizeof(struct btree_op)); -- cgit From b144e45fc57649e15cbc79ff2d32a942af1d91d5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 22 Mar 2020 14:03:02 +0800 Subject: bcache: make bch_sectors_dirty_init() to be multithreaded When attaching a cached device (a.k.a backing device) to a cache device, bch_sectors_dirty_init() is called to count dirty sectors and stripes (see what bcache_dev_sectors_dirty_add() does) on the cache device. The counting is done by a single thread recursive function bch_btree_map_keys() to iterate all the bcache btree nodes. If the btree has huge number of nodes, bch_sectors_dirty_init() will take quite long time. In my testing, if the registering cache set has a existed UUID which matches a already registered cached device, the automatical attachment during the registration may take more than 55 minutes. This is too long for waiting the bcache to work in real deployment. Fortunately when bch_sectors_dirty_init() is called, no other thread will access the btree yet, it is safe to do a read-only parallelized dirty sectors counting by multiple threads. This patch tries to create multiple threads, and each thread tries to one-by-one count dirty sectors from the sub-tree indexed by a root node key which the thread fetched. After the sub-tree is counted, the counting thread will continue to fetch another root node key, until the fetched key is NULL. How many threads in parallel depends on the number of keys from the btree root node, and the number of online CPU core. The thread number will be the less number but no more than BCH_DIRTY_INIT_THRD_MAX. If there are only 2 keys in root node, it can only be 2x times faster by this patch. But if there are 10 keys in the root node, with this patch it can be 10x times faster. Signed-off-by: Coly Li Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 158 +++++++++++++++++++++++++++++++++++++++++- drivers/md/bcache/writeback.h | 19 +++++ 2 files changed, 174 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4a40f9eadeaf..6673a37c8bd2 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -785,7 +785,9 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct bcache_device *d) +static int bch_root_node_dirty_init(struct cache_set *c, + struct bcache_device *d, + struct bkey *k) { struct sectors_dirty_init op; int ret; @@ -796,8 +798,13 @@ void bch_sectors_dirty_init(struct bcache_device *d) op.start = KEY(op.inode, 0, 0); do { - ret = bch_btree_map_keys(&op.op, d->c, &op.start, - sectors_dirty_init_fn, 0); + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &op.start, + sectors_dirty_init_fn, + 0); if (ret == -EAGAIN) schedule_timeout_interruptible( msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); @@ -806,6 +813,151 @@ void bch_sectors_dirty_init(struct bcache_device *d) break; } } while (ret == -EAGAIN); + + return ret; +} + +static int bch_dirty_init_thread(void *arg) +{ + struct dirty_init_thrd_info *info = arg; + struct bch_dirty_init_state *state = info->state; + struct cache_set *c = state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + int i; + + k = p = NULL; + i = 0; + cur_idx = prev_idx = 0; + + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + + while (k) { + spin_lock(&state->idx_lock); + cur_idx = state->key_idx; + state->key_idx++; + spin_unlock(&state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + atomic_set(&state->enough, 1); + /* Update state->enough earlier */ + smp_mb(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + if (bch_root_node_dirty_init(c, state->d, p) < 0) + goto out; + } + + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + /* In order to wake up state->wait in time */ + smp_mb(); + if (atomic_dec_and_test(&state->started)) + wake_up(&state->wait); + + return 0; +} + +static int bch_btre_dirty_init_thread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_DIRTY_INIT_THRD_MAX) + n = BCH_DIRTY_INIT_THRD_MAX; + + return n; +} + +void bch_sectors_dirty_init(struct bcache_device *d) +{ + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; + struct bch_dirty_init_state *state; + char name[32]; + + /* Just count root keys if no leaf node */ + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + op.start = KEY(op.inode, 0, 0); + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); + return; + } + + state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL); + if (!state) { + pr_warn("sectors dirty init failed: cannot allocate memory"); + return; + } + + state->c = c; + state->d = d; + state->total_threads = bch_btre_dirty_init_thread_nr(); + state->key_idx = 0; + spin_lock_init(&state->idx_lock); + atomic_set(&state->started, 0); + atomic_set(&state->enough, 0); + init_waitqueue_head(&state->wait); + + for (i = 0; i < state->total_threads; i++) { + /* Fetch latest state->enough earlier */ + smp_mb(); + if (atomic_read(&state->enough)) + break; + + state->infos[i].state = state; + atomic_inc(&state->started); + snprintf(name, sizeof(name), "bch_dirty_init[%d]", i); + + state->infos[i].thread = + kthread_run(bch_dirty_init_thread, + &state->infos[i], + name); + if (IS_ERR(state->infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]", i); + for (--i; i >= 0; i--) + kthread_stop(state->infos[i].thread); + goto out; + } + } + + wait_event_interruptible(state->wait, + atomic_read(&state->started) == 0 || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + +out: + kfree(state); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 4e4c6810dc3c..b029843ce5b6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -16,6 +16,7 @@ #define BCH_AUTO_GC_DIRTY_THRESHOLD 50 +#define BCH_DIRTY_INIT_THRD_MAX 64 /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up @@ -23,6 +24,24 @@ */ #define WRITEBACK_SHARE_SHIFT 14 +struct bch_dirty_init_state; +struct dirty_init_thrd_info { + struct bch_dirty_init_state *state; + struct task_struct *thread; +}; + +struct bch_dirty_init_state { + struct cache_set *c; + struct bcache_device *d; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct dirty_init_thrd_info infos[BCH_DIRTY_INIT_THRD_MAX]; +}; + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; -- cgit From 9876e38609a8ea98bbb447eb5a8f1c0400a6ccb8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 22 Mar 2020 14:03:03 +0800 Subject: bcache: Use scnprintf() for avoiding potential buffer overflow Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. Fix it by replacing with scnprintf(). Signed-off-by: Takashi Iwai Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 3470fae4eabc..323276994aab 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -154,7 +154,7 @@ static ssize_t bch_snprint_string_list(char *buf, size_t i; for (i = 0; list[i]; i++) - out += snprintf(out, buf + size - out, + out += scnprintf(out, buf + size - out, i == selected ? "[%s] " : "%s ", list[i]); out[-1] = '\n'; -- cgit From b004aa867c48b3232835b61ed9d44b572e29498e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 22 Mar 2020 14:03:04 +0800 Subject: bcache: optimize barrier usage for Rmw atomic bitops We can avoid the unnecessary barrier on non LL/SC architectures, such as x86. Instead, use the smp_mb__after_atomic(). Signed-off-by: Davidlohr Bueso Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6673a37c8bd2..72ba6d015786 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -183,7 +183,7 @@ static void update_writeback_rate(struct work_struct *work) */ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -193,7 +193,7 @@ static void update_writeback_rate(struct work_struct *work) test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); return; } @@ -229,7 +229,7 @@ static void update_writeback_rate(struct work_struct *work) */ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); } static unsigned int writeback_delay(struct cached_dev *dc, -- cgit From eb9b6666d6ca6f3d9f218fa23ec6135eee1ac3a7 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 22 Mar 2020 14:03:05 +0800 Subject: bcache: optimize barrier usage for atomic operations The idea of this patch is from Davidlohr Bueso, he posts a patch for bcache to optimize barrier usage for read-modify-write atomic bitops. Indeed such optimization can also apply on other locations where smp_mb() is used before or after an atomic operation. This patch replaces smp_mb() with smp_mb__before_atomic() or smp_mb__after_atomic() in btree.c and writeback.c, where it is used to synchronize memory cache just earlier on other cores. Although the locations are not on hot code path, it is always not bad to mkae things a little better. Signed-off-by: Coly Li Cc: Davidlohr Bueso Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 6 +++--- drivers/md/bcache/writeback.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 74d66b641169..72856e5f23a3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1947,7 +1947,7 @@ static int bch_btree_check_thread(void *arg) */ atomic_set(&check_state->enough, 1); /* Update check_state->enough earlier */ - smp_mb(); + smp_mb__after_atomic(); goto out; } skip_nr--; @@ -1972,7 +1972,7 @@ static int bch_btree_check_thread(void *arg) out: info->result = ret; /* update check_state->started among all CPUs */ - smp_mb(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&check_state->started)) wake_up(&check_state->wait); @@ -2031,7 +2031,7 @@ int bch_btree_check(struct cache_set *c) */ for (i = 0; i < check_state->total_threads; i++) { /* fetch latest check_state->enough earlier */ - smp_mb(); + smp_mb__before_atomic(); if (atomic_read(&check_state->enough)) break; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 72ba6d015786..3f7641fb28d5 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -854,7 +854,7 @@ static int bch_dirty_init_thread(void *arg) else { atomic_set(&state->enough, 1); /* Update state->enough earlier */ - smp_mb(); + smp_mb__after_atomic(); goto out; } skip_nr--; @@ -873,7 +873,7 @@ static int bch_dirty_init_thread(void *arg) out: /* In order to wake up state->wait in time */ - smp_mb(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&state->started)) wake_up(&state->wait); @@ -932,7 +932,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) for (i = 0; i < state->total_threads; i++) { /* Fetch latest state->enough earlier */ - smp_mb(); + smp_mb__before_atomic(); if (atomic_read(&state->enough)) break; -- cgit From 5ae3a2c03d1f5b33f53ce2ba2e57773fc8b35128 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 25 Mar 2020 09:30:57 +0800 Subject: bcache: remove dupplicated declaration from btree.h Commit 253a99d95d5b ("bcache: move macro btree() and btree_root() into btree.h") makes two duplicated declaration into btree.h, typedef int (btree_map_keys_fn)(); int bch_btree_map_keys(); The kbuild test robot detects and reports this problem and this patch fixes it by removing the duplicated ones. Fixes: 253a99d95d5b ("bcache: move macro btree() and btree_root() into btree.h") Reported-by: kbuild test robot Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 7c884f278da8..257969980c49 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -282,13 +282,6 @@ void bch_initial_gc_finish(struct cache_set *c); void bch_moving_gc(struct cache_set *c); int bch_btree_check(struct cache_set *c); void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); -typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, - struct bkey *k); -int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, - struct bkey *from, btree_map_keys_fn *fn, - int flags); -int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, - struct bkey *from, btree_map_keys_fn *fn, int flags); static inline void wake_up_gc(struct cache_set *c) { @@ -402,6 +395,9 @@ typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, struct bkey *k); int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags); +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags); typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k); -- cgit