summaryrefslogtreecommitdiff
path: root/drivers/md/bcache
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--drivers/md/bcache/Kconfig10
-rw-r--r--drivers/md/bcache/Makefile4
-rw-r--r--drivers/md/bcache/alloc.c46
-rw-r--r--drivers/md/bcache/bcache.h26
-rw-r--r--drivers/md/bcache/bcache_ondisk.h8
-rw-r--r--drivers/md/bcache/bset.c44
-rw-r--r--drivers/md/bcache/bset.h32
-rw-r--r--drivers/md/bcache/btree.c195
-rw-r--r--drivers/md/bcache/btree.h5
-rw-r--r--drivers/md/bcache/closure.c207
-rw-r--r--drivers/md/bcache/closure.h378
-rw-r--r--drivers/md/bcache/debug.c3
-rw-r--r--drivers/md/bcache/io.c3
-rw-r--r--drivers/md/bcache/journal.c113
-rw-r--r--drivers/md/bcache/journal.h13
-rw-r--r--drivers/md/bcache/movinggc.c26
-rw-r--r--drivers/md/bcache/request.c94
-rw-r--r--drivers/md/bcache/request.h2
-rw-r--r--drivers/md/bcache/stats.c4
-rw-r--r--drivers/md/bcache/stats.h1
-rw-r--r--drivers/md/bcache/super.c429
-rw-r--r--drivers/md/bcache/sysfs.c61
-rw-r--r--drivers/md/bcache/sysfs.h2
-rw-r--r--drivers/md/bcache/util.c2
-rw-r--r--drivers/md/bcache/util.h3
-rw-r--r--drivers/md/bcache/writeback.c75
26 files changed, 592 insertions, 1194 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 529c9d04e9a4..b2d10063d35f 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -4,6 +4,7 @@ config BCACHE
tristate "Block device as cache"
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
+ select CLOSURES
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
@@ -19,15 +20,6 @@ config BCACHE_DEBUG
Enables extra debugging tools, allows expensive runtime checks to be
turned on.
-config BCACHE_CLOSURES_DEBUG
- bool "Debug closures"
- depends on BCACHE
- select DEBUG_FS
- help
- Keeps all active closures in a linked list and provides a debugfs
- interface to list them, which makes it possible to see asynchronous
- operations that get stuck.
-
config BCACHE_ASYNC_REGISTRATION
bool "Asynchronous device registration"
depends on BCACHE
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..054e8a33a7ab 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -2,6 +2,6 @@
obj-$(CONFIG_BCACHE) += bcache.o
-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\
+ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
util.o writeback.o features.o
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ce13c272c387..7708d92df23e 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -24,21 +24,18 @@
* Since the gens and priorities are all stored contiguously on disk, we can
* batch this up: We fill up the free_inc list with freshly invalidated buckets,
* call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
+ * free_inc list.
*
* free_inc isn't the only freelist - if it was, we'd often to sleep while
* priorities and gens were being written before we could allocate. c->free is a
* smaller freelist, and buckets on that list are always ready to be used.
*
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
* There is another freelist, because sometimes we have buckets that we know
* have nothing pointing into them - these we can reuse without waiting for
* priorities to be rewritten. These come from freed btree nodes and buckets
* that garbage collection discovered no longer had valid keys pointing into
* them (because they were overwritten). That's the unused list - buckets on the
- * unused list move to the free list, optionally being discarded in the process.
+ * unused list move to the free list.
*
* It's also important to ensure that gens don't wrap around - with respect to
* either the oldest gen in the btree or the gen on disk. This is quite
@@ -118,8 +115,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
/*
* Background allocation thread: scans for buckets to be invalidated,
* invalidates them, rewrites prios/gens (marking them as invalidated on disk),
- * then optionally issues discard commands to the newly free buckets, then puts
- * them on the various freelists.
+ * then puts them on the various freelists.
*/
static inline bool can_inc_bucket_gen(struct bucket *b)
@@ -129,12 +125,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
- BUG_ON(!ca->set->gc_mark_valid);
-
- return (!GC_MARK(b) ||
- GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
- !atomic_read(&b->pin) &&
- can_inc_bucket_gen(b);
+ return (ca->set->gc_mark_valid || b->reclaimable_in_gc) &&
+ ((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
+ !atomic_read(&b->pin) && can_inc_bucket_gen(b));
}
void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -148,6 +141,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
bch_inc_gen(ca, b);
b->prio = INITIAL_PRIO;
atomic_inc(&b->pin);
+ b->reclaimable_in_gc = 0;
}
static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -323,8 +317,7 @@ static int bch_allocator_thread(void *arg)
while (1) {
/*
* First, we pull buckets off of the unused and free_inc lists,
- * possibly issue discards to them, then we add the bucket to
- * the free list:
+ * then we add the bucket to the free list:
*/
while (1) {
long bucket;
@@ -332,14 +325,6 @@ static int bch_allocator_thread(void *arg)
if (!fifo_pop(&ca->free_inc, bucket))
break;
- if (ca->discard) {
- mutex_unlock(&ca->set->bucket_lock);
- blkdev_issue_discard(ca->bdev,
- bucket_to_sector(ca->set, bucket),
- ca->sb.bucket_size, GFP_KERNEL);
- mutex_lock(&ca->set->bucket_lock);
- }
-
allocator_wait(ca, bch_allocator_push(ca, bucket));
wake_up(&ca->set->btree_cache_wait);
wake_up(&ca->set->bucket_wait);
@@ -352,8 +337,7 @@ static int bch_allocator_thread(void *arg)
*/
retry_invalidate:
- allocator_wait(ca, ca->set->gc_mark_valid &&
- !ca->invalidate_needs_gc);
+ allocator_wait(ca, !ca->invalidate_needs_gc);
invalidate_buckets(ca);
/*
@@ -415,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
TASK_UNINTERRUPTIBLE);
mutex_unlock(&ca->set->bucket_lock);
+
+ atomic_inc(&ca->set->bucket_wait_cnt);
schedule();
+ atomic_dec(&ca->set->bucket_wait_cnt);
+
mutex_lock(&ca->set->bucket_lock);
} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
!fifo_pop(&ca->free[reserve], r));
@@ -501,8 +489,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
ca = c->cache;
b = bch_bucket_alloc(ca, reserve, wait);
- if (b == -1)
- goto err;
+ if (b < 0)
+ return -1;
k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
bucket_to_sector(c, b),
@@ -511,10 +499,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
SET_KEY_PTRS(k, 1);
return 0;
-err:
- bch_bucket_free(c, k);
- bkey_put(c, k);
- return -1;
}
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index aebb7ef10e63..8ccacba85547 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -179,6 +179,7 @@
#define pr_fmt(fmt) "bcache: %s() " fmt, __func__
#include <linux/bio.h>
+#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -192,7 +193,6 @@
#include "bcache_ondisk.h"
#include "bset.h"
#include "util.h"
-#include "closure.h"
struct bucket {
atomic_t pin;
@@ -200,6 +200,7 @@ struct bucket {
uint8_t gen;
uint8_t last_gc; /* Most out of date gen in the btree */
uint16_t gc_mark; /* Bitfield used by GC. See below for field */
+ uint16_t reclaimable_in_gc:1;
};
/*
@@ -265,6 +266,7 @@ struct bcache_device {
#define BCACHE_DEV_WB_RUNNING 3
#define BCACHE_DEV_RATE_DW_RUNNING 4
int nr_stripes;
+#define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT)
unsigned int stripe_size;
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
@@ -275,7 +277,7 @@ struct bcache_device {
int (*cache_miss)(struct btree *b, struct search *s,
struct bio *bio, unsigned int sectors);
- int (*ioctl)(struct bcache_device *d, fmode_t mode,
+ int (*ioctl)(struct bcache_device *d, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
};
@@ -299,6 +301,7 @@ struct cached_dev {
struct list_head list;
struct bcache_device disk;
struct block_device *bdev;
+ struct file *bdev_file;
struct cache_sb sb;
struct cache_sb_disk *sb_disk;
@@ -421,6 +424,7 @@ struct cache {
struct kobject kobj;
struct block_device *bdev;
+ struct file *bdev_file;
struct task_struct *alloc_thread;
@@ -443,8 +447,7 @@ struct cache {
* free_inc: Incoming buckets - these are buckets that currently have
* cached data in them, and we can't reuse them until after we write
* their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
+ * gens/prios, they'll be moved to the free list.
*/
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
@@ -463,8 +466,6 @@ struct cache {
*/
unsigned int invalidate_needs_gc;
- bool discard; /* Get rid of? */
-
struct journal_device journal;
/* The rest of this all shows up in sysfs */
@@ -541,7 +542,7 @@ struct cache_set {
struct bio_set bio_split;
/* For the btree cache */
- struct shrinker shrink;
+ struct shrinker *shrink;
/* For the btree cache and anything allocation related */
struct mutex bucket_lock;
@@ -603,6 +604,7 @@ struct cache_set {
*/
atomic_t prio_blocked;
wait_queue_head_t bucket_wait;
+ atomic_t bucket_wait_cnt;
/*
* For any bio we don't skip we subtract the number of sectors from
@@ -1004,11 +1006,11 @@ extern struct workqueue_struct *bch_flush_wq;
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
-extern struct kobj_type bch_cached_dev_ktype;
-extern struct kobj_type bch_flash_dev_ktype;
-extern struct kobj_type bch_cache_set_ktype;
-extern struct kobj_type bch_cache_set_internal_ktype;
-extern struct kobj_type bch_cache_ktype;
+extern const struct kobj_type bch_cached_dev_ktype;
+extern const struct kobj_type bch_flash_dev_ktype;
+extern const struct kobj_type bch_cache_set_ktype;
+extern const struct kobj_type bch_cache_set_internal_ktype;
+extern const struct kobj_type bch_cache_ktype;
void bch_cached_dev_release(struct kobject *kobj);
void bch_flash_dev_release(struct kobject *kobj);
diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
index f96034e0ba4f..6620a7f8fffc 100644
--- a/drivers/md/bcache/bcache_ondisk.h
+++ b/drivers/md/bcache/bcache_ondisk.h
@@ -360,8 +360,8 @@ struct jset {
__u64 prio_bucket[MAX_CACHES_PER_SET];
union {
- struct bkey start[0];
- __u64 d[0];
+ DECLARE_FLEX_ARRAY(struct bkey, start);
+ DECLARE_FLEX_ARRAY(__u64, d);
};
};
@@ -425,8 +425,8 @@ struct bset {
__u32 keys;
union {
- struct bkey start[0];
- __u64 d[0];
+ DECLARE_FLEX_ARRAY(struct bkey, start);
+ DECLARE_FLEX_ARRAY(__u64, d);
};
};
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 2bba4d6aaaa2..463eb13bd0b2 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -54,7 +54,7 @@ void bch_dump_bucket(struct btree_keys *b)
int __bch_count_data(struct btree_keys *b)
{
unsigned int ret = 0;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k;
if (b->ops->is_extents)
@@ -67,7 +67,7 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
{
va_list args;
struct bkey *k, *p = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
const char *err;
for_each_key(b, k, &iter) {
@@ -879,7 +879,7 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
unsigned int status = BTREE_INSERT_STATUS_NO_INSERT;
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey preceding_key_on_stack = ZERO_KEY;
struct bkey *preceding_key_p = &preceding_key_on_stack;
@@ -895,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
else
preceding_key(k, &preceding_key_p);
- m = bch_btree_iter_init(b, &iter, preceding_key_p);
+ m = bch_btree_iter_stack_init(b, &iter, preceding_key_p);
- if (b->ops->insert_fixup(b, k, &iter, replace_key))
+ if (b->ops->insert_fixup(b, k, &iter.iter, replace_key))
return status;
status = BTREE_INSERT_STATUS_INSERT;
@@ -1100,33 +1100,33 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
btree_iter_cmp));
}
-static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
- struct bkey *search,
- struct bset_tree *start)
+static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
+ struct bkey *search,
+ struct bset_tree *start)
{
struct bkey *ret = NULL;
- iter->size = ARRAY_SIZE(iter->data);
- iter->used = 0;
+ iter->iter.size = ARRAY_SIZE(iter->stack_data);
+ iter->iter.used = 0;
#ifdef CONFIG_BCACHE_DEBUG
- iter->b = b;
+ iter->iter.b = b;
#endif
for (; start <= bset_tree_last(b); start++) {
ret = bch_bset_search(b, start, search);
- bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
+ bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data));
}
return ret;
}
-struct bkey *bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
+struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
struct bkey *search)
{
- return __bch_btree_iter_init(b, iter, search, b->set);
+ return __bch_btree_iter_stack_init(b, iter, search, b->set);
}
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
@@ -1293,10 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
struct bset_sort_state *state)
{
size_t order = b->page_order, keys = 0;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
int oldsize = bch_count_data(b);
- __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
+ __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]);
if (start) {
unsigned int i;
@@ -1307,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
order = get_order(__set_bytes(b->set->data, keys));
}
- __btree_sort(b, &iter, start, order, false, state);
+ __btree_sort(b, &iter.iter, start, order, false, state);
EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
@@ -1323,11 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
struct bset_sort_state *state)
{
uint64_t start_time = local_clock();
- struct btree_iter iter;
+ struct btree_iter_stack iter;
- bch_btree_iter_init(b, &iter, NULL);
+ bch_btree_iter_stack_init(b, &iter, NULL);
- btree_mergesort(b, new->set->data, &iter, false, true);
+ btree_mergesort(b, new->set->data, &iter.iter, false, true);
bch_time_stats_update(&state->time, start_time);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index d795c84246b0..6ee2c6a506a2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -321,9 +321,20 @@ struct btree_iter {
#endif
struct btree_iter_set {
struct bkey *k, *end;
- } data[MAX_BSETS];
+ } data[];
};
+/* Fixed-size btree_iter that can be allocated on the stack */
+
+struct btree_iter_stack {
+ /* Must be last as it ends in a flexible-array member. */
+ TRAILING_OVERLAP(struct btree_iter, iter, data,
+ struct btree_iter_set stack_data[MAX_BSETS];
+ );
+};
+static_assert(offsetof(struct btree_iter_stack, iter.data) ==
+ offsetof(struct btree_iter_stack, stack_data));
+
typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k);
struct bkey *bch_btree_iter_next(struct btree_iter *iter);
@@ -333,9 +344,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
struct bkey *end);
-struct bkey *bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
- struct bkey *search);
+struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
+ struct bkey *search);
struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
const struct bkey *search);
@@ -350,13 +361,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
return search ? __bch_bset_search(b, t, search) : t->data->start;
}
-#define for_each_key_filter(b, k, iter, filter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+#define for_each_key_filter(b, k, stack_iter, filter) \
+ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
+ ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \
+ filter));)
-#define for_each_key(b, k, iter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next(iter));)
+#define for_each_key(b, k, stack_iter) \
+ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
+ ((k) = bch_btree_iter_next(&((stack_iter)->iter)));)
/* Sorting */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 147c493a989a..3ed39c823826 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -36,6 +36,7 @@
#include <linux/sched/clock.h>
#include <linux/rculist.h>
#include <linux/delay.h>
+#include <linux/sort.h>
#include <trace/events/bcache.h>
/*
@@ -88,10 +89,9 @@
* Test module load/unload
*/
-#define MAX_NEED_GC 64
-#define MAX_SAVE_PRIO 72
-#define MAX_GC_TIMES 100
-#define MIN_GC_NODES 100
+#define MAX_GC_TIMES_SHIFT 7 /* 128 loops */
+#define GC_NODES_MIN 10
+#define GC_SLEEP_MS_MIN 10
#define GC_SLEEP_MS 100
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
@@ -293,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
w->journal = NULL;
}
-static void btree_node_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_unlock)
{
- struct btree *b = container_of(cl, struct btree, io);
+ closure_type(b, struct btree, io);
up(&b->io_mutex);
}
-static void __btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(__btree_node_write_done)
{
- struct btree *b = container_of(cl, struct btree, io);
+ closure_type(b, struct btree, io);
struct btree_write *w = btree_prev_write(b);
bch_bbio_free(b->bio, b->c);
@@ -315,12 +315,12 @@ static void __btree_node_write_done(struct closure *cl)
closure_return_with_destructor(cl, btree_node_write_unlock);
}
-static void btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_done)
{
- struct btree *b = container_of(cl, struct btree, io);
+ closure_type(b, struct btree, io);
bio_free_pages(b->bio);
- __btree_node_write_done(cl);
+ __btree_node_write_done(&cl->work);
}
static void btree_node_write_endio(struct bio *bio)
@@ -372,7 +372,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
- if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+ if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) {
struct bio_vec *bv;
void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
struct bvec_iter_all iter_all;
@@ -559,6 +559,25 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
}
}
+#ifdef CONFIG_PROVE_LOCKING
+static int btree_lock_cmp_fn(const struct lockdep_map *_a,
+ const struct lockdep_map *_b)
+{
+ const struct btree *a = container_of(_a, struct btree, lock.dep_map);
+ const struct btree *b = container_of(_b, struct btree, lock.dep_map);
+
+ return -cmp_int(a->level, b->level) ?: bkey_cmp(&a->key, &b->key);
+}
+
+static void btree_lock_print_fn(const struct lockdep_map *map)
+{
+ const struct btree *b = container_of(map, struct btree, lock.dep_map);
+
+ printk(KERN_CONT " l=%u %llu:%llu", b->level,
+ KEY_INODE(&b->key), KEY_OFFSET(&b->key));
+}
+#endif
+
static struct btree *mca_bucket_alloc(struct cache_set *c,
struct bkey *k, gfp_t gfp)
{
@@ -572,7 +591,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
return NULL;
init_rwsem(&b->lock);
- lockdep_set_novalidate_class(&b->lock);
+ lock_set_cmp_fn(&b->lock, btree_lock_cmp_fn, btree_lock_print_fn);
mutex_init(&b->write_lock);
lockdep_set_novalidate_class(&b->write_lock);
INIT_LIST_HEAD(&b->list);
@@ -646,7 +665,7 @@ out_unlock:
static unsigned long bch_mca_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+ struct cache_set *c = shrink->private_data;
struct btree *b, *t;
unsigned long i, nr = sc->nr_to_scan;
unsigned long freed = 0;
@@ -713,7 +732,7 @@ out:
static unsigned long bch_mca_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+ struct cache_set *c = shrink->private_data;
if (c->shrinker_disabled)
return 0;
@@ -731,8 +750,8 @@ void bch_btree_cache_free(struct cache_set *c)
closure_init_stack(&cl);
- if (c->shrink.list.next)
- unregister_shrinker(&c->shrink);
+ if (c->shrink)
+ shrinker_free(c->shrink);
mutex_lock(&c->bucket_lock);
@@ -807,14 +826,19 @@ int bch_btree_cache_alloc(struct cache_set *c)
c->verify_data = NULL;
#endif
- c->shrink.count_objects = bch_mca_count;
- c->shrink.scan_objects = bch_mca_scan;
- c->shrink.seeks = 4;
- c->shrink.batch = c->btree_pages * 2;
+ c->shrink = shrinker_alloc(0, "md-bcache:%pU", c->set_uuid);
+ if (!c->shrink) {
+ pr_warn("bcache: %s: could not allocate shrinker\n", __func__);
+ return 0;
+ }
+
+ c->shrink->count_objects = bch_mca_count;
+ c->shrink->scan_objects = bch_mca_scan;
+ c->shrink->seeks = 4;
+ c->shrink->batch = c->btree_pages * 2;
+ c->shrink->private_data = c;
- if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid))
- pr_warn("bcache: %s: could not register shrinker\n",
- __func__);
+ shrinker_register(c->shrink);
return 0;
}
@@ -885,7 +909,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-static void bch_cannibalize_unlock(struct cache_set *c)
+void bch_cannibalize_unlock(struct cache_set *c)
{
spin_lock(&c->btree_cannibalize_lock);
if (c->btree_cache_alloc_lock == current) {
@@ -974,6 +998,9 @@ err:
*
* The btree node will have either a read or a write lock held, depending on
* level and op->lock.
+ *
+ * Note: Only error code or btree pointer will be returned, it is unncessary
+ * for callers to check NULL pointer.
*/
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
struct bkey *k, int level, bool write,
@@ -1085,15 +1112,21 @@ retry:
mutex_unlock(&b->c->bucket_lock);
}
+/*
+ * Only error code or btree pointer will be returned, it is unncessary for
+ * callers to check NULL pointer.
+ */
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
int level, bool wait,
struct btree *parent)
{
BKEY_PADDED(key) k;
- struct btree *b = ERR_PTR(-EAGAIN);
+ struct btree *b;
mutex_lock(&c->bucket_lock);
retry:
+ /* return ERR_PTR(-EAGAIN) when it fails */
+ b = ERR_PTR(-EAGAIN);
if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
goto err;
@@ -1138,7 +1171,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b,
{
struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
- if (!IS_ERR_OR_NULL(n)) {
+ if (!IS_ERR(n)) {
mutex_lock(&n->write_lock);
bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
bkey_copy_key(&n->key, &b->key);
@@ -1274,7 +1307,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
uint8_t stale = 0;
unsigned int keys = 0, good_keys = 0;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bset_tree *t;
gc->nodes++;
@@ -1352,7 +1385,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
for (i = 0; i < nodes; i++) {
new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
- if (IS_ERR_OR_NULL(new_nodes[i]))
+ if (IS_ERR(new_nodes[i]))
goto out_nocoalesce;
}
@@ -1504,6 +1537,8 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
return 0;
n = btree_node_alloc_replacement(replace, NULL);
+ if (IS_ERR(n))
+ return 0;
/* recheck reserve after allocating replacement node */
if (btree_check_reserve(b, NULL)) {
@@ -1533,7 +1568,7 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
static unsigned int btree_gc_count_keys(struct btree *b)
{
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
unsigned int ret = 0;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
@@ -1544,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b)
static size_t btree_gc_min_nodes(struct cache_set *c)
{
- size_t min_nodes;
+ size_t min_nodes = GC_NODES_MIN;
- /*
- * Since incremental GC would stop 100ms when front
- * side I/O comes, so when there are many btree nodes,
- * if GC only processes constant (100) nodes each time,
- * GC would last a long time, and the front side I/Os
- * would run out of the buckets (since no new bucket
- * can be allocated during GC), and be blocked again.
- * So GC should not process constant nodes, but varied
- * nodes according to the number of btree nodes, which
- * realized by dividing GC into constant(100) times,
- * so when there are many btree nodes, GC can process
- * more nodes each time, otherwise, GC will process less
- * nodes each time (but no less than MIN_GC_NODES)
- */
- min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
- if (min_nodes < MIN_GC_NODES)
- min_nodes = MIN_GC_NODES;
+ if (atomic_read(&c->search_inflight) == 0) {
+ size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT;
+
+ if (min_nodes < n)
+ min_nodes = n;
+ }
return min_nodes;
}
+static uint64_t btree_gc_sleep_ms(struct cache_set *c)
+{
+ uint64_t sleep_ms;
+
+ if (atomic_read(&c->bucket_wait_cnt) > 0)
+ sleep_ms = GC_SLEEP_MS_MIN;
+ else
+ sleep_ms = GC_SLEEP_MS;
+
+ return sleep_ms;
+}
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
@@ -1574,17 +1609,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
int ret = 0;
bool should_rewrite;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
- bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
+ bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done);
for (i = r; i < r + ARRAY_SIZE(r); i++)
i->b = ERR_PTR(-EINTR);
while (1) {
- k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
+ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad);
if (k) {
r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
true, b);
@@ -1633,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;
- if (atomic_read(&b->c->search_inflight) &&
- gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+ if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) {
gc->nodes_pre = gc->nodes;
ret = -EAGAIN;
break;
@@ -1669,7 +1704,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
if (should_rewrite) {
n = btree_node_alloc_replacement(b, NULL);
- if (!IS_ERR_OR_NULL(n)) {
+ if (!IS_ERR(n)) {
bch_btree_node_write_sync(n);
bch_btree_set_root(n);
@@ -1703,18 +1738,20 @@ static void btree_gc_start(struct cache_set *c)
mutex_lock(&c->bucket_lock);
- c->gc_mark_valid = 0;
c->gc_done = ZERO_KEY;
ca = c->cache;
for_each_bucket(b, ca) {
b->last_gc = b->gen;
+ if (bch_can_invalidate_bucket(ca, b))
+ b->reclaimable_in_gc = 1;
if (!atomic_read(&b->pin)) {
SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
}
}
+ c->gc_mark_valid = 0;
mutex_unlock(&c->bucket_lock);
}
@@ -1771,6 +1808,9 @@ static void bch_btree_gc_finish(struct cache_set *c)
for_each_bucket(b, ca) {
c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+ if (b->reclaimable_in_gc)
+ b->reclaimable_in_gc = 0;
+
if (atomic_read(&b->pin))
continue;
@@ -1806,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c)
cond_resched();
if (ret == -EAGAIN)
- schedule_timeout_interruptible(msecs_to_jiffies
- (GC_SLEEP_MS));
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(btree_gc_sleep_ms(c)));
else if (ret)
pr_warn("gc failed!\n");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -1874,7 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{
int ret = 0;
struct bkey *k, *p = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(b->c, b->level, k);
@@ -1882,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
bch_initial_mark_key(b->c, b->level + 1, &b->key);
if (b->level) {
- bch_btree_iter_init(&b->keys, &iter, NULL);
+ bch_btree_iter_stack_init(&b->keys, &iter, NULL);
do {
- k = bch_btree_iter_next_filter(&iter, &b->keys,
+ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
bch_ptr_bad);
if (k) {
btree_node_prefetch(b, k);
@@ -1913,7 +1953,7 @@ static int bch_btree_check_thread(void *arg)
struct btree_check_info *info = arg;
struct btree_check_state *check_state = info->state;
struct cache_set *c = check_state->c;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
@@ -1922,8 +1962,8 @@ static int bch_btree_check_thread(void *arg)
ret = 0;
/* root node keys are checked before thread created */
- bch_btree_iter_init(&c->root->keys, &iter, NULL);
- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k;
@@ -1941,7 +1981,7 @@ static int bch_btree_check_thread(void *arg)
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
- k = bch_btree_iter_next_filter(&iter,
+ k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad);
if (k)
@@ -1968,6 +2008,15 @@ static int bch_btree_check_thread(void *arg)
c->gc_stats.nodes++;
bch_btree_op_init(&op, 0);
ret = bcache_btree(check_recurse, p, c->root, &op);
+ /*
+ * The op may be added to cache_set's btree_cache_wait
+ * in mca_cannibalize(), must ensure it is removed from
+ * the list and release btree_cache_alloc_lock before
+ * free op memory.
+ * Otherwise, the btree_cache_wait will be damaged.
+ */
+ bch_cannibalize_unlock(c);
+ finish_wait(&c->btree_cache_wait, &(&op)->wait);
if (ret)
goto out;
}
@@ -2005,7 +2054,7 @@ int bch_btree_check(struct cache_set *c)
int ret = 0;
int i;
struct bkey *k = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct btree_check_state check_state;
/* check and mark root node keys */
@@ -2501,11 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
if (b->level) {
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
- bch_btree_iter_init(&b->keys, &iter, from);
+ bch_btree_iter_stack_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
+ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
bch_ptr_bad))) {
ret = bcache_btree(map_nodes_recurse, k, b,
op, from, fn, flags);
@@ -2534,11 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
{
int ret = MAP_CONTINUE;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
- bch_btree_iter_init(&b->keys, &iter, from);
+ bch_btree_iter_stack_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
+ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad))) {
ret = !b->level
? fn(op, b, k)
: bcache_btree(map_keys_recurse, k,
@@ -2772,7 +2822,8 @@ void bch_btree_exit(void)
int __init bch_btree_init(void)
{
- btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+ btree_io_wq = alloc_workqueue("bch_btree_io",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!btree_io_wq)
return -ENOMEM;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 1b5fdbc0d83e..45d64b54115a 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -247,8 +247,8 @@ static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
static inline void rw_lock(bool w, struct btree *b, int level)
{
- w ? down_write_nested(&b->lock, level + 1)
- : down_read_nested(&b->lock, level + 1);
+ w ? down_write(&b->lock)
+ : down_read(&b->lock);
if (w)
b->seq++;
}
@@ -282,6 +282,7 @@ void bch_initial_gc_finish(struct cache_set *c);
void bch_moving_gc(struct cache_set *c);
int bch_btree_check(struct cache_set *c);
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);
+void bch_cannibalize_unlock(struct cache_set *c);
static inline void wake_up_gc(struct cache_set *c)
{
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
deleted file mode 100644
index d8d9394a6beb..000000000000
--- a/drivers/md/bcache/closure.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Asynchronous refcounty things
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/sched/debug.h>
-
-#include "closure.h"
-
-static inline void closure_put_after_sub(struct closure *cl, int flags)
-{
- int r = flags & CLOSURE_REMAINING_MASK;
-
- BUG_ON(flags & CLOSURE_GUARD_MASK);
- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
-
- if (!r) {
- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
- atomic_set(&cl->remaining,
- CLOSURE_REMAINING_INITIALIZER);
- closure_queue(cl);
- } else {
- struct closure *parent = cl->parent;
- closure_fn *destructor = cl->fn;
-
- closure_debug_destroy(cl);
-
- if (destructor)
- destructor(cl);
-
- if (parent)
- closure_put(parent);
- }
- }
-}
-
-/* For clearing flags with the same atomic op as a put */
-void closure_sub(struct closure *cl, int v)
-{
- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
-}
-
-/*
- * closure_put - decrement a closure's refcount
- */
-void closure_put(struct closure *cl)
-{
- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
-}
-
-/*
- * closure_wake_up - wake up all closures on a wait list, without memory barrier
- */
-void __closure_wake_up(struct closure_waitlist *wait_list)
-{
- struct llist_node *list;
- struct closure *cl, *t;
- struct llist_node *reverse = NULL;
-
- list = llist_del_all(&wait_list->list);
-
- /* We first reverse the list to preserve FIFO ordering and fairness */
- reverse = llist_reverse_order(list);
-
- /* Then do the wakeups */
- llist_for_each_entry_safe(cl, t, reverse, list) {
- closure_set_waiting(cl, 0);
- closure_sub(cl, CLOSURE_WAITING + 1);
- }
-}
-
-/**
- * closure_wait - add a closure to a waitlist
- * @waitlist: will own a ref on @cl, which will be released when
- * closure_wake_up() is called on @waitlist.
- * @cl: closure pointer.
- *
- */
-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
-{
- if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
- return false;
-
- closure_set_waiting(cl, _RET_IP_);
- atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
- llist_add(&cl->list, &waitlist->list);
-
- return true;
-}
-
-struct closure_syncer {
- struct task_struct *task;
- int done;
-};
-
-static void closure_sync_fn(struct closure *cl)
-{
- struct closure_syncer *s = cl->s;
- struct task_struct *p;
-
- rcu_read_lock();
- p = READ_ONCE(s->task);
- s->done = 1;
- wake_up_process(p);
- rcu_read_unlock();
-}
-
-void __sched __closure_sync(struct closure *cl)
-{
- struct closure_syncer s = { .task = current };
-
- cl->s = &s;
- continue_at(cl, closure_sync_fn, NULL);
-
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (s.done)
- break;
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-}
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-static LIST_HEAD(closure_list);
-static DEFINE_SPINLOCK(closure_list_lock);
-
-void closure_debug_create(struct closure *cl)
-{
- unsigned long flags;
-
- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
- cl->magic = CLOSURE_MAGIC_ALIVE;
-
- spin_lock_irqsave(&closure_list_lock, flags);
- list_add(&cl->all, &closure_list);
- spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-
-void closure_debug_destroy(struct closure *cl)
-{
- unsigned long flags;
-
- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
- cl->magic = CLOSURE_MAGIC_DEAD;
-
- spin_lock_irqsave(&closure_list_lock, flags);
- list_del(&cl->all);
- spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-
-static struct dentry *closure_debug;
-
-static int debug_show(struct seq_file *f, void *data)
-{
- struct closure *cl;
-
- spin_lock_irq(&closure_list_lock);
-
- list_for_each_entry(cl, &closure_list, all) {
- int r = atomic_read(&cl->remaining);
-
- seq_printf(f, "%p: %pS -> %pS p %p r %i ",
- cl, (void *) cl->ip, cl->fn, cl->parent,
- r & CLOSURE_REMAINING_MASK);
-
- seq_printf(f, "%s%s\n",
- test_bit(WORK_STRUCT_PENDING_BIT,
- work_data_bits(&cl->work)) ? "Q" : "",
- r & CLOSURE_RUNNING ? "R" : "");
-
- if (r & CLOSURE_WAITING)
- seq_printf(f, " W %pS\n",
- (void *) cl->waiting_on);
-
- seq_printf(f, "\n");
- }
-
- spin_unlock_irq(&closure_list_lock);
- return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(debug);
-
-void __init closure_debug_init(void)
-{
- if (!IS_ERR_OR_NULL(bcache_debug))
- /*
- * it is unnecessary to check return value of
- * debugfs_create_file(), we should not care
- * about this.
- */
- closure_debug = debugfs_create_file(
- "closures", 0400, bcache_debug, NULL, &debug_fops);
-}
-#endif
-
-MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
deleted file mode 100644
index c88cdc4ae4ec..000000000000
--- a/drivers/md/bcache/closure.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CLOSURE_H
-#define _LINUX_CLOSURE_H
-
-#include <linux/llist.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/workqueue.h>
-
-/*
- * Closure is perhaps the most overused and abused term in computer science, but
- * since I've been unable to come up with anything better you're stuck with it
- * again.
- *
- * What are closures?
- *
- * They embed a refcount. The basic idea is they count "things that are in
- * progress" - in flight bios, some other thread that's doing something else -
- * anything you might want to wait on.
- *
- * The refcount may be manipulated with closure_get() and closure_put().
- * closure_put() is where many of the interesting things happen, when it causes
- * the refcount to go to 0.
- *
- * Closures can be used to wait on things both synchronously and asynchronously,
- * and synchronous and asynchronous use can be mixed without restriction. To
- * wait synchronously, use closure_sync() - you will sleep until your closure's
- * refcount hits 1.
- *
- * To wait asynchronously, use
- * continue_at(cl, next_function, workqueue);
- *
- * passing it, as you might expect, the function to run when nothing is pending
- * and the workqueue to run that function out of.
- *
- * continue_at() also, critically, requires a 'return' immediately following the
- * location where this macro is referenced, to return to the calling function.
- * There's good reason for this.
- *
- * To use safely closures asynchronously, they must always have a refcount while
- * they are running owned by the thread that is running them. Otherwise, suppose
- * you submit some bios and wish to have a function run when they all complete:
- *
- * foo_endio(struct bio *bio)
- * {
- * closure_put(cl);
- * }
- *
- * closure_init(cl);
- *
- * do_stuff();
- * closure_get(cl);
- * bio1->bi_endio = foo_endio;
- * bio_submit(bio1);
- *
- * do_more_stuff();
- * closure_get(cl);
- * bio2->bi_endio = foo_endio;
- * bio_submit(bio2);
- *
- * continue_at(cl, complete_some_read, system_wq);
- *
- * If closure's refcount started at 0, complete_some_read() could run before the
- * second bio was submitted - which is almost always not what you want! More
- * importantly, it wouldn't be possible to say whether the original thread or
- * complete_some_read()'s thread owned the closure - and whatever state it was
- * associated with!
- *
- * So, closure_init() initializes a closure's refcount to 1 - and when a
- * closure_fn is run, the refcount will be reset to 1 first.
- *
- * Then, the rule is - if you got the refcount with closure_get(), release it
- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
- * on a closure because you called closure_init() or you were run out of a
- * closure - _always_ use continue_at(). Doing so consistently will help
- * eliminate an entire class of particularly pernicious races.
- *
- * Lastly, you might have a wait list dedicated to a specific event, and have no
- * need for specifying the condition - you just want to wait until someone runs
- * closure_wake_up() on the appropriate wait list. In that case, just use
- * closure_wait(). It will return either true or false, depending on whether the
- * closure was already on a wait list or not - a closure can only be on one wait
- * list at a time.
- *
- * Parents:
- *
- * closure_init() takes two arguments - it takes the closure to initialize, and
- * a (possibly null) parent.
- *
- * If parent is non null, the new closure will have a refcount for its lifetime;
- * a closure is considered to be "finished" when its refcount hits 0 and the
- * function to run is null. Hence
- *
- * continue_at(cl, NULL, NULL);
- *
- * returns up the (spaghetti) stack of closures, precisely like normal return
- * returns up the C stack. continue_at() with non null fn is better thought of
- * as doing a tail call.
- *
- * All this implies that a closure should typically be embedded in a particular
- * struct (which its refcount will normally control the lifetime of), and that
- * struct can very much be thought of as a stack frame.
- */
-
-struct closure;
-struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
-extern struct dentry *bcache_debug;
-
-struct closure_waitlist {
- struct llist_head list;
-};
-
-enum closure_state {
- /*
- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
- * the thread that owns the closure, and cleared by the thread that's
- * waking up the closure.
- *
- * The rest are for debugging and don't affect behaviour:
- *
- * CLOSURE_RUNNING: Set when a closure is running (i.e. by
- * closure_init() and when closure_put() runs then next function), and
- * must be cleared before remaining hits 0. Primarily to help guard
- * against incorrect usage and accidentally transferring references.
- * continue_at() and closure_return() clear it for you, if you're doing
- * something unusual you can use closure_set_dead() which also helps
- * annotate where references are being transferred.
- */
-
- CLOSURE_BITS_START = (1U << 26),
- CLOSURE_DESTRUCTOR = (1U << 26),
- CLOSURE_WAITING = (1U << 28),
- CLOSURE_RUNNING = (1U << 30),
-};
-
-#define CLOSURE_GUARD_MASK \
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
-
-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
-
-struct closure {
- union {
- struct {
- struct workqueue_struct *wq;
- struct closure_syncer *s;
- struct llist_node list;
- closure_fn *fn;
- };
- struct work_struct work;
- };
-
- struct closure *parent;
-
- atomic_t remaining;
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-#define CLOSURE_MAGIC_DEAD 0xc054dead
-#define CLOSURE_MAGIC_ALIVE 0xc054a11e
-
- unsigned int magic;
- struct list_head all;
- unsigned long ip;
- unsigned long waiting_on;
-#endif
-};
-
-void closure_sub(struct closure *cl, int v);
-void closure_put(struct closure *cl);
-void __closure_wake_up(struct closure_waitlist *list);
-bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void __closure_sync(struct closure *cl);
-
-/**
- * closure_sync - sleep until a closure a closure has nothing left to wait on
- *
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
- * the last refcount.
- */
-static inline void closure_sync(struct closure *cl)
-{
- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
- __closure_sync(cl);
-}
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-void closure_debug_init(void);
-void closure_debug_create(struct closure *cl);
-void closure_debug_destroy(struct closure *cl);
-
-#else
-
-static inline void closure_debug_init(void) {}
-static inline void closure_debug_create(struct closure *cl) {}
-static inline void closure_debug_destroy(struct closure *cl) {}
-
-#endif
-
-static inline void closure_set_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->ip = _THIS_IP_;
-#endif
-}
-
-static inline void closure_set_ret_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->ip = _RET_IP_;
-#endif
-}
-
-static inline void closure_set_waiting(struct closure *cl, unsigned long f)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->waiting_on = f;
-#endif
-}
-
-static inline void closure_set_stopped(struct closure *cl)
-{
- atomic_sub(CLOSURE_RUNNING, &cl->remaining);
-}
-
-static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
- struct workqueue_struct *wq)
-{
- closure_set_ip(cl);
- cl->fn = fn;
- cl->wq = wq;
- /* between atomic_dec() in closure_put() */
- smp_mb__before_atomic();
-}
-
-static inline void closure_queue(struct closure *cl)
-{
- struct workqueue_struct *wq = cl->wq;
- /**
- * Changes made to closure, work_struct, or a couple of other structs
- * may cause work.func not pointing to the right location.
- */
- BUILD_BUG_ON(offsetof(struct closure, fn)
- != offsetof(struct work_struct, func));
- if (wq) {
- INIT_WORK(&cl->work, cl->work.func);
- BUG_ON(!queue_work(wq, &cl->work));
- } else
- cl->fn(cl);
-}
-
-/**
- * closure_get - increment a closure's refcount
- */
-static inline void closure_get(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- BUG_ON((atomic_inc_return(&cl->remaining) &
- CLOSURE_REMAINING_MASK) <= 1);
-#else
- atomic_inc(&cl->remaining);
-#endif
-}
-
-/**
- * closure_init - Initialize a closure, setting the refcount to 1
- * @cl: closure to initialize
- * @parent: parent of the new closure. cl will take a refcount on it for its
- * lifetime; may be NULL.
- */
-static inline void closure_init(struct closure *cl, struct closure *parent)
-{
- memset(cl, 0, sizeof(struct closure));
- cl->parent = parent;
- if (parent)
- closure_get(parent);
-
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-
- closure_debug_create(cl);
- closure_set_ip(cl);
-}
-
-static inline void closure_init_stack(struct closure *cl)
-{
- memset(cl, 0, sizeof(struct closure));
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-}
-
-/**
- * closure_wake_up - wake up all closures on a wait list,
- * with memory barrier
- */
-static inline void closure_wake_up(struct closure_waitlist *list)
-{
- /* Memory barrier for the wait list */
- smp_mb();
- __closure_wake_up(list);
-}
-
-/**
- * continue_at - jump to another function with barrier
- *
- * After @cl is no longer waiting on anything (i.e. all outstanding refs have
- * been dropped with closure_put()), it will resume execution at @fn running out
- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
- *
- * This is because after calling continue_at() you no longer have a ref on @cl,
- * and whatever @cl owns may be freed out from under you - a running closure fn
- * has a ref on its own closure which continue_at() drops.
- *
- * Note you are expected to immediately return after using this macro.
- */
-#define continue_at(_cl, _fn, _wq) \
-do { \
- set_closure_fn(_cl, _fn, _wq); \
- closure_sub(_cl, CLOSURE_RUNNING + 1); \
-} while (0)
-
-/**
- * closure_return - finish execution of a closure
- *
- * This is used to indicate that @cl is finished: when all outstanding refs on
- * @cl have been dropped @cl's ref on its parent closure (as passed to
- * closure_init()) will be dropped, if one was specified - thus this can be
- * thought of as returning to the parent closure.
- */
-#define closure_return(_cl) continue_at((_cl), NULL, NULL)
-
-/**
- * continue_at_nobarrier - jump to another function without barrier
- *
- * Causes @fn to be executed out of @cl, in @wq context (or called directly if
- * @wq is NULL).
- *
- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
- * thus it's not safe to touch anything protected by @cl after a
- * continue_at_nobarrier().
- */
-#define continue_at_nobarrier(_cl, _fn, _wq) \
-do { \
- set_closure_fn(_cl, _fn, _wq); \
- closure_queue(_cl); \
-} while (0)
-
-/**
- * closure_return_with_destructor - finish execution of a closure,
- * with destructor
- *
- * Works like closure_return(), except @destructor will be called when all
- * outstanding refs on @cl have been dropped; @destructor may be used to safely
- * free the memory occupied by @cl, and it is called with the ref on the parent
- * closure still held - so @destructor could safely return an item to a
- * freelist protected by @cl's parent.
- */
-#define closure_return_with_destructor(_cl, _destructor) \
-do { \
- set_closure_fn(_cl, _destructor, NULL); \
- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
-} while (0)
-
-/**
- * closure_call - execute @fn out of a new, uninitialized closure
- *
- * Typically used when running out of one closure, and we want to run @fn
- * asynchronously out of a new closure - @parent will then wait for @cl to
- * finish.
- */
-static inline void closure_call(struct closure *cl, closure_fn fn,
- struct workqueue_struct *wq,
- struct closure *parent)
-{
- closure_init(cl, parent);
- continue_at_nobarrier(cl, fn, wq);
-}
-
-#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 7510d1c983a5..f327456fc4e0 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
check = bio_kmalloc(nr_segs, GFP_NOIO);
if (!check)
return;
- bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs,
- REQ_OP_READ);
+ bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ);
check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 020712c5203f..2386d08bf4e4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
- meta_bucket_pages(&c->cache->sb), 0);
+ bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0);
return bio;
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index c182c21de2e8..144693b7c46a 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -275,8 +275,7 @@ bsearch:
* ja->cur_idx
*/
ja->cur_idx = i;
- ja->last_idx = ja->discard_idx = (i + 1) %
- ca->sb.njournal_buckets;
+ ja->last_idx = (i + 1) % ca->sb.njournal_buckets;
}
@@ -336,16 +335,6 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
}
}
-static bool is_discard_enabled(struct cache_set *s)
-{
- struct cache *ca = s->cache;
-
- if (ca->discard)
- return true;
-
- return false;
-}
-
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
int ret = 0, keys = 0, entries = 0;
@@ -360,15 +349,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
BUG_ON(i->pin && atomic_read(i->pin) != 1);
if (n != i->j.seq) {
- if (n == start && is_discard_enabled(s))
- pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n",
- n, i->j.seq - 1, start, end);
- else {
- pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
- n, i->j.seq - 1, start, end);
- ret = -EIO;
- goto err;
- }
+ pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
+ n, i->j.seq - 1, start, end);
+ ret = -EIO;
+ goto err;
}
for (k = i->j.start;
@@ -568,65 +552,6 @@ out:
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
-static void journal_discard_endio(struct bio *bio)
-{
- struct journal_device *ja =
- container_of(bio, struct journal_device, discard_bio);
- struct cache *ca = container_of(ja, struct cache, journal);
-
- atomic_set(&ja->discard_in_flight, DISCARD_DONE);
-
- closure_wake_up(&ca->set->journal.wait);
- closure_put(&ca->set->cl);
-}
-
-static void journal_discard_work(struct work_struct *work)
-{
- struct journal_device *ja =
- container_of(work, struct journal_device, discard_work);
-
- submit_bio(&ja->discard_bio);
-}
-
-static void do_journal_discard(struct cache *ca)
-{
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->discard_bio;
-
- if (!ca->discard) {
- ja->discard_idx = ja->last_idx;
- return;
- }
-
- switch (atomic_read(&ja->discard_in_flight)) {
- case DISCARD_IN_FLIGHT:
- return;
-
- case DISCARD_DONE:
- ja->discard_idx = (ja->discard_idx + 1) %
- ca->sb.njournal_buckets;
-
- atomic_set(&ja->discard_in_flight, DISCARD_READY);
- fallthrough;
-
- case DISCARD_READY:
- if (ja->discard_idx == ja->last_idx)
- return;
-
- atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
-
- bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD);
- bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
- ca->sb.d[ja->discard_idx]);
- bio->bi_iter.bi_size = bucket_bytes(ca);
- bio->bi_end_io = journal_discard_endio;
-
- closure_get(&ca->set->cl);
- INIT_WORK(&ja->discard_work, journal_discard_work);
- queue_work(bch_journal_wq, &ja->discard_work);
- }
-}
-
static unsigned int free_journal_buckets(struct cache_set *c)
{
struct journal *j = &c->journal;
@@ -635,10 +560,10 @@ static unsigned int free_journal_buckets(struct cache_set *c)
unsigned int n;
/* In case njournal_buckets is not power of 2 */
- if (ja->cur_idx >= ja->discard_idx)
- n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx;
+ if (ja->cur_idx >= ja->last_idx)
+ n = ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx;
else
- n = ja->discard_idx - ja->cur_idx;
+ n = ja->last_idx - ja->cur_idx;
if (n > (1 + j->do_reserve))
return n - (1 + j->do_reserve);
@@ -668,8 +593,6 @@ static void journal_reclaim(struct cache_set *c)
ja->last_idx = (ja->last_idx + 1) %
ca->sb.njournal_buckets;
- do_journal_discard(ca);
-
if (c->journal.blocks_free)
goto out;
@@ -723,11 +646,11 @@ static void journal_write_endio(struct bio *bio)
closure_put(&w->c->journal.io);
}
-static void journal_write(struct closure *cl);
+static CLOSURE_CALLBACK(journal_write);
-static void journal_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_done)
{
- struct journal *j = container_of(cl, struct journal, io);
+ closure_type(j, struct journal, io);
struct journal_write *w = (j->cur == j->w)
? &j->w[1]
: &j->w[0];
@@ -736,19 +659,19 @@ static void journal_write_done(struct closure *cl)
continue_at_nobarrier(cl, journal_write, bch_journal_wq);
}
-static void journal_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlock)
__releases(&c->journal.lock)
{
- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+ closure_type(c, struct cache_set, journal.io);
c->journal.io_in_flight = 0;
spin_unlock(&c->journal.lock);
}
-static void journal_write_unlocked(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlocked)
__releases(c->journal.lock)
{
- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+ closure_type(c, struct cache_set, journal.io);
struct cache *ca = c->cache;
struct journal_write *w = c->journal.cur;
struct bkey *k = &c->journal.key;
@@ -823,12 +746,12 @@ static void journal_write_unlocked(struct closure *cl)
continue_at(cl, journal_write_done, NULL);
}
-static void journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write)
{
- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+ closure_type(c, struct cache_set, journal.io);
spin_lock(&c->journal.lock);
- journal_write_unlocked(cl);
+ journal_write_unlocked(&cl->work);
}
static void journal_try_write(struct cache_set *c)
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index cd316b4a1e95..9e9d1b3016a5 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -139,19 +139,6 @@ struct journal_device {
/* Last journal bucket that still contains an open journal entry */
unsigned int last_idx;
- /* Next journal bucket to be discarded */
- unsigned int discard_idx;
-
-#define DISCARD_READY 0
-#define DISCARD_IN_FLIGHT 1
-#define DISCARD_DONE 2
- /* 1 - discard in flight, -1 - discard completed */
- atomic_t discard_in_flight;
-
- struct work_struct discard_work;
- struct bio discard_bio;
- struct bio_vec discard_bv;
-
/* Bio for journal reads/writes to this device */
struct bio bio;
struct bio_vec bv[8];
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 9f32901fdad1..73918e55bf04 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -35,16 +35,16 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
/* Moving GC - IO loop */
-static void moving_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(moving_io_destructor)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
kfree(io);
}
-static void write_moving_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving_finish)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bio_free_pages(bio);
@@ -79,19 +79,19 @@ static void moving_init(struct moving_io *io)
{
struct bio *bio = &io->bio.bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
bio_get(bio);
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
bio->bi_private = &io->cl;
bch_bio_map(bio, NULL);
}
-static void write_moving(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
struct data_insert_op *op = &io->op;
if (!op->status) {
@@ -113,9 +113,9 @@ static void write_moving(struct closure *cl)
continue_at(cl, write_moving_finish, op->wq);
}
-static void read_moving_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_moving_submit)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
@@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c)
continue;
}
- io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 67a2e29e0b40..af345dc6fde1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,7 +25,7 @@
struct kmem_cache *bch_search_cache;
-static void bch_data_insert_start(struct closure *cl);
+static CLOSURE_CALLBACK(bch_data_insert_start);
static unsigned int cache_mode(struct cached_dev *dc)
{
@@ -55,9 +55,9 @@ static void bio_csum(struct bio *bio, struct bkey *k)
/* Insert data into cache */
-static void bch_data_insert_keys(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_keys)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
atomic_t *journal_ref = NULL;
struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
int ret;
@@ -136,9 +136,9 @@ out:
continue_at(cl, bch_data_insert_keys, op->wq);
}
-static void bch_data_insert_error(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_error)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
/*
* Our data write just errored, which means we've got a bunch of keys to
@@ -163,7 +163,7 @@ static void bch_data_insert_error(struct closure *cl)
op->insert_keys.top = dst;
- bch_data_insert_keys(cl);
+ bch_data_insert_keys(&cl->work);
}
static void bch_data_insert_endio(struct bio *bio)
@@ -184,9 +184,9 @@ static void bch_data_insert_endio(struct bio *bio)
bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}
-static void bch_data_insert_start(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_start)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
if (op->bypass)
@@ -305,16 +305,16 @@ err:
* If op->bypass is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
-void bch_data_insert(struct closure *cl)
+CLOSURE_CALLBACK(bch_data_insert)
{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ closure_type(op, struct data_insert_op, cl);
trace_bcache_write(op->c, op->inode, op->bio,
op->writeback, op->bypass);
bch_keylist_init(&op->insert_keys);
bio_get(op->bio);
- bch_data_insert_start(cl);
+ bch_data_insert_start(&cl->work);
}
/*
@@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
struct io *i;
if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
(bio_op(bio) == REQ_OP_DISCARD))
goto skip;
+ if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) {
+ /*
+ * If cached buckets are all clean now, 'true' will be
+ * returned and all requests will bypass the cache device.
+ * Then c->sectors_to_gc has no chance to be negative, and
+ * gc thread won't wake up and caching won't work forever.
+ * Here call force_wake_up_gc() to avoid such aftermath.
+ */
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN &&
+ c->gc_mark_valid)
+ force_wake_up_gc(c);
+
+ goto skip;
+ }
+
if (mode == CACHE_MODE_NONE ||
(mode == CACHE_MODE_WRITEAROUND &&
op_is_write(bio_op(bio))))
@@ -575,9 +589,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
return n == bio ? MAP_DONE : MAP_CONTINUE;
}
-static void cache_lookup(struct closure *cl)
+static CLOSURE_CALLBACK(cache_lookup)
{
- struct search *s = container_of(cl, struct search, iop.cl);
+ closure_type(s, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
struct cached_dev *dc;
int ret;
@@ -698,9 +712,9 @@ static void do_bio_hook(struct search *s,
bio_cnt_set(bio, 3);
}
-static void search_free(struct closure *cl)
+static CLOSURE_CALLBACK(search_free)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
atomic_dec(&s->iop.c->search_inflight);
@@ -749,20 +763,20 @@ static inline struct search *search_alloc(struct bio *bio,
/* Cached devices */
-static void cached_dev_bio_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_bio_complete)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
cached_dev_put(dc);
- search_free(cl);
+ search_free(&cl->work);
}
/* Process reads */
-static void cached_dev_read_error_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error_done)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
if (s->iop.replace_collision)
bch_mark_cache_miss_collision(s->iop.c, s->d);
@@ -770,12 +784,12 @@ static void cached_dev_read_error_done(struct closure *cl)
if (s->iop.bio)
bio_free_pages(s->iop.bio);
- cached_dev_bio_complete(cl);
+ cached_dev_bio_complete(&cl->work);
}
-static void cached_dev_read_error(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct bio *bio = &s->bio.bio;
/*
@@ -801,9 +815,9 @@ static void cached_dev_read_error(struct closure *cl)
continue_at(cl, cached_dev_read_error_done, NULL);
}
-static void cached_dev_cache_miss_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_cache_miss_done)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct bcache_device *d = s->d;
if (s->iop.replace_collision)
@@ -812,13 +826,13 @@ static void cached_dev_cache_miss_done(struct closure *cl)
if (s->iop.bio)
bio_free_pages(s->iop.bio);
- cached_dev_bio_complete(cl);
+ cached_dev_bio_complete(&cl->work);
closure_put(&d->cl);
}
-static void cached_dev_read_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
/*
@@ -858,9 +872,9 @@ static void cached_dev_read_done(struct closure *cl)
continue_at(cl, cached_dev_cache_miss_done, NULL);
}
-static void cached_dev_read_done_bh(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done_bh)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
bch_mark_cache_accounting(s->iop.c, s->d,
@@ -955,13 +969,13 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
/* Process writes */
-static void cached_dev_write_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_write_complete)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
up_read_non_owner(&dc->writeback_lock);
- cached_dev_bio_complete(cl);
+ cached_dev_bio_complete(&cl->work);
}
static void cached_dev_write(struct cached_dev *dc, struct search *s)
@@ -1048,9 +1062,9 @@ insert_data:
continue_at(cl, cached_dev_write_complete, NULL);
}
-static void cached_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_nodata)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
struct bio *bio = &s->bio.bio;
if (s->iop.flush_journal)
@@ -1228,7 +1242,7 @@ void cached_dev_submit_bio(struct bio *bio)
detached_dev_do_request(d, bio, orig_bdev, start_time);
}
-static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
@@ -1265,9 +1279,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
return MAP_CONTINUE;
}
-static void flash_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_nodata)
{
- struct search *s = container_of(cl, struct search, cl);
+ closure_type(s, struct search, cl);
if (s->iop.flush_journal)
bch_journal_meta(s->iop.c, cl);
@@ -1318,7 +1332,7 @@ void flash_dev_submit_bio(struct bio *bio)
continue_at(cl, search_free, NULL);
}
-static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
+static int flash_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
return -ENOTTY;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 38ab4856eaab..46bbef00aebb 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -34,7 +34,7 @@ struct data_insert_op {
};
unsigned int bch_get_congested(const struct cache_set *c);
-void bch_data_insert(struct closure *cl);
+CLOSURE_CALLBACK(bch_data_insert);
void bch_cached_dev_request_init(struct cached_dev *dc);
void cached_dev_submit_bio(struct bio *bio);
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 68b02216033d..0056106495a7 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -123,7 +123,7 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc)
kobject_put(&acc->day.kobj);
atomic_set(&acc->closing, 1);
- if (del_timer_sync(&acc->timer))
+ if (timer_delete_sync(&acc->timer))
closure_return(&acc->cl);
}
@@ -149,7 +149,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
static void scale_accounting(struct timer_list *t)
{
- struct cache_accounting *acc = from_timer(acc, t, timer);
+ struct cache_accounting *acc = timer_container_of(acc, t, timer);
#define move_stat(name) do { \
unsigned int t = atomic_xchg(&acc->collector.name, 0); \
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index bd3afc856d53..21b445f8af15 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -18,7 +18,6 @@ struct cache_stats {
unsigned long cache_misses;
unsigned long cache_bypass_hits;
unsigned long cache_bypass_misses;
- unsigned long cache_readaheads;
unsigned long cache_miss_collisions;
unsigned long sectors_bypassed;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index ba3909bb6bea..c17d4517af22 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -168,14 +168,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
{
const char *err;
struct cache_sb_disk *s;
- struct page *page;
+ struct folio *folio;
unsigned int i;
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
- SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
- if (IS_ERR(page))
+ folio = mapping_read_folio_gfp(bdev->bd_mapping,
+ SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
+ if (IS_ERR(folio))
return "IO error";
- s = page_address(page) + offset_in_page(SB_OFFSET);
+ s = folio_address(folio) + offset_in_folio(folio, SB_OFFSET);
sb->offset = le64_to_cpu(s->offset);
sb->version = le64_to_cpu(s->version);
@@ -272,7 +272,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
*res = s;
return NULL;
err:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- __bio_add_page(bio, virt_to_page(out), SB_SIZE,
- offset_in_page(out));
+ bio_add_virt_nofail(bio, out, SB_SIZE);
out->offset = cpu_to_le64(sb->offset);
@@ -327,9 +326,9 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
submit_bio(bio);
}
-static void bch_write_bdev_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bch_write_bdev_super_unlock)
{
- struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+ closure_type(dc, struct cached_dev, sb_write);
up(&dc->sb_write_mutex);
}
@@ -363,9 +362,9 @@ static void write_super_endio(struct bio *bio)
closure_put(&ca->set->sb_write);
}
-static void bcache_write_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bcache_write_super_unlock)
{
- struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+ closure_type(c, struct cache_set, sb_write);
up(&c->sb_write_mutex);
}
@@ -407,9 +406,9 @@ static void uuid_endio(struct bio *bio)
closure_put(cl);
}
-static void uuid_io_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(uuid_io_unlock)
{
- struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+ closure_type(c, struct cache_set, uuid_write);
up(&c->uuid_write_mutex);
}
@@ -546,7 +545,8 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
- static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+ static const char zero_uuid[16] __nonstring =
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
return uuid_find(c, zero_uuid);
}
@@ -732,9 +732,9 @@ out:
/* Bcache device */
-static int open_dev(struct block_device *b, fmode_t mode)
+static int open_dev(struct gendisk *disk, blk_mode_t mode)
{
- struct bcache_device *d = b->bd_disk->private_data;
+ struct bcache_device *d = disk->private_data;
if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
return -ENXIO;
@@ -743,14 +743,14 @@ static int open_dev(struct block_device *b, fmode_t mode)
return 0;
}
-static void release_dev(struct gendisk *b, fmode_t mode)
+static void release_dev(struct gendisk *b)
{
struct bcache_device *d = b->private_data;
closure_put(&d->cl);
}
-static int ioctl_dev(struct block_device *b, fmode_t mode,
+static int ioctl_dev(struct block_device *b, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct bcache_device *d = b->bd_disk->private_data;
@@ -881,8 +881,8 @@ static void bcache_device_free(struct bcache_device *d)
bcache_device_detach(d);
if (disk) {
- ida_simple_remove(&bcache_device_idx,
- first_minor_to_idx(disk->first_minor));
+ ida_free(&bcache_device_idx,
+ first_minor_to_idx(disk->first_minor));
put_disk(disk);
}
@@ -897,14 +897,30 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
sector_t sectors, struct block_device *cached_bdev,
const struct block_device_operations *ops)
{
- struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
+ struct queue_limits lim = {
+ .max_hw_sectors = UINT_MAX,
+ .max_sectors = UINT_MAX,
+ .max_segment_size = UINT_MAX,
+ .max_segments = BIO_MAX_VECS,
+ .max_hw_discard_sectors = UINT_MAX,
+ .io_min = block_size,
+ .logical_block_size = block_size,
+ .physical_block_size = block_size,
+ .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA,
+ };
uint64_t n;
int idx;
+ if (cached_bdev) {
+ d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT;
+ lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev));
+ }
if (!d->stripe_size)
d->stripe_size = 1 << 31;
+ else if (d->stripe_size < BCH_MIN_STRIPE_SZ)
+ d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size);
n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
if (!n || n > max_stripes) {
@@ -924,8 +940,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
if (!d->full_dirty_stripes)
goto out_free_stripe_sectors_dirty;
- idx = ida_simple_get(&bcache_device_idx, 0,
- BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
+ idx = ida_alloc_max(&bcache_device_idx, BCACHE_DEVICE_IDX_MAX - 1,
+ GFP_KERNEL);
if (idx < 0)
goto out_free_full_dirty_stripes;
@@ -933,54 +949,37 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
goto out_ida_remove;
- d->disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!d->disk)
- goto out_bioset_exit;
-
- set_capacity(d->disk, sectors);
- snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
-
- d->disk->major = bcache_major;
- d->disk->first_minor = idx_to_first_minor(idx);
- d->disk->minors = BCACHE_MINORS;
- d->disk->fops = ops;
- d->disk->private_data = d;
-
- q = d->disk->queue;
- q->limits.max_hw_sectors = UINT_MAX;
- q->limits.max_sectors = UINT_MAX;
- q->limits.max_segment_size = UINT_MAX;
- q->limits.max_segments = BIO_MAX_VECS;
- blk_queue_max_discard_sectors(q, UINT_MAX);
- q->limits.discard_granularity = 512;
- q->limits.io_min = block_size;
- q->limits.logical_block_size = block_size;
- q->limits.physical_block_size = block_size;
-
- if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
+ if (lim.logical_block_size > PAGE_SIZE && cached_bdev) {
/*
* This should only happen with BCACHE_SB_VERSION_BDEV.
* Block/page size is checked for BCACHE_SB_VERSION_CDEV.
*/
- pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
- d->disk->disk_name, q->limits.logical_block_size,
+ pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
+ idx, lim.logical_block_size,
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
/* This also adjusts physical block size/min io size if needed */
- blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
+ lim.logical_block_size = bdev_logical_block_size(cached_bdev);
}
- blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
+ d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(d->disk))
+ goto out_bioset_exit;
- blk_queue_write_cache(q, true, true);
+ set_capacity(d->disk, sectors);
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
+ d->disk->major = bcache_major;
+ d->disk->first_minor = idx_to_first_minor(idx);
+ d->disk->minors = BCACHE_MINORS;
+ d->disk->fops = ops;
+ d->disk->private_data = d;
return 0;
out_bioset_exit:
bioset_exit(&d->bio_split);
out_ida_remove:
- ida_simple_remove(&bcache_device_idx, idx);
+ ida_free(&bcache_device_idx, idx);
out_free_full_dirty_stripes:
kvfree(d->full_dirty_stripes);
out_free_stripe_sectors_dirty:
@@ -1343,9 +1342,9 @@ void bch_cached_dev_release(struct kobject *kobj)
module_put(THIS_MODULE);
}
-static void cached_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_free)
{
- struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ closure_type(dc, struct cached_dev, disk.cl);
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
@@ -1367,19 +1366,19 @@ static void cached_dev_free(struct closure *cl)
mutex_unlock(&bch_register_lock);
if (dc->sb_disk)
- put_page(virt_to_page(dc->sb_disk));
+ folio_put(virt_to_folio(dc->sb_disk));
- if (!IS_ERR_OR_NULL(dc->bdev))
- blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ if (dc->bdev_file)
+ fput(dc->bdev_file);
wake_up(&unregister_wait);
kobject_put(&dc->disk.kobj);
}
-static void cached_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_flush)
{
- struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ closure_type(dc, struct cached_dev, disk.cl);
struct bcache_device *d = &dc->disk;
mutex_lock(&bch_register_lock);
@@ -1389,7 +1388,7 @@ static void cached_dev_flush(struct closure *cl)
bch_cache_accounting_destroy(&dc->accounting);
kobject_del(&d->kobj);
- continue_at(cl, cached_dev_free, system_wq);
+ continue_at(cl, cached_dev_free, system_percpu_wq);
}
static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
@@ -1401,7 +1400,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
__module_get(THIS_MODULE);
INIT_LIST_HEAD(&dc->list);
closure_init(&dc->disk.cl, NULL);
- set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+ set_closure_fn(&dc->disk.cl, cached_dev_flush, system_percpu_wq);
kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
INIT_WORK(&dc->detach, cached_dev_detach_finish);
sema_init(&dc->sb_write_mutex, 1);
@@ -1416,11 +1415,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
}
- dc->disk.stripe_size = q->limits.io_opt >> 9;
-
- if (dc->disk.stripe_size)
- dc->partial_stripes_expensive =
- q->limits.raid_partial_stripes_expensive;
+ if (bdev_io_opt(dc->bdev))
+ dc->partial_stripes_expensive = !!(q->limits.features &
+ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE);
ret = bcache_device_init(&dc->disk, block_size,
bdev_nr_sectors(dc->bdev) - dc->sb.data_offset,
@@ -1428,9 +1425,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
if (ret)
return ret;
- blk_queue_io_opt(dc->disk.disk->queue,
- max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
-
atomic_set(&dc->io_errors, 0);
dc->io_disable = false;
dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
@@ -1445,7 +1439,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
/* Cached device - bcache superblock */
static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
- struct block_device *bdev,
+ struct file *bdev_file,
struct cached_dev *dc)
{
const char *err = "cannot allocate memory";
@@ -1453,15 +1447,15 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
int ret = -ENOMEM;
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
- dc->bdev = bdev;
- dc->bdev->bd_holder = dc;
+ dc->bdev_file = bdev_file;
+ dc->bdev = file_bdev(bdev_file);
dc->sb_disk = sb_disk;
if (cached_dev_init(dc, sb->block_size << 9))
goto err;
err = "error creating kobject";
- if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
+ if (kobject_add(&dc->disk.kobj, bdev_kobj(dc->bdev), "bcache"))
goto err;
if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
goto err;
@@ -1498,9 +1492,9 @@ void bch_flash_dev_release(struct kobject *kobj)
kfree(d);
}
-static void flash_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_free)
{
- struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ closure_type(d, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
atomic_long_sub(bcache_dev_sectors_dirty(d),
@@ -1511,15 +1505,15 @@ static void flash_dev_free(struct closure *cl)
kobject_put(&d->kobj);
}
-static void flash_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_flush)
{
- struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ closure_type(d, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
bcache_device_unlink(d);
mutex_unlock(&bch_register_lock);
kobject_del(&d->kobj);
- continue_at(cl, flash_dev_free, system_wq);
+ continue_at(cl, flash_dev_free, system_percpu_wq);
}
static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
@@ -1531,7 +1525,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
goto err_ret;
closure_init(&d->cl, NULL);
- set_closure_fn(&d->cl, flash_dev_flush, system_wq);
+ set_closure_fn(&d->cl, flash_dev_flush, system_percpu_wq);
kobject_init(&d->kobj, &bch_flash_dev_ktype);
@@ -1669,9 +1663,9 @@ void bch_cache_set_release(struct kobject *kobj)
module_put(THIS_MODULE);
}
-static void cache_set_free(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_free)
{
- struct cache_set *c = container_of(cl, struct cache_set, cl);
+ closure_type(c, struct cache_set, cl);
struct cache *ca;
debugfs_remove(c->debug);
@@ -1710,9 +1704,9 @@ static void cache_set_free(struct closure *cl)
kobject_put(&c->kobj);
}
-static void cache_set_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_flush)
{
- struct cache_set *c = container_of(cl, struct cache_set, caching);
+ closure_type(c, struct cache_set, caching);
struct cache *ca = c->cache;
struct btree *b;
@@ -1739,7 +1733,12 @@ static void cache_set_flush(struct closure *cl)
mutex_unlock(&b->write_lock);
}
- if (ca->alloc_thread)
+ /*
+ * If the register_cache_set() call to bch_cache_set_alloc() failed,
+ * ca has not been assigned a value and return error.
+ * So we need check ca is not NULL during bch_cache_set_unregister().
+ */
+ if (ca && ca->alloc_thread)
kthread_stop(ca->alloc_thread);
if (c->journal.cur) {
@@ -1807,9 +1806,9 @@ static void conditional_stop_bcache_device(struct cache_set *c,
}
}
-static void __cache_set_unregister(struct closure *cl)
+static CLOSURE_CALLBACK(__cache_set_unregister)
{
- struct cache_set *c = container_of(cl, struct cache_set, caching);
+ closure_type(c, struct cache_set, caching);
struct cached_dev *dc;
struct bcache_device *d;
size_t i;
@@ -1834,7 +1833,7 @@ static void __cache_set_unregister(struct closure *cl)
mutex_unlock(&bch_register_lock);
- continue_at(cl, cache_set_flush, system_wq);
+ continue_at(cl, cache_set_flush, system_percpu_wq);
}
void bch_cache_set_stop(struct cache_set *c)
@@ -1864,10 +1863,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
- set_closure_fn(&c->cl, cache_set_free, system_wq);
+ set_closure_fn(&c->cl, cache_set_free, system_percpu_wq);
closure_init(&c->caching, &c->cl);
- set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
+ set_closure_fn(&c->caching, __cache_set_unregister, system_percpu_wq);
/* Maybe create continue_at_noreturn() and use it here? */
closure_set_stopped(&c->cl);
@@ -1913,8 +1912,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
- sizeof(struct btree_iter_set);
+ iter_size = sizeof(struct btree_iter) +
+ ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
+ sizeof(struct btree_iter_set);
c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
if (!c->devices)
@@ -1939,7 +1939,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
if (!c->uuids)
goto err;
- c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
+ c->moving_gc_wq = alloc_workqueue("bcache_gc",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!c->moving_gc_wq)
goto err;
@@ -2017,7 +2018,7 @@ static int run_cache_set(struct cache_set *c)
c->root = bch_btree_node_get(c, NULL, k,
j->btree_level,
true, NULL);
- if (IS_ERR_OR_NULL(c->root))
+ if (IS_ERR(c->root))
goto err;
list_del_init(&c->root->list);
@@ -2088,7 +2089,7 @@ static int run_cache_set(struct cache_set *c)
err = "cannot allocate new btree root";
c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
- if (IS_ERR_OR_NULL(c->root))
+ if (IS_ERR(c->root))
goto err;
mutex_lock(&c->root->write_lock);
@@ -2216,10 +2217,10 @@ void bch_cache_release(struct kobject *kobj)
free_fifo(&ca->free[i]);
if (ca->sb_disk)
- put_page(virt_to_page(ca->sb_disk));
+ folio_put(virt_to_folio(ca->sb_disk));
- if (!IS_ERR_OR_NULL(ca->bdev))
- blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ if (ca->bdev_file)
+ fput(ca->bdev_file);
kfree(ca);
module_put(THIS_MODULE);
@@ -2236,18 +2237,50 @@ static int cache_alloc(struct cache *ca)
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
- bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0);
+ bio_init_inline(&ca->journal.bio, NULL, 8, 0);
/*
- * when ca->sb.njournal_buckets is not zero, journal exists,
- * and in bch_journal_replay(), tree node may split,
- * so bucket of RESERVE_BTREE type is needed,
- * the worst situation is all journal buckets are valid journal,
- * and all the keys need to replay,
- * so the number of RESERVE_BTREE type buckets should be as much
- * as journal buckets
+ * When the cache disk is first registered, ca->sb.njournal_buckets
+ * is zero, and it is assigned in run_cache_set().
+ *
+ * When ca->sb.njournal_buckets is not zero, journal exists,
+ * and in bch_journal_replay(), tree node may split.
+ * The worst situation is all journal buckets are valid journal,
+ * and all the keys need to replay, so the number of RESERVE_BTREE
+ * type buckets should be as much as journal buckets.
+ *
+ * If the number of RESERVE_BTREE type buckets is too few, the
+ * bch_allocator_thread() may hang up and unable to allocate
+ * bucket. The situation is roughly as follows:
+ *
+ * 1. In bch_data_insert_keys(), if the operation is not op->replace,
+ * it will call the bch_journal(), which increments the journal_ref
+ * counter. This counter is only decremented after bch_btree_insert
+ * completes.
+ *
+ * 2. When calling bch_btree_insert, if the btree needs to split,
+ * it will call btree_split() and btree_check_reserve() to check
+ * whether there are enough reserved buckets in the RESERVE_BTREE
+ * slot. If not enough, bcache_btree_root() will repeatedly retry.
+ *
+ * 3. Normally, the bch_allocator_thread is responsible for filling
+ * the reservation slots from the free_inc bucket list. When the
+ * free_inc bucket list is exhausted, the bch_allocator_thread
+ * will call invalidate_buckets() until free_inc is refilled.
+ * Then bch_allocator_thread calls bch_prio_write() once. and
+ * bch_prio_write() will call bch_journal_meta() and waits for
+ * the journal write to complete.
+ *
+ * 4. During journal_write, journal_write_unlocked() is be called.
+ * If journal full occurs, journal_reclaim() and btree_flush_write()
+ * will be called sequentially, then retry journal_write.
+ *
+ * 5. When 2 and 4 occur together, IO will hung up and cannot recover.
+ *
+ * Therefore, reserve more RESERVE_BTREE type buckets.
*/
- btree_buckets = ca->sb.njournal_buckets ?: 8;
+ btree_buckets = clamp_t(size_t, ca->sb.nbuckets >> 7,
+ 32, SB_JOURNAL_BUCKETS);
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
if (!free) {
ret = -EPERM;
@@ -2339,39 +2372,39 @@ err_free:
}
static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
- struct block_device *bdev, struct cache *ca)
+ struct file *bdev_file,
+ struct cache *ca)
{
const char *err = NULL; /* must be set for any error case */
int ret = 0;
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
- ca->bdev = bdev;
- ca->bdev->bd_holder = ca;
+ ca->bdev_file = bdev_file;
+ ca->bdev = file_bdev(bdev_file);
ca->sb_disk = sb_disk;
- if (bdev_max_discard_sectors((bdev)))
- ca->discard = CACHE_DISCARD(&ca->sb);
-
ret = cache_alloc(ca);
if (ret != 0) {
- /*
- * If we failed here, it means ca->kobj is not initialized yet,
- * kobject_put() won't be called and there is no chance to
- * call blkdev_put() to bdev in bch_cache_release(). So we
- * explicitly call blkdev_put() here.
- */
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
else if (ret == -EPERM)
err = "cache_alloc(): cache device is too small";
else
err = "cache_alloc(): unknown error";
- goto err;
+ pr_notice("error %pg: %s\n", file_bdev(bdev_file), err);
+ /*
+ * If we failed here, it means ca->kobj is not initialized yet,
+ * kobject_put() won't be called and there is no chance to
+ * call fput() to bdev in bch_cache_release(). So
+ * we explicitly call fput() on the block device here.
+ */
+ fput(bdev_file);
+ return ret;
}
- if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
- err = "error calling kobject_add";
+ if (kobject_add(&ca->kobj, bdev_kobj(file_bdev(bdev_file)), "bcache")) {
+ pr_notice("error %pg: error calling kobject_add\n",
+ file_bdev(bdev_file));
ret = -ENOMEM;
goto out;
}
@@ -2385,15 +2418,10 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
goto out;
}
- pr_info("registered cache device %pg\n", ca->bdev);
+ pr_info("registered cache device %pg\n", file_bdev(ca->bdev_file));
out:
kobject_put(&ca->kobj);
-
-err:
- if (err)
- pr_notice("error %pg: %s\n", ca->bdev, err);
-
return ret;
}
@@ -2448,7 +2476,8 @@ struct async_reg_args {
char *path;
struct cache_sb *sb;
struct cache_sb_disk *sb_disk;
- struct block_device *bdev;
+ struct file *bdev_file;
+ void *holder;
};
static void register_bdev_worker(struct work_struct *work)
@@ -2456,22 +2485,13 @@ static void register_bdev_worker(struct work_struct *work)
int fail = false;
struct async_reg_args *args =
container_of(work, struct async_reg_args, reg_work.work);
- struct cached_dev *dc;
-
- dc = kzalloc(sizeof(*dc), GFP_KERNEL);
- if (!dc) {
- fail = true;
- put_page(virt_to_page(args->sb_disk));
- blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
- goto out;
- }
mutex_lock(&bch_register_lock);
- if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
+ if (register_bdev(args->sb, args->sb_disk, args->bdev_file,
+ args->holder) < 0)
fail = true;
mutex_unlock(&bch_register_lock);
-out:
if (fail)
pr_info("error %s: fail to register backing device\n",
args->path);
@@ -2486,21 +2506,12 @@ static void register_cache_worker(struct work_struct *work)
int fail = false;
struct async_reg_args *args =
container_of(work, struct async_reg_args, reg_work.work);
- struct cache *ca;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca) {
- fail = true;
- put_page(virt_to_page(args->sb_disk));
- blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
- goto out;
- }
/* blkdev_put() will be called in bch_cache_release() */
- if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
+ if (register_cache(args->sb, args->sb_disk, args->bdev_file,
+ args->holder))
fail = true;
-out:
if (fail)
pr_info("error %s: fail to register cache device\n",
args->path);
@@ -2518,7 +2529,14 @@ static void register_device_async(struct async_reg_args *args)
INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
/* 10 jiffies is enough for a delay */
- queue_delayed_work(system_wq, &args->reg_work, 10);
+ queue_delayed_work(system_percpu_wq, &args->reg_work, 10);
+}
+
+static void *alloc_holder_object(struct cache_sb *sb)
+{
+ if (SB_IS_BDEV(sb))
+ return kzalloc(sizeof(struct cached_dev), GFP_KERNEL);
+ return kzalloc(sizeof(struct cache), GFP_KERNEL);
}
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
@@ -2528,9 +2546,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
char *path = NULL;
struct cache_sb *sb;
struct cache_sb_disk *sb_disk;
- struct block_device *bdev;
+ struct file *bdev_file, *bdev_file2;
+ void *holder = NULL;
ssize_t ret;
bool async_registration = false;
+ bool quiet = false;
#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
async_registration = true;
@@ -2559,11 +2579,30 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
ret = -EINVAL;
err = "failed to open device";
- bdev = blkdev_get_by_path(strim(path),
- FMODE_READ|FMODE_WRITE|FMODE_EXCL,
- sb);
- if (IS_ERR(bdev)) {
- if (bdev == ERR_PTR(-EBUSY)) {
+ bdev_file = bdev_file_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ goto out_free_sb;
+
+ err = read_super(sb, file_bdev(bdev_file), &sb_disk);
+ if (err)
+ goto out_blkdev_put;
+
+ holder = alloc_holder_object(sb);
+ if (!holder) {
+ ret = -ENOMEM;
+ err = "cannot allocate memory";
+ goto out_put_sb_folio;
+ }
+
+ /* Now reopen in exclusive mode with proper holder */
+ bdev_file2 = bdev_file_open_by_dev(file_bdev(bdev_file)->bd_dev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE, holder, NULL);
+ fput(bdev_file);
+ bdev_file = bdev_file2;
+ if (IS_ERR(bdev_file)) {
+ ret = PTR_ERR(bdev_file);
+ bdev_file = NULL;
+ if (ret == -EBUSY) {
dev_t dev;
mutex_lock(&bch_register_lock);
@@ -2573,20 +2612,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
else
err = "device busy";
mutex_unlock(&bch_register_lock);
- if (attr == &ksysfs_register_quiet)
- goto done;
+ if (attr == &ksysfs_register_quiet) {
+ quiet = true;
+ ret = size;
+ }
}
- goto out_free_sb;
+ goto out_free_holder;
}
- err = "failed to set blocksize";
- if (set_blocksize(bdev, 4096))
- goto out_blkdev_put;
-
- err = read_super(sb, bdev, &sb_disk);
- if (err)
- goto out_blkdev_put;
-
err = "failed to register device";
if (async_registration) {
@@ -2597,59 +2630,46 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!args) {
ret = -ENOMEM;
err = "cannot allocate memory";
- goto out_put_sb_page;
+ goto out_free_holder;
}
args->path = path;
args->sb = sb;
args->sb_disk = sb_disk;
- args->bdev = bdev;
+ args->bdev_file = bdev_file;
+ args->holder = holder;
register_device_async(args);
/* No wait and returns to user space */
goto async_done;
}
if (SB_IS_BDEV(sb)) {
- struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
-
- if (!dc) {
- ret = -ENOMEM;
- err = "cannot allocate memory";
- goto out_put_sb_page;
- }
-
mutex_lock(&bch_register_lock);
- ret = register_bdev(sb, sb_disk, bdev, dc);
+ ret = register_bdev(sb, sb_disk, bdev_file, holder);
mutex_unlock(&bch_register_lock);
/* blkdev_put() will be called in cached_dev_free() */
if (ret < 0)
goto out_free_sb;
} else {
- struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-
- if (!ca) {
- ret = -ENOMEM;
- err = "cannot allocate memory";
- goto out_put_sb_page;
- }
-
/* blkdev_put() will be called in bch_cache_release() */
- ret = register_cache(sb, sb_disk, bdev, ca);
+ ret = register_cache(sb, sb_disk, bdev_file, holder);
if (ret)
goto out_free_sb;
}
-done:
kfree(sb);
kfree(path);
module_put(THIS_MODULE);
async_done:
return size;
-out_put_sb_page:
- put_page(virt_to_page(sb_disk));
+out_free_holder:
+ kfree(holder);
+out_put_sb_folio:
+ folio_put(virt_to_folio(sb_disk));
out_blkdev_put:
- blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ if (bdev_file)
+ fput(bdev_file);
out_free_sb:
kfree(sb);
out_free_path:
@@ -2658,7 +2678,8 @@ out_free_path:
out_module_put:
module_put(THIS_MODULE);
out:
- pr_info("error %s: %s\n", path?path:"", err);
+ if (!quiet)
+ pr_info("error %s: %s\n", path?path:"", err);
return ret;
}
@@ -2882,24 +2903,25 @@ static int __init bcache_init(void)
if (bch_btree_init())
goto err;
- bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+ bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!bcache_wq)
goto err;
/*
* Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
*
- * 1. It used `system_wq` before which also does no memory reclaim.
+ * 1. It used `system_percpu_wq` before which also does no memory reclaim.
* 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
* reduced throughput can be observed.
*
- * We still want to user our own queue to not congest the `system_wq`.
+ * We still want to user our own queue to not congest the `system_percpu_wq`.
*/
- bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
+ bch_flush_wq = alloc_workqueue("bch_flush", WQ_PERCPU, 0);
if (!bch_flush_wq)
goto err;
- bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
+ bch_journal_wq = alloc_workqueue("bch_journal",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!bch_journal_wq)
goto err;
@@ -2912,7 +2934,6 @@ static int __init bcache_init(void)
goto err;
bch_debug_init();
- closure_debug_init();
bcache_is_reboot = false;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c6f677059214..72f38e5b6f5c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -134,7 +134,6 @@ read_attribute(partial_stripes_expensive);
rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
rw_attribute(io_disable);
-rw_attribute(discard);
rw_attribute(running);
rw_attribute(label);
rw_attribute(errors);
@@ -660,7 +659,7 @@ static unsigned int bch_root_usage(struct cache_set *c)
unsigned int bytes = 0;
struct bkey *k;
struct btree *b;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
goto lock_root;
@@ -702,13 +701,7 @@ static unsigned int bch_cache_max_chain(struct cache_set *c)
for (h = c->bucket_hash;
h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
h++) {
- unsigned int i = 0;
- struct hlist_node *p;
-
- hlist_for_each(p, h)
- i++;
-
- ret = max(ret, i);
+ ret = max(ret, hlist_count_nodes(h));
}
mutex_unlock(&c->bucket_lock);
@@ -866,7 +859,8 @@ STORE(__bch_cache_set)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->shrink.scan_objects(&c->shrink, &sc);
+ if (c->shrink)
+ c->shrink->scan_objects(c->shrink, &sc);
}
sysfs_strtoul_clamp(congested_read_threshold_us,
@@ -1041,7 +1035,6 @@ SHOW(__bch_cache)
sysfs_hprint(bucket_size, bucket_bytes(ca));
sysfs_hprint(block_size, block_bytes(ca));
sysfs_print(nbuckets, ca->sb.nbuckets);
- sysfs_print(discard, ca->discard);
sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
sysfs_hprint(btree_written,
atomic_long_read(&ca->btree_sectors_written) << 9);
@@ -1103,7 +1096,7 @@ SHOW(__bch_cache)
sum += INITIAL_PRIO - cached[i];
if (n)
- do_div(sum, n);
+ sum = div64_u64(sum, n);
for (i = 0; i < ARRAY_SIZE(q); i++)
q[i] = INITIAL_PRIO - cached[n * (i + 1) /
@@ -1111,26 +1104,25 @@ SHOW(__bch_cache)
vfree(p);
- ret = scnprintf(buf, PAGE_SIZE,
- "Unused: %zu%%\n"
- "Clean: %zu%%\n"
- "Dirty: %zu%%\n"
- "Metadata: %zu%%\n"
- "Average: %llu\n"
- "Sectors per Q: %zu\n"
- "Quantiles: [",
- unused * 100 / (size_t) ca->sb.nbuckets,
- available * 100 / (size_t) ca->sb.nbuckets,
- dirty * 100 / (size_t) ca->sb.nbuckets,
- meta * 100 / (size_t) ca->sb.nbuckets, sum,
- n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
+ ret = sysfs_emit(buf,
+ "Unused: %zu%%\n"
+ "Clean: %zu%%\n"
+ "Dirty: %zu%%\n"
+ "Metadata: %zu%%\n"
+ "Average: %llu\n"
+ "Sectors per Q: %zu\n"
+ "Quantiles: [",
+ unused * 100 / (size_t) ca->sb.nbuckets,
+ available * 100 / (size_t) ca->sb.nbuckets,
+ dirty * 100 / (size_t) ca->sb.nbuckets,
+ meta * 100 / (size_t) ca->sb.nbuckets, sum,
+ n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
for (i = 0; i < ARRAY_SIZE(q); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "%u ", q[i]);
+ ret += sysfs_emit_at(buf, ret, "%u ", q[i]);
ret--;
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
+ ret += sysfs_emit_at(buf, ret, "]\n");
return ret;
}
@@ -1148,18 +1140,6 @@ STORE(__bch_cache)
if (bcache_is_reboot)
return -EBUSY;
- if (attr == &sysfs_discard) {
- bool v = strtoul_or_return(buf);
-
- if (bdev_max_discard_sectors(ca->bdev))
- ca->discard = v;
-
- if (v != CACHE_DISCARD(&ca->sb)) {
- SET_CACHE_DISCARD(&ca->sb, v);
- bcache_write_super(ca->set);
- }
- }
-
if (attr == &sysfs_cache_replacement_policy) {
v = __sysfs_match_string(cache_replacement_policies, -1, buf);
if (v < 0)
@@ -1191,7 +1171,6 @@ static struct attribute *bch_cache_attrs[] = {
&sysfs_block_size,
&sysfs_nbuckets,
&sysfs_priority_stats,
- &sysfs_discard,
&sysfs_written,
&sysfs_btree_written,
&sysfs_metadata_written,
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
index a2ff6447b699..65b8bd975ab1 100644
--- a/drivers/md/bcache/sysfs.h
+++ b/drivers/md/bcache/sysfs.h
@@ -3,7 +3,7 @@
#define _BCACHE_SYSFS_H_
#define KTYPE(type) \
-struct kobj_type type ## _ktype = { \
+const struct kobj_type type ## _ktype = { \
.release = type ## _release, \
.sysfs_ops = &((const struct sysfs_ops) { \
.show = type ## _show, \
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index ae380bc3992e..410d8cb49e50 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * random utiility code, for bcache but in theory not specific to bcache
+ * random utility code, for bcache but in theory not specific to bcache
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 6f3cb7c92130..f61ab1bada6c 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,6 +4,7 @@
#define _BCACHE_UTIL_H
#include <linux/blkdev.h>
+#include <linux/closure.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
@@ -13,8 +14,6 @@
#include <linux/workqueue.h>
#include <linux/crc64.h>
-#include "closure.h"
-
struct closure;
#ifdef CONFIG_BCACHE_DEBUG
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d4a5fc0650bb..4b237074f453 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -331,26 +331,26 @@ static void dirty_init(struct keybuf_key *w)
struct dirty_io *io = w->private;
struct bio *bio = &io->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
if (!io->dc->writeback_percent)
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
bio->bi_private = w;
bch_bio_map(bio, NULL);
}
-static void dirty_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(dirty_io_destructor)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
kfree(io);
}
-static void write_dirty_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty_finish)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
@@ -400,9 +400,9 @@ static void dirty_endio(struct bio *bio)
closure_put(&io->cl);
}
-static void write_dirty(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
@@ -462,9 +462,9 @@ static void read_dirty_endio(struct bio *bio)
dirty_endio(bio);
}
-static void read_dirty_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_dirty_submit)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
closure_bio_submit(io->dc->disk.c, &io->bio, cl);
@@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(struct_size(io, bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
@@ -805,8 +805,7 @@ static int bch_writeback_thread(void *arg)
* may set BCH_ENABLE_AUTO_GC via sysfs, then when
* BCH_DO_AUTO_GC is set, garbage collection thread
* will be wake up here. After moving gc, the shrunk
- * btree and discarded free buckets SSD space may be
- * helpful for following write requests.
+ * btree may be helpful for following write requests.
*/
if (c->gc_after_writeback ==
(BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
@@ -890,6 +889,16 @@ static int bch_root_node_dirty_init(struct cache_set *c,
if (ret < 0)
pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+ /*
+ * The op may be added to cache_set's btree_cache_wait
+ * in mca_cannibalize(), must ensure it is removed from
+ * the list and release btree_cache_alloc_lock before
+ * free op memory.
+ * Otherwise, the btree_cache_wait will be damaged.
+ */
+ bch_cannibalize_unlock(c);
+ finish_wait(&c->btree_cache_wait, &(&op.op)->wait);
+
return ret;
}
@@ -898,15 +907,15 @@ static int bch_dirty_init_thread(void *arg)
struct dirty_init_thrd_info *info = arg;
struct bch_dirty_init_state *state = info->state;
struct cache_set *c = state->c;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
k = p = NULL;
- cur_idx = prev_idx = 0;
+ prev_idx = 0;
- bch_btree_iter_init(&c->root->keys, &iter, NULL);
- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k;
@@ -920,7 +929,7 @@ static int bch_dirty_init_thread(void *arg)
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
- k = bch_btree_iter_next_filter(&iter,
+ k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad);
if (k)
@@ -967,24 +976,35 @@ static int bch_btre_dirty_init_thread_nr(void)
void bch_sectors_dirty_init(struct bcache_device *d)
{
int i;
+ struct btree *b = NULL;
struct bkey *k = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct sectors_dirty_init op;
struct cache_set *c = d->c;
struct bch_dirty_init_state state;
+retry_lock:
+ b = c->root;
+ rw_lock(0, b, b->level);
+ if (b != c->root) {
+ rw_unlock(0, b);
+ goto retry_lock;
+ }
+
/* Just count root keys if no leaf node */
- rw_lock(0, c->root, c->root->level);
if (c->root->level == 0) {
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
op.count = 0;
for_each_key_filter(&c->root->keys,
- k, &iter, bch_ptr_invalid)
+ k, &iter, bch_ptr_invalid) {
+ if (KEY_INODE(k) != op.inode)
+ continue;
sectors_dirty_init_fn(&op.op, c->root, k);
+ }
- rw_unlock(0, c->root);
+ rw_unlock(0, b);
return;
}
@@ -1004,23 +1024,24 @@ void bch_sectors_dirty_init(struct bcache_device *d)
if (atomic_read(&state.enough))
break;
+ atomic_inc(&state.started);
state.infos[i].state = &state;
state.infos[i].thread =
kthread_run(bch_dirty_init_thread, &state.infos[i],
"bch_dirtcnt[%d]", i);
if (IS_ERR(state.infos[i].thread)) {
pr_err("fails to run thread bch_dirty_init[%d]\n", i);
+ atomic_dec(&state.started);
for (--i; i >= 0; i--)
kthread_stop(state.infos[i].thread);
goto out;
}
- atomic_inc(&state.started);
}
out:
/* Must wait for all threads to stop. */
wait_event(state.wait, atomic_read(&state.started) == 0);
- rw_unlock(0, c->root);
+ rw_unlock(0, b);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -1054,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
- WQ_MEM_RECLAIM, 0);
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!dc->writeback_write_wq)
return -ENOMEM;