summaryrefslogtreecommitdiff
path: root/drivers/md/bcache
diff options
context:
space:
mode:
authorRadim Krčmář <rkrcmar@redhat.com>2018-02-01 15:04:17 +0100
committerRadim Krčmář <rkrcmar@redhat.com>2018-02-01 15:04:17 +0100
commit7bf14c28ee776be567855bd39ed8ff795ea19f55 (patch)
tree6113748c673e85fccc2c56c050697789c00c6bc2 /drivers/md/bcache
parent87cedc6be55954c6efd6eca2e694132513f65a2a (diff)
parent5fa4ec9cb2e6679e2f828033726f758ea314b9c5 (diff)
Merge branch 'x86/hyperv' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Topic branch for stable KVM clocksource under Hyper-V. Thanks to Christoffer Dall for resolving the ARM conflict.
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--drivers/md/bcache/alloc.c19
-rw-r--r--drivers/md/bcache/bcache.h24
-rw-r--r--drivers/md/bcache/btree.c10
-rw-r--r--drivers/md/bcache/closure.c47
-rw-r--r--drivers/md/bcache/closure.h60
-rw-r--r--drivers/md/bcache/debug.c7
-rw-r--r--drivers/md/bcache/io.c13
-rw-r--r--drivers/md/bcache/movinggc.c2
-rw-r--r--drivers/md/bcache/request.c29
-rw-r--r--drivers/md/bcache/super.c27
-rw-r--r--drivers/md/bcache/util.c34
-rw-r--r--drivers/md/bcache/util.h1
-rw-r--r--drivers/md/bcache/writeback.c203
-rw-r--r--drivers/md/bcache/writeback.h12
14 files changed, 347 insertions, 141 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a0cc1bc6d884..6cc6c0f9c3a9 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -525,15 +525,21 @@ struct open_bucket {
/*
* We keep multiple buckets open for writes, and try to segregate different
- * write streams for better cache utilization: first we look for a bucket where
- * the last write to it was sequential with the current write, and failing that
- * we look for a bucket that was last used by the same task.
+ * write streams for better cache utilization: first we try to segregate flash
+ * only volume write streams from cached devices, secondly we look for a bucket
+ * where the last write to it was sequential with the current write, and
+ * failing that we look for a bucket that was last used by the same task.
*
* The ideas is if you've got multiple tasks pulling data into the cache at the
* same time, you'll get better cache utilization if you try to segregate their
* data and preserve locality.
*
- * For example, say you've starting Firefox at the same time you're copying a
+ * For example, dirty sectors of a flash only volume are not reclaimable. If
+ * they are mixed with dirty sectors of a cached device in the same bucket,
+ * that bucket stays marked dirty and won't be reclaimed, even after the dirty
+ * data of the cached device has been written back to the backing device.
+ *
+ * Also, say you're starting Firefox at the same time you're copying a
* bunch of files. Firefox will likely end up being fairly hot and stay in the
* cache awhile, but the data you copied might not be; if you wrote all that
* data to the same buckets it'd get invalidated at the same time.
@@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
struct open_bucket *ret, *ret_task = NULL;
list_for_each_entry_reverse(ret, &c->data_buckets, list)
- if (!bkey_cmp(&ret->key, search))
+ if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
+ UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
+ continue;
+ else if (!bkey_cmp(&ret->key, search))
goto found;
else if (ret->last_write_point == write_point)
ret_task = ret;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 843877e017e1..5e2d4e80198e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -320,14 +320,15 @@ struct cached_dev {
*/
atomic_t has_dirty;
- struct bch_ratelimit writeback_rate;
- struct delayed_work writeback_rate_update;
-
/*
- * Internal to the writeback code, so read_dirty() can keep track of
- * where it's at.
+ * Set to zero by things that touch the backing volume-- except
+ * writeback. Incremented by writeback. Used to determine when to
+ * accelerate idle writeback.
*/
- sector_t last_read;
+ atomic_t backing_idle;
+
+ struct bch_ratelimit writeback_rate;
+ struct delayed_work writeback_rate_update;
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
@@ -336,6 +337,14 @@ struct cached_dev {
struct keybuf writeback_keys;
+ /*
+ * Order the write-half of writeback operations strongly in dispatch
+ * order. (Maintain LBA order; don't allow reads completing out of
+ * order to re-order the writes...)
+ */
+ struct closure_waitlist writeback_ordering_wait;
+ atomic_t writeback_sequence_next;
+
/* For tracking sequential IO */
#define RECENT_IO_BITS 7
#define RECENT_IO (1 << RECENT_IO_BITS)
@@ -488,6 +497,7 @@ struct cache_set {
int caches_loaded;
struct bcache_device **devices;
+ unsigned devices_max_used;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
struct closure caching;
@@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c)
/* Forward declarations */
-void bch_count_io_errors(struct cache *, blk_status_t, const char *);
+void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
blk_status_t, const char *);
void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81e8dc3dbe5e..bf3a48aa9a9a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
- if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+ if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b)
continue_at(cl, btree_node_write_done, NULL);
} else {
+ /* No problem for multipage bvec since the bio is just allocated */
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
@@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
/* don't reclaim buckets to which writeback keys point */
rcu_read_lock();
- for (i = 0; i < c->nr_uuids; i++) {
+ for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i];
struct cached_dev *dc;
struct keybuf_key *w, *n;
@@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg)
int bch_gc_thread_start(struct cache_set *c)
{
c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
- if (IS_ERR(c->gc_thread))
- return PTR_ERR(c->gc_thread);
-
- return 0;
+ return PTR_ERR_OR_ZERO(c->gc_thread);
}
/* Initial partial gc */
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 1841d0359bac..7f12920c14f7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -8,6 +8,7 @@
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
#include "closure.h"
@@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
BUG_ON(flags & CLOSURE_GUARD_MASK);
BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
- /* Must deliver precisely one wakeup */
- if (r == 1 && (flags & CLOSURE_SLEEPING))
- wake_up_process(cl->task);
-
if (!r) {
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
@@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
}
EXPORT_SYMBOL(closure_wait);
-/**
- * closure_sync - sleep until a closure has nothing left to wait on
- *
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
- * the last refcount.
- */
-void closure_sync(struct closure *cl)
+struct closure_syncer {
+ struct task_struct *task;
+ int done;
+};
+
+static void closure_sync_fn(struct closure *cl)
{
- while (1) {
- __closure_start_sleep(cl);
- closure_set_ret_ip(cl);
+ cl->s->done = 1;
+ wake_up_process(cl->s->task);
+}
- if ((atomic_read(&cl->remaining) &
- CLOSURE_REMAINING_MASK) == 1)
- break;
+void __sched __closure_sync(struct closure *cl)
+{
+ struct closure_syncer s = { .task = current };
+ cl->s = &s;
+ continue_at(cl, closure_sync_fn, NULL);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (s.done)
+ break;
schedule();
}
- __closure_end_sleep(cl);
+ __set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL(closure_sync);
+EXPORT_SYMBOL(__closure_sync);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data)
cl, (void *) cl->ip, cl->fn, cl->parent,
r & CLOSURE_REMAINING_MASK);
- seq_printf(f, "%s%s%s%s\n",
+ seq_printf(f, "%s%s\n",
test_bit(WORK_STRUCT_PENDING_BIT,
work_data_bits(&cl->work)) ? "Q" : "",
- r & CLOSURE_RUNNING ? "R" : "",
- r & CLOSURE_STACK ? "S" : "",
- r & CLOSURE_SLEEPING ? "Sl" : "");
+ r & CLOSURE_RUNNING ? "R" : "");
if (r & CLOSURE_WAITING)
seq_printf(f, " W %pF\n",
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index ccfbea6f9f6b..3b9dfc9962ad 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -103,6 +103,7 @@
*/
struct closure;
+struct closure_syncer;
typedef void (closure_fn) (struct closure *);
struct closure_waitlist {
@@ -115,10 +116,6 @@ enum closure_state {
* the thread that owns the closure, and cleared by the thread that's
* waking up the closure.
*
- * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
- * - indicates that cl->task is valid and closure_put() may wake it up.
- * Only set or cleared by the thread that owns the closure.
- *
* The rest are for debugging and don't affect behaviour:
*
* CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -128,22 +125,16 @@ enum closure_state {
* continue_at() and closure_return() clear it for you, if you're doing
* something unusual you can use closure_set_dead() which also helps
* annotate where references are being transferred.
- *
- * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
- * closure with this flag set
*/
- CLOSURE_BITS_START = (1 << 23),
- CLOSURE_DESTRUCTOR = (1 << 23),
- CLOSURE_WAITING = (1 << 25),
- CLOSURE_SLEEPING = (1 << 27),
- CLOSURE_RUNNING = (1 << 29),
- CLOSURE_STACK = (1 << 31),
+ CLOSURE_BITS_START = (1U << 26),
+ CLOSURE_DESTRUCTOR = (1U << 26),
+ CLOSURE_WAITING = (1U << 28),
+ CLOSURE_RUNNING = (1U << 30),
};
#define CLOSURE_GUARD_MASK \
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
- CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
@@ -152,7 +143,7 @@ struct closure {
union {
struct {
struct workqueue_struct *wq;
- struct task_struct *task;
+ struct closure_syncer *s;
struct llist_node list;
closure_fn *fn;
};
@@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v);
void closure_put(struct closure *cl);
void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void closure_sync(struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+ * closure_sync - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+ __closure_sync(cl);
+}
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f)
#endif
}
-static inline void __closure_end_sleep(struct closure *cl)
-{
- __set_current_state(TASK_RUNNING);
-
- if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
- atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
-}
-
-static inline void __closure_start_sleep(struct closure *cl)
-{
- closure_set_ip(cl);
- cl->task = current;
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
- atomic_add(CLOSURE_SLEEPING, &cl->remaining);
-}
-
static inline void closure_set_stopped(struct closure *cl)
{
atomic_sub(CLOSURE_RUNNING, &cl->remaining);
@@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl)
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
struct workqueue_struct *wq)
{
- BUG_ON(object_is_on_stack(cl));
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
@@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
static inline void closure_init_stack(struct closure *cl)
{
memset(cl, 0, sizeof(struct closure));
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
}
/**
@@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list)
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
+ *
+ * Note you are expected to immediately return after using this macro.
*/
#define continue_at(_cl, _fn, _wq) \
do { \
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c7a02c4900da..af89408befe8 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
return;
check->bi_opf = REQ_OP_READ;
- if (bio_alloc_pages(check, GFP_NOIO))
+ if (bch_bio_alloc_pages(check, GFP_NOIO))
goto out_put;
submit_bio_wait(check);
@@ -251,8 +251,7 @@ void bch_debug_exit(void)
int __init bch_debug_init(struct kobject *kobj)
{
- int ret = 0;
-
debug = debugfs_create_dir("bcache", NULL);
- return ret;
+
+ return IS_ERR_OR_NULL(debug);
}
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fac97ec2d0e2..a783c5a41ff1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
/* IO errors */
-void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
+void bch_count_io_errors(struct cache *ca,
+ blk_status_t error,
+ int is_read,
+ const char *m)
{
/*
* The halflife of an error is:
@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
errors >>= IO_ERROR_SHIFT;
if (errors < ca->set->error_limit)
- pr_err("%s: IO error on %s, recovering",
- bdevname(ca->bdev, buf), m);
+ pr_err("%s: IO error on %s%s",
+ bdevname(ca->bdev, buf), m,
+ is_read ? ", recovering." : ".");
else
bch_cache_set_error(ca->set,
"%s: too many IO errors %s",
@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct cache *ca = PTR_CACHE(c, &b->key, 0);
+ int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
unsigned threshold = op_is_write(bio_op(bio))
? c->congested_write_threshold_us
@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
atomic_inc(&c->congested);
}
- bch_count_io_errors(ca, error, m);
+ bch_count_io_errors(ca, error, is_read, m);
}
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index d50c1c97da68..a24c3a95b2c0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c)
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_end_io = read_moving_endio;
- if (bio_alloc_pages(bio, GFP_KERNEL))
+ if (bch_bio_alloc_pages(bio, GFP_KERNEL))
goto err;
trace_bcache_gc_copy(&w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 643c3021624f..1a46b41dac70 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl)
{
struct search *s = container_of(cl, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
+ struct cached_dev *dc;
int ret;
bch_btree_op_init(&s->op, -1);
@@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl)
return;
}
+ /*
+ * Searching the btree may fail, in which case ret is negative. In
+ * that scenario we must not recover data from the backing device
+ * (when the cache device is dirty), because we don't know whether
+ * the bkeys covered by the read request are all clean.
+ *
+ * When that happens, s->iop.status still holds the initial value it
+ * had before s->bio.bio was submitted.
+ */
+ if (ret < 0) {
+ BUG_ON(ret == -EINTR);
+ if (s->d && s->d->c &&
+ !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
+ dc = container_of(s->d, struct cached_dev, disk);
+ if (dc && atomic_read(&dc->has_dirty))
+ s->recoverable = false;
+ }
+ if (!s->iop.status)
+ s->iop.status = BLK_STS_IOERR;
+ }
+
closure_return(cl);
}
@@ -611,8 +633,8 @@ static void request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- struct request_queue *q = s->orig_bio->bi_disk->queue;
- generic_end_io_acct(q, bio_data_dir(s->orig_bio),
+ generic_end_io_acct(s->d->disk->queue,
+ bio_data_dir(s->orig_bio),
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
@@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
cache_bio->bi_private = &s->cl;
bch_bio_map(cache_bio, NULL);
- if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
+ if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
goto out_put;
if (reada)
@@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
int rw = bio_data_dir(bio);
+ atomic_set(&dc->backing_idle, 0);
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b4d28928dec5..133b81225ea9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio)
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
- struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+ struct cache_sb *out = page_address(bio_first_page_all(bio));
unsigned i;
bio->bi_iter.bi_sector = SB_SECTOR;
@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
- bch_count_io_errors(ca, bio->bi_status, "writing superblock");
+ /* is_read = 0 */
+ bch_count_io_errors(ca, bio->bi_status, 0,
+ "writing superblock");
closure_put(&ca->set->sb_write);
}
@@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
d->c = c;
c->devices[id] = d;
+ if (id >= c->devices_max_used)
+ c->devices_max_used = id + 1;
+
closure_get(&c->caching);
}
@@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_lock(&bch_register_lock);
+ cancel_delayed_work_sync(&dc->writeback_rate_update);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+ kthread_stop(dc->writeback_thread);
+ dc->writeback_thread = NULL;
+ }
+
memset(&dc->sb.set_uuid, 0, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
@@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
dc->bdev->bd_holder = dc;
bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
- dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (cached_dev_init(dc, sb->block_size << 9))
@@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c)
struct uuid_entry *u;
for (u = c->uuids;
- u < c->uuids + c->nr_uuids && !ret;
+ u < c->uuids + c->devices_max_used && !ret;
u++)
if (UUID_FLASH_ONLY(u))
ret = flash_dev_run(c, u);
@@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl)
mutex_lock(&bch_register_lock);
- for (i = 0; i < c->nr_uuids; i++)
+ for (i = 0; i < c->devices_max_used; i++)
if (c->devices[i]) {
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
@@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
-
+ c->devices_max_used = 0;
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj)
free_fifo(&ca->free[i]);
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
- put_page(ca->sb_bio.bi_io_vec[0].bv_page);
+ put_page(bio_first_page_all(&ca->sb_bio));
if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
ca->bdev->bd_holder = ca;
bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
- ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (blk_queue_discard(bdev_get_queue(ca->bdev)))
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index e548b8b51322..a23cd6a14b74 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
: 0;
}
+/*
+ * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly,
+ * the preferred way is bio_add_page, but in this case, bch_bio_map()
+ * supposes that the bvec table is empty, so it is safe to access
+ * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is
+ * supported.
+ */
void bch_bio_map(struct bio *bio, void *base)
{
size_t size = bio->bi_iter.bi_size;
@@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
+/**
+ * bch_bio_alloc_pages - allocates a single page for each bvec in a bio
+ * @bio: bio to allocate pages for
+ * @gfp_mask: flags for allocation
+ *
+ * Allocates pages up to @bio->bi_vcnt.
+ *
+ * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+ * freed.
+ */
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+ int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ bv->bv_page = alloc_page(gfp_mask);
+ if (!bv->bv_page) {
+ while (--bv >= bio->bi_io_vec)
+ __free_page(bv->bv_page);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
* use permitted, subject to terms of PostgreSQL license; see.)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ed5e8a412eb8..4df4c5c1cab2 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch_bio_map(struct bio *bio, void *base);
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 56a37884ca8b..51306a19ab03 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -18,17 +18,39 @@
#include <trace/events/bcache.h>
/* Rate limiting */
-
-static void __update_writeback_rate(struct cached_dev *dc)
+static uint64_t __calc_target_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
+
+ /*
+ * This is the size of the cache, minus the amount used for
+ * flash-only devices
+ */
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
bcache_flash_devs_sectors_dirty(c);
+
+ /*
+ * Unfortunately there is no control of global dirty data. If the
+ * user states that they want 10% dirty data in the cache, and has,
+ * e.g., 5 backing volumes of equal size, we try and ensure each
+ * backing volume uses about 2% of the cache for dirty data.
+ */
+ uint32_t bdev_share =
+ div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
+ c->cached_dev_sectors);
+
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
- int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
- c->cached_dev_sectors);
+ /* Ensure each backing dev gets at least one dirty share */
+ if (bdev_share < 1)
+ bdev_share = 1;
+
+ return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
+}
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
/*
* PI controller:
* Figures out the amount that should be written per second.
@@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
* This acts as a slow, long-term average that is not subject to
* variations in usage like the p term.
*/
+ int64_t target = __calc_target_rate(dc);
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t error = dirty - target;
int64_t proportional_scaled =
@@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
struct dirty_io {
struct closure cl;
struct cached_dev *dc;
+ uint16_t sequence;
struct bio bio;
};
@@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
+ struct cached_dev *dc = io->dc;
+
+ uint16_t next_sequence;
+
+ if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
+ /* Not our turn to write; wait for a write to complete */
+ closure_wait(&dc->writeback_ordering_wait, cl);
+
+ if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
+ /*
+ * Edge case-- it happened in indeterminate order
+ * relative to when we were added to wait list..
+ */
+ closure_wake_up(&dc->writeback_ordering_wait);
+ }
+
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+ return;
+ }
+
+ next_sequence = io->sequence + 1;
/*
* IO errors are signalled using the dirty bit on the key.
@@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl);
}
+ atomic_set(&dc->writeback_sequence_next, next_sequence);
+ closure_wake_up(&dc->writeback_ordering_wait);
+
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
@@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio)
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
+ /* is_read = 1 */
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
- bio->bi_status, "reading dirty data from cache");
+ bio->bi_status, 1,
+ "reading dirty data from cache");
dirty_endio(bio);
}
@@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl)
static void read_dirty(struct cached_dev *dc)
{
unsigned delay = 0;
- struct keybuf_key *w;
+ struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
+ size_t size;
+ int nk, i;
struct dirty_io *io;
struct closure cl;
+ uint16_t sequence = 0;
+ BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
+ atomic_set(&dc->writeback_sequence_next, sequence);
closure_init_stack(&cl);
/*
@@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc)
* mempools.
*/
- while (!kthread_should_stop()) {
-
- w = bch_keybuf_next(&dc->writeback_keys);
- if (!w)
- break;
-
- BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
-
- if (KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)
- while (!kthread_should_stop() && delay)
- delay = schedule_timeout_interruptible(delay);
-
- dc->last_read = KEY_OFFSET(&w->key);
-
- io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
- * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
- GFP_KERNEL);
- if (!io)
- goto err;
-
- w->private = io;
- io->dc = dc;
-
- dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
- io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
- bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
- io->bio.bi_end_io = read_dirty_endio;
-
- if (bio_alloc_pages(&io->bio, GFP_KERNEL))
- goto err_free;
+ next = bch_keybuf_next(&dc->writeback_keys);
+
+ while (!kthread_should_stop() && next) {
+ size = 0;
+ nk = 0;
+
+ do {
+ BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
+
+ /*
+ * Don't combine too many operations, even if they
+ * are all small.
+ */
+ if (nk >= MAX_WRITEBACKS_IN_PASS)
+ break;
+
+ /*
+ * If the current operation is very large, don't
+ * further combine operations.
+ */
+ if (size >= MAX_WRITESIZE_IN_PASS)
+ break;
+
+ /*
+ * Operations are only eligible to be combined
+ * if they are contiguous.
+ *
+ * TODO: add a heuristic willing to fire a
+ * certain amount of non-contiguous IO per pass,
+ * so that we can benefit from backing device
+ * command queueing.
+ */
+ if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
+ &START_KEY(&next->key)))
+ break;
+
+ size += KEY_SIZE(&next->key);
+ keys[nk++] = next;
+ } while ((next = bch_keybuf_next(&dc->writeback_keys)));
+
+ /* Now we have gathered a set of 1..5 keys to write back. */
+ for (i = 0; i < nk; i++) {
+ w = keys[i];
+
+ io = kzalloc(sizeof(struct dirty_io) +
+ sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ w->private = io;
+ io->dc = dc;
+ io->sequence = sequence++;
+
+ dirty_init(w);
+ bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
+ bio_set_dev(&io->bio,
+ PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
+ io->bio.bi_end_io = read_dirty_endio;
+
+ if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
+ goto err_free;
+
+ trace_bcache_writeback(&w->key);
+
+ down(&dc->in_flight);
+
+ /* We've acquired a semaphore for the maximum
+ * simultaneous number of writebacks; from here
+ * everything happens asynchronously.
+ */
+ closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ }
- trace_bcache_writeback(&w->key);
+ delay = writeback_delay(dc, size);
- down(&dc->in_flight);
- closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ /* If the control system would wait for at least half a
+ * second, and there's been no reqs hitting the backing disk
+ * for awhile: use an alternate mode where we have at most
+ * one contiguous set of writebacks in flight at a time. If
+ * someone wants to do IO it will be quick, as it will only
+ * have to contend with one operation in flight, and we'll
+ * be round-tripping data to the backing disk as quickly as
+ * it can accept it.
+ */
+ if (delay >= HZ / 2) {
+ /* 3 means at least 1.5 seconds, up to 7.5 if we
+ * have slowed way down.
+ */
+ if (atomic_inc_return(&dc->backing_idle) >= 3) {
+ /* Wait for current I/Os to finish */
+ closure_sync(&cl);
+ /* And immediately launch a new set. */
+ delay = 0;
+ }
+ }
- delay = writeback_delay(dc, KEY_SIZE(&w->key));
+ while (!kthread_should_stop() && delay) {
+ schedule_timeout_interruptible(delay);
+ delay = writeback_delay(dc, 0);
+ }
}
if (0) {
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index a9e3ffb4b03c..66f1c527fa24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,16 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define MAX_WRITEBACKS_IN_PASS 5
+#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
+
+/*
+ * 14 (16384ths) is chosen here as something that each backing device
+ * should be a reasonable fraction of the share, and not to blow up
+ * until individual backing devices are a petabyte.
+ */
+#define WRITEBACK_SHARE_SHIFT 14
+
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
{
uint64_t i, ret = 0;
@@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
mutex_lock(&bch_register_lock);
- for (i = 0; i < c->nr_uuids; i++) {
+ for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i];
if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))