summaryrefslogtreecommitdiff
path: root/drivers/md/bcache/writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/bcache/writeback.c')
-rw-r--r--drivers/md/bcache/writeback.c293
1 files changed, 167 insertions, 126 deletions
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c7560f66dca8..4b237074f453 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -157,6 +157,53 @@ static void __update_writeback_rate(struct cached_dev *dc)
dc->writeback_rate_target = target;
}
+static bool idle_counter_exceeded(struct cache_set *c)
+{
+ int counter, dev_nr;
+
+ /*
+ * If c->idle_counter is overflow (idel for really long time),
+ * reset as 0 and not set maximum rate this time for code
+ * simplicity.
+ */
+ counter = atomic_inc_return(&c->idle_counter);
+ if (counter <= 0) {
+ atomic_set(&c->idle_counter, 0);
+ return false;
+ }
+
+ dev_nr = atomic_read(&c->attached_dev_nr);
+ if (dev_nr == 0)
+ return false;
+
+ /*
+ * c->idle_counter is increased by writeback thread of all
+ * attached backing devices, in order to represent a rough
+ * time period, counter should be divided by dev_nr.
+ * Otherwise the idle time cannot be larger with more backing
+ * device attached.
+ * The following calculation equals to checking
+ * (counter / dev_nr) < (dev_nr * 6)
+ */
+ if (counter < (dev_nr * dev_nr * 6))
+ return false;
+
+ return true;
+}
+
+/*
+ * Idle_counter is increased every time when update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
+ * to each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting exact dirty cached
+ * devices number, c->attached_dev_nr is used to calculate the idle
+ * throushold. It might be bigger if not all cached device are in write-
+ * back mode, but it still works well with limited extra rounds of
+ * update_writeback_rate().
+ */
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
@@ -167,21 +214,8 @@ static bool set_at_max_writeback_rate(struct cache_set *c,
/* Don't set max writeback rate if gc is running */
if (!c->gc_mark_valid)
return false;
- /*
- * Idle_counter is increased everytime when update_writeback_rate() is
- * called. If all backing devices attached to the same cache set have
- * identical dc->writeback_rate_update_seconds values, it is about 6
- * rounds of update_writeback_rate() on each backing device before
- * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
- * to each dc->writeback_rate.rate.
- * In order to avoid extra locking cost for counting exact dirty cached
- * devices number, c->attached_dev_nr is used to calculate the idle
- * throushold. It might be bigger if not all cached device are in write-
- * back mode, but it still works well with limited extra rounds of
- * update_writeback_rate().
- */
- if (atomic_inc_return(&c->idle_counter) <
- atomic_read(&c->attached_dev_nr) * 6)
+
+ if (!idle_counter_exceeded(c))
return false;
if (atomic_read(&c->at_max_writeback_rate) != 1)
@@ -195,13 +229,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c,
dc->writeback_rate_change = 0;
/*
- * Check c->idle_counter and c->at_max_writeback_rate agagain in case
- * new I/O arrives during before set_at_max_writeback_rate() returns.
- * Then the writeback rate is set to 1, and its new value should be
- * decided via __update_writeback_rate().
+ * In case new I/O arrives during before
+ * set_at_max_writeback_rate() returns.
*/
- if ((atomic_read(&c->idle_counter) <
- atomic_read(&c->attached_dev_nr) * 6) ||
+ if (!idle_counter_exceeded(c) ||
!atomic_read(&c->at_max_writeback_rate))
return false;
@@ -235,19 +266,27 @@ static void update_writeback_rate(struct work_struct *work)
return;
}
- if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
- /*
- * If the whole cache set is idle, set_at_max_writeback_rate()
- * will set writeback rate to a max number. Then it is
- * unncessary to update writeback rate for an idle cache set
- * in maximum writeback rate number(s).
- */
- if (!set_at_max_writeback_rate(c, dc)) {
- down_read(&dc->writeback_lock);
+ /*
+ * If the whole cache set is idle, set_at_max_writeback_rate()
+ * will set writeback rate to a max number. Then it is
+ * unncessary to update writeback rate for an idle cache set
+ * in maximum writeback rate number(s).
+ */
+ if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
+ !set_at_max_writeback_rate(c, dc)) {
+ do {
+ if (!down_read_trylock((&dc->writeback_lock))) {
+ dc->rate_update_retry++;
+ if (dc->rate_update_retry <=
+ BCH_WBRATE_UPDATE_MAX_SKIPS)
+ break;
+ down_read(&dc->writeback_lock);
+ dc->rate_update_retry = 0;
+ }
__update_writeback_rate(dc);
update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
- }
+ } while (0);
}
@@ -292,26 +331,26 @@ static void dirty_init(struct keybuf_key *w)
struct dirty_io *io = w->private;
struct bio *bio = &io->bio;
- bio_init(bio, bio->bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
+ bio_init_inline(bio, NULL,
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
if (!io->dc->writeback_percent)
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
bio->bi_private = w;
bch_bio_map(bio, NULL);
}
-static void dirty_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(dirty_io_destructor)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
kfree(io);
}
-static void write_dirty_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty_finish)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
@@ -361,9 +400,9 @@ static void dirty_endio(struct bio *bio)
closure_put(&io->cl);
}
-static void write_dirty(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
@@ -395,7 +434,7 @@ static void write_dirty(struct closure *cl)
*/
if (KEY_DIRTY(&w->key)) {
dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+ io->bio.bi_opf = REQ_OP_WRITE;
io->bio.bi_iter.bi_sector = KEY_START(&w->key);
bio_set_dev(&io->bio, io->dc->bdev);
io->bio.bi_end_io = dirty_endio;
@@ -423,9 +462,9 @@ static void read_dirty_endio(struct bio *bio)
dirty_endio(bio);
}
-static void read_dirty_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_dirty_submit)
{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ closure_type(io, struct dirty_io, cl);
closure_bio_submit(io->dc->disk.c, &io->bio, cl);
@@ -497,9 +536,9 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(struct_size(io, bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
@@ -508,7 +547,7 @@ static void read_dirty(struct cached_dev *dc)
io->sequence = sequence++;
dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_opf = REQ_OP_READ;
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
io->bio.bi_end_io = read_dirty_endio;
@@ -585,10 +624,13 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
sectors_dirty = atomic_add_return(s,
d->stripe_sectors_dirty + stripe);
- if (sectors_dirty == d->stripe_size)
- set_bit(stripe, d->full_dirty_stripes);
- else
- clear_bit(stripe, d->full_dirty_stripes);
+ if (sectors_dirty == d->stripe_size) {
+ if (!test_bit(stripe, d->full_dirty_stripes))
+ set_bit(stripe, d->full_dirty_stripes);
+ } else {
+ if (test_bit(stripe, d->full_dirty_stripes))
+ clear_bit(stripe, d->full_dirty_stripes);
+ }
nr_sectors -= s;
stripe_offset = 0;
@@ -763,8 +805,7 @@ static int bch_writeback_thread(void *arg)
* may set BCH_ENABLE_AUTO_GC via sysfs, then when
* BCH_DO_AUTO_GC is set, garbage collection thread
* will be wake up here. After moving gc, the shrunk
- * btree and discarded free buckets SSD space may be
- * helpful for following write requests.
+ * btree may be helpful for following write requests.
*/
if (c->gc_after_writeback ==
(BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
@@ -790,10 +831,9 @@ static int bch_writeback_thread(void *arg)
}
}
- if (dc->writeback_write_wq) {
- flush_workqueue(dc->writeback_write_wq);
+ if (dc->writeback_write_wq)
destroy_workqueue(dc->writeback_write_wq);
- }
+
cached_dev_put(dc);
wait_for_kthread_stop();
@@ -802,13 +842,11 @@ static int bch_writeback_thread(void *arg)
/* Init */
#define INIT_KEYS_EACH_TIME 500000
-#define INIT_KEYS_SLEEP_MS 100
struct sectors_dirty_init {
struct btree_op op;
unsigned int inode;
size_t count;
- struct bkey start;
};
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -824,11 +862,8 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
KEY_START(k), KEY_SIZE(k));
op->count++;
- if (atomic_read(&b->c->search_inflight) &&
- !(op->count % INIT_KEYS_EACH_TIME)) {
- bkey_copy_key(&op->start, k);
- return -EAGAIN;
- }
+ if (!(op->count % INIT_KEYS_EACH_TIME))
+ cond_resched();
return MAP_CONTINUE;
}
@@ -843,24 +878,26 @@ static int bch_root_node_dirty_init(struct cache_set *c,
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
op.count = 0;
- op.start = KEY(op.inode, 0, 0);
-
- do {
- ret = bcache_btree(map_keys_recurse,
- k,
- c->root,
- &op.op,
- &op.start,
- sectors_dirty_init_fn,
- 0);
- if (ret == -EAGAIN)
- schedule_timeout_interruptible(
- msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
- else if (ret < 0) {
- pr_warn("sectors dirty init failed, ret=%d!\n", ret);
- break;
- }
- } while (ret == -EAGAIN);
+
+ ret = bcache_btree(map_keys_recurse,
+ k,
+ c->root,
+ &op.op,
+ &KEY(op.inode, 0, 0),
+ sectors_dirty_init_fn,
+ 0);
+ if (ret < 0)
+ pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+
+ /*
+ * The op may be added to cache_set's btree_cache_wait
+ * in mca_cannibalize(), must ensure it is removed from
+ * the list and release btree_cache_alloc_lock before
+ * free op memory.
+ * Otherwise, the btree_cache_wait will be damaged.
+ */
+ bch_cannibalize_unlock(c);
+ finish_wait(&c->btree_cache_wait, &(&op.op)->wait);
return ret;
}
@@ -870,15 +907,15 @@ static int bch_dirty_init_thread(void *arg)
struct dirty_init_thrd_info *info = arg;
struct bch_dirty_init_state *state = info->state;
struct cache_set *c = state->c;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
k = p = NULL;
- cur_idx = prev_idx = 0;
+ prev_idx = 0;
- bch_btree_iter_init(&c->root->keys, &iter, NULL);
- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k;
@@ -892,7 +929,7 @@ static int bch_dirty_init_thread(void *arg)
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
- k = bch_btree_iter_next_filter(&iter,
+ k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad);
if (k)
@@ -904,7 +941,6 @@ static int bch_dirty_init_thread(void *arg)
goto out;
}
skip_nr--;
- cond_resched();
}
if (p) {
@@ -914,7 +950,6 @@ static int bch_dirty_init_thread(void *arg)
p = NULL;
prev_idx = cur_idx;
- cond_resched();
}
out:
@@ -941,69 +976,72 @@ static int bch_btre_dirty_init_thread_nr(void)
void bch_sectors_dirty_init(struct bcache_device *d)
{
int i;
+ struct btree *b = NULL;
struct bkey *k = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct sectors_dirty_init op;
struct cache_set *c = d->c;
- struct bch_dirty_init_state *state;
- char name[32];
+ struct bch_dirty_init_state state;
+
+retry_lock:
+ b = c->root;
+ rw_lock(0, b, b->level);
+ if (b != c->root) {
+ rw_unlock(0, b);
+ goto retry_lock;
+ }
/* Just count root keys if no leaf node */
if (c->root->level == 0) {
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
op.count = 0;
- op.start = KEY(op.inode, 0, 0);
for_each_key_filter(&c->root->keys,
- k, &iter, bch_ptr_invalid)
+ k, &iter, bch_ptr_invalid) {
+ if (KEY_INODE(k) != op.inode)
+ continue;
sectors_dirty_init_fn(&op.op, c->root, k);
- return;
- }
+ }
- state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL);
- if (!state) {
- pr_warn("sectors dirty init failed: cannot allocate memory\n");
+ rw_unlock(0, b);
return;
}
- state->c = c;
- state->d = d;
- state->total_threads = bch_btre_dirty_init_thread_nr();
- state->key_idx = 0;
- spin_lock_init(&state->idx_lock);
- atomic_set(&state->started, 0);
- atomic_set(&state->enough, 0);
- init_waitqueue_head(&state->wait);
-
- for (i = 0; i < state->total_threads; i++) {
- /* Fetch latest state->enough earlier */
+ memset(&state, 0, sizeof(struct bch_dirty_init_state));
+ state.c = c;
+ state.d = d;
+ state.total_threads = bch_btre_dirty_init_thread_nr();
+ state.key_idx = 0;
+ spin_lock_init(&state.idx_lock);
+ atomic_set(&state.started, 0);
+ atomic_set(&state.enough, 0);
+ init_waitqueue_head(&state.wait);
+
+ for (i = 0; i < state.total_threads; i++) {
+ /* Fetch latest state.enough earlier */
smp_mb__before_atomic();
- if (atomic_read(&state->enough))
+ if (atomic_read(&state.enough))
break;
- state->infos[i].state = state;
- atomic_inc(&state->started);
- snprintf(name, sizeof(name), "bch_dirty_init[%d]", i);
-
- state->infos[i].thread =
- kthread_run(bch_dirty_init_thread,
- &state->infos[i],
- name);
- if (IS_ERR(state->infos[i].thread)) {
+ atomic_inc(&state.started);
+ state.infos[i].state = &state;
+ state.infos[i].thread =
+ kthread_run(bch_dirty_init_thread, &state.infos[i],
+ "bch_dirtcnt[%d]", i);
+ if (IS_ERR(state.infos[i].thread)) {
pr_err("fails to run thread bch_dirty_init[%d]\n", i);
+ atomic_dec(&state.started);
for (--i; i >= 0; i--)
- kthread_stop(state->infos[i].thread);
+ kthread_stop(state.infos[i].thread);
goto out;
}
}
- wait_event_interruptible(state->wait,
- atomic_read(&state->started) == 0 ||
- test_bit(CACHE_SET_IO_DISABLE, &c->flags));
-
out:
- kfree(state);
+ /* Must wait for all threads to stop. */
+ wait_event(state.wait, atomic_read(&state.started) == 0);
+ rw_unlock(0, b);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -1027,6 +1065,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_rate_fp_term_high = 1000;
dc->writeback_rate_i_term_inverse = 10000;
+ /* For dc->writeback_lock contention in update_writeback_rate() */
+ dc->rate_update_retry = 0;
+
WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
@@ -1034,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
- WQ_MEM_RECLAIM, 0);
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!dc->writeback_write_wq)
return -ENOMEM;