Diffstat (limited to 'drivers/md/bcache/writeback.c')
-rw-r--r--   drivers/md/bcache/writeback.c | 293
1 file changed, 167 insertions, 126 deletions
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c7560f66dca8..4b237074f453 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -157,6 +157,53 @@ static void __update_writeback_rate(struct cached_dev *dc)
         dc->writeback_rate_target = target;
 }
 
+static bool idle_counter_exceeded(struct cache_set *c)
+{
+        int counter, dev_nr;
+
+        /*
+         * If c->idle_counter is overflow (idel for really long time),
+         * reset as 0 and not set maximum rate this time for code
+         * simplicity.
+         */
+        counter = atomic_inc_return(&c->idle_counter);
+        if (counter <= 0) {
+                atomic_set(&c->idle_counter, 0);
+                return false;
+        }
+
+        dev_nr = atomic_read(&c->attached_dev_nr);
+        if (dev_nr == 0)
+                return false;
+
+        /*
+         * c->idle_counter is increased by writeback thread of all
+         * attached backing devices, in order to represent a rough
+         * time period, counter should be divided by dev_nr.
+         * Otherwise the idle time cannot be larger with more backing
+         * device attached.
+         * The following calculation equals to checking
+         * (counter / dev_nr) < (dev_nr * 6)
+         */
+        if (counter < (dev_nr * dev_nr * 6))
+                return false;
+
+        return true;
+}
+
+/*
+ * Idle_counter is increased every time when update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
+ * to each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting exact dirty cached
+ * devices number, c->attached_dev_nr is used to calculate the idle
+ * throushold. It might be bigger if not all cached device are in write-
+ * back mode, but it still works well with limited extra rounds of
+ * update_writeback_rate().
+ */
 static bool set_at_max_writeback_rate(struct cache_set *c,
                                       struct cached_dev *dc)
 {
@@ -167,21 +214,8 @@ static bool set_at_max_writeback_rate(struct cache_set *c,
         /* Don't set max writeback rate if gc is running */
         if (!c->gc_mark_valid)
                 return false;
-        /*
-         * Idle_counter is increased everytime when update_writeback_rate() is
-         * called. If all backing devices attached to the same cache set have
-         * identical dc->writeback_rate_update_seconds values, it is about 6
-         * rounds of update_writeback_rate() on each backing device before
-         * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
-         * to each dc->writeback_rate.rate.
-         * In order to avoid extra locking cost for counting exact dirty cached
-         * devices number, c->attached_dev_nr is used to calculate the idle
-         * throushold. It might be bigger if not all cached device are in write-
-         * back mode, but it still works well with limited extra rounds of
-         * update_writeback_rate().
-         */
-        if (atomic_inc_return(&c->idle_counter) <
-            atomic_read(&c->attached_dev_nr) * 6)
+
+        if (!idle_counter_exceeded(c))
                 return false;
 
         if (atomic_read(&c->at_max_writeback_rate) != 1)
@@ -195,13 +229,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c,
         dc->writeback_rate_change = 0;
 
         /*
-         * Check c->idle_counter and c->at_max_writeback_rate agagain in case
-         * new I/O arrives during before set_at_max_writeback_rate() returns.
-         * Then the writeback rate is set to 1, and its new value should be
-         * decided via __update_writeback_rate().
+         * In case new I/O arrives during before
+         * set_at_max_writeback_rate() returns.
          */
-        if ((atomic_read(&c->idle_counter) <
-             atomic_read(&c->attached_dev_nr) * 6) ||
+        if (!idle_counter_exceeded(c) ||
             !atomic_read(&c->at_max_writeback_rate))
                 return false;
 
@@ -235,19 +266,27 @@ static void update_writeback_rate(struct work_struct *work)
                 return;
         }
 
-        if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
-                /*
-                 * If the whole cache set is idle, set_at_max_writeback_rate()
-                 * will set writeback rate to a max number. Then it is
-                 * unncessary to update writeback rate for an idle cache set
-                 * in maximum writeback rate number(s).
-                 */
-                if (!set_at_max_writeback_rate(c, dc)) {
-                        down_read(&dc->writeback_lock);
+        /*
+         * If the whole cache set is idle, set_at_max_writeback_rate()
+         * will set writeback rate to a max number. Then it is
+         * unncessary to update writeback rate for an idle cache set
+         * in maximum writeback rate number(s).
+         */
+        if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
+            !set_at_max_writeback_rate(c, dc)) {
+                do {
+                        if (!down_read_trylock((&dc->writeback_lock))) {
+                                dc->rate_update_retry++;
+                                if (dc->rate_update_retry <=
+                                    BCH_WBRATE_UPDATE_MAX_SKIPS)
+                                        break;
+                                down_read(&dc->writeback_lock);
+                                dc->rate_update_retry = 0;
+                        }
                         __update_writeback_rate(dc);
                         update_gc_after_writeback(c);
                         up_read(&dc->writeback_lock);
-                }
+                } while (0);
         }
 
@@ -292,26 +331,26 @@ static void dirty_init(struct keybuf_key *w)
         struct dirty_io *io = w->private;
         struct bio *bio = &io->bio;
 
-        bio_init(bio, bio->bi_inline_vecs,
-                 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
+        bio_init_inline(bio, NULL,
+                        DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
         if (!io->dc->writeback_percent)
-                bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+                bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
 
         bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
         bio->bi_private = w;
         bch_bio_map(bio, NULL);
 }
 
-static void dirty_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(dirty_io_destructor)
 {
-        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+        closure_type(io, struct dirty_io, cl);
 
         kfree(io);
 }
 
-static void write_dirty_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty_finish)
 {
-        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+        closure_type(io, struct dirty_io, cl);
         struct keybuf_key *w = io->bio.bi_private;
         struct cached_dev *dc = io->dc;
 
@@ -361,9 +400,9 @@ static void dirty_endio(struct bio *bio)
         closure_put(&io->cl);
 }
 
-static void write_dirty(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty)
 {
-        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+        closure_type(io, struct dirty_io, cl);
         struct keybuf_key *w = io->bio.bi_private;
         struct cached_dev *dc = io->dc;
 
@@ -395,7 +434,7 @@ static void write_dirty(struct closure *cl)
          */
         if (KEY_DIRTY(&w->key)) {
                 dirty_init(w);
-                bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+                io->bio.bi_opf = REQ_OP_WRITE;
                 io->bio.bi_iter.bi_sector = KEY_START(&w->key);
                 bio_set_dev(&io->bio, io->dc->bdev);
                 io->bio.bi_end_io = dirty_endio;
@@ -423,9 +462,9 @@ static void read_dirty_endio(struct bio *bio)
         dirty_endio(bio);
 }
 
-static void read_dirty_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_dirty_submit)
 {
-        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+        closure_type(io, struct dirty_io, cl);
 
         closure_bio_submit(io->dc->disk.c, &io->bio, cl);
 
@@ -497,9 +536,9 @@ static void read_dirty(struct cached_dev *dc)
                 for (i = 0; i < nk; i++) {
                         w = keys[i];
 
-                        io = kzalloc(struct_size(io, bio.bi_inline_vecs,
-                                     DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
-                                     GFP_KERNEL);
+                        io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+                                     DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+                                     GFP_KERNEL);
                         if (!io)
                                 goto err;
 
@@ -508,7 +547,7 @@ static void read_dirty(struct cached_dev *dc)
                         io->sequence = sequence++;
 
                         dirty_init(w);
-                        bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+                        io->bio.bi_opf = REQ_OP_READ;
                        io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
                         bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
                         io->bio.bi_end_io = read_dirty_endio;
@@ -585,10 +624,13 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
 
                 sectors_dirty = atomic_add_return(s,
                                         d->stripe_sectors_dirty + stripe);
-                if (sectors_dirty == d->stripe_size)
-                        set_bit(stripe, d->full_dirty_stripes);
-                else
-                        clear_bit(stripe, d->full_dirty_stripes);
+                if (sectors_dirty == d->stripe_size) {
+                        if (!test_bit(stripe, d->full_dirty_stripes))
+                                set_bit(stripe, d->full_dirty_stripes);
+                } else {
+                        if (test_bit(stripe, d->full_dirty_stripes))
+                                clear_bit(stripe, d->full_dirty_stripes);
+                }
 
                 nr_sectors -= s;
                 stripe_offset = 0;
@@ -763,8 +805,7 @@ static int bch_writeback_thread(void *arg)
                          * may set BCH_ENABLE_AUTO_GC via sysfs, then when
                          * BCH_DO_AUTO_GC is set, garbage collection thread
                          * will be wake up here. After moving gc, the shrunk
-                         * btree and discarded free buckets SSD space may be
-                         * helpful for following write requests.
+                         * btree may be helpful for following write requests.
                          */
                         if (c->gc_after_writeback ==
                             (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
@@ -790,10 +831,9 @@ static int bch_writeback_thread(void *arg)
                 }
         }
 
-        if (dc->writeback_write_wq) {
-                flush_workqueue(dc->writeback_write_wq);
+        if (dc->writeback_write_wq)
                 destroy_workqueue(dc->writeback_write_wq);
-        }
+
         cached_dev_put(dc);
         wait_for_kthread_stop();
 
@@ -802,13 +842,11 @@
 /* Init */
 
 #define INIT_KEYS_EACH_TIME 500000
-#define INIT_KEYS_SLEEP_MS 100
 
 struct sectors_dirty_init {
         struct btree_op op;
         unsigned int inode;
         size_t count;
-        struct bkey start;
 };
 
 static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -824,11 +862,8 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
                                              KEY_START(k), KEY_SIZE(k));
 
         op->count++;
-        if (atomic_read(&b->c->search_inflight) &&
-            !(op->count % INIT_KEYS_EACH_TIME)) {
-                bkey_copy_key(&op->start, k);
-                return -EAGAIN;
-        }
+        if (!(op->count % INIT_KEYS_EACH_TIME))
+                cond_resched();
 
         return MAP_CONTINUE;
 }
@@ -843,24 +878,26 @@ static int bch_root_node_dirty_init(struct cache_set *c,
         bch_btree_op_init(&op.op, -1);
         op.inode = d->id;
         op.count = 0;
-        op.start = KEY(op.inode, 0, 0);
-
-        do {
-                ret = bcache_btree(map_keys_recurse,
-                                   k,
-                                   c->root,
-                                   &op.op,
-                                   &op.start,
-                                   sectors_dirty_init_fn,
-                                   0);
-                if (ret == -EAGAIN)
-                        schedule_timeout_interruptible(
-                                msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
-                else if (ret < 0) {
-                        pr_warn("sectors dirty init failed, ret=%d!\n", ret);
-                        break;
-                }
-        } while (ret == -EAGAIN);
+
+        ret = bcache_btree(map_keys_recurse,
+                           k,
+                           c->root,
+                           &op.op,
+                           &KEY(op.inode, 0, 0),
+                           sectors_dirty_init_fn,
+                           0);
+        if (ret < 0)
+                pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+
+        /*
+         * The op may be added to cache_set's btree_cache_wait
+         * in mca_cannibalize(), must ensure it is removed from
+         * the list and release btree_cache_alloc_lock before
+         * free op memory.
+         * Otherwise, the btree_cache_wait will be damaged.
+         */
+        bch_cannibalize_unlock(c);
+        finish_wait(&c->btree_cache_wait, &(&op.op)->wait);
 
         return ret;
 }
@@ -870,15 +907,15 @@ static int bch_dirty_init_thread(void *arg)
         struct dirty_init_thrd_info *info = arg;
         struct bch_dirty_init_state *state = info->state;
         struct cache_set *c = state->c;
-        struct btree_iter iter;
+        struct btree_iter_stack iter;
         struct bkey *k, *p;
         int cur_idx, prev_idx, skip_nr;
 
         k = p = NULL;
-        cur_idx = prev_idx = 0;
+        prev_idx = 0;
 
-        bch_btree_iter_init(&c->root->keys, &iter, NULL);
-        k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+        bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+        k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
         BUG_ON(!k);
         p = k;
 
@@ -892,7 +929,7 @@ static int bch_dirty_init_thread(void *arg)
                 skip_nr = cur_idx - prev_idx;
 
                 while (skip_nr) {
-                        k = bch_btree_iter_next_filter(&iter,
+                        k = bch_btree_iter_next_filter(&iter.iter,
                                                        &c->root->keys,
                                                        bch_ptr_bad);
                         if (k)
@@ -904,7 +941,6 @@ static int bch_dirty_init_thread(void *arg)
                                 goto out;
                         }
                         skip_nr--;
-                        cond_resched();
                 }
 
                 if (p) {
@@ -914,7 +950,6 @@ static int bch_dirty_init_thread(void *arg)
 
                 p = NULL;
                 prev_idx = cur_idx;
-                cond_resched();
         }
 
 out:
@@ -941,69 +976,72 @@ static int bch_btre_dirty_init_thread_nr(void)
 void bch_sectors_dirty_init(struct bcache_device *d)
 {
         int i;
+        struct btree *b = NULL;
         struct bkey *k = NULL;
-        struct btree_iter iter;
+        struct btree_iter_stack iter;
         struct sectors_dirty_init op;
         struct cache_set *c = d->c;
-        struct bch_dirty_init_state *state;
-        char name[32];
+        struct bch_dirty_init_state state;
+
+retry_lock:
+        b = c->root;
+        rw_lock(0, b, b->level);
+        if (b != c->root) {
+                rw_unlock(0, b);
+                goto retry_lock;
+        }
 
         /* Just count root keys if no leaf node */
         if (c->root->level == 0) {
                 bch_btree_op_init(&op.op, -1);
                 op.inode = d->id;
                 op.count = 0;
-                op.start = KEY(op.inode, 0, 0);
 
                 for_each_key_filter(&c->root->keys,
-                                    k, &iter, bch_ptr_invalid)
+                                    k, &iter, bch_ptr_invalid) {
+                        if (KEY_INODE(k) != op.inode)
+                                continue;
                         sectors_dirty_init_fn(&op.op, c->root, k);
-                return;
-        }
+                }
 
-        state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL);
-        if (!state) {
-                pr_warn("sectors dirty init failed: cannot allocate memory\n");
+                rw_unlock(0, b);
                 return;
         }
 
-        state->c = c;
-        state->d = d;
-        state->total_threads = bch_btre_dirty_init_thread_nr();
-        state->key_idx = 0;
-        spin_lock_init(&state->idx_lock);
-        atomic_set(&state->started, 0);
-        atomic_set(&state->enough, 0);
-        init_waitqueue_head(&state->wait);
-
-        for (i = 0; i < state->total_threads; i++) {
-                /* Fetch latest state->enough earlier */
+        memset(&state, 0, sizeof(struct bch_dirty_init_state));
+        state.c = c;
+        state.d = d;
+        state.total_threads = bch_btre_dirty_init_thread_nr();
+        state.key_idx = 0;
+        spin_lock_init(&state.idx_lock);
+        atomic_set(&state.started, 0);
+        atomic_set(&state.enough, 0);
+        init_waitqueue_head(&state.wait);
+
+        for (i = 0; i < state.total_threads; i++) {
+                /* Fetch latest state.enough earlier */
                 smp_mb__before_atomic();
-                if (atomic_read(&state->enough))
+                if (atomic_read(&state.enough))
                         break;
 
-                state->infos[i].state = state;
-                atomic_inc(&state->started);
-                snprintf(name, sizeof(name), "bch_dirty_init[%d]", i);
-
-                state->infos[i].thread =
-                        kthread_run(bch_dirty_init_thread,
-                                    &state->infos[i],
-                                    name);
-                if (IS_ERR(state->infos[i].thread)) {
+                atomic_inc(&state.started);
+                state.infos[i].state = &state;
+                state.infos[i].thread =
+                        kthread_run(bch_dirty_init_thread, &state.infos[i],
+                                    "bch_dirtcnt[%d]", i);
+                if (IS_ERR(state.infos[i].thread)) {
                         pr_err("fails to run thread bch_dirty_init[%d]\n", i);
+                        atomic_dec(&state.started);
                         for (--i; i >= 0; i--)
-                                kthread_stop(state->infos[i].thread);
+                                kthread_stop(state.infos[i].thread);
                         goto out;
                 }
         }
 
-        wait_event_interruptible(state->wait,
-                 atomic_read(&state->started) == 0 ||
-                 test_bit(CACHE_SET_IO_DISABLE, &c->flags));
-
 out:
-        kfree(state);
+        /* Must wait for all threads to stop. */
+        wait_event(state.wait, atomic_read(&state.started) == 0);
+        rw_unlock(0, b);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -1027,6 +1065,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
         dc->writeback_rate_fp_term_high = 1000;
         dc->writeback_rate_i_term_inverse = 10000;
 
+        /* For dc->writeback_lock contention in update_writeback_rate() */
+        dc->rate_update_retry = 0;
+
         WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
         INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
@@ -1034,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 int bch_cached_dev_writeback_start(struct cached_dev *dc)
 {
         dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
                                                 WQ_MEM_RECLAIM, 0);
+                                                WQ_MEM_RECLAIM | WQ_PERCPU, 0);
         if (!dc->writeback_write_wq)
                 return -ENOMEM;
