Diffstat (limited to 'drivers/md/bcache/writeback.c')
| -rw-r--r-- | drivers/md/bcache/writeback.c | 431 |
1 file changed, 343 insertions, 88 deletions
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 73f0efac2b9f..4b237074f453 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -35,7 +35,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * This is the size of the cache, minus the amount used for * flash-only devices */ - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - atomic_long_read(&c->flash_dev_dirty_sectors); /* @@ -45,7 +45,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * backing volume uses about 2% of the cache for dirty data. */ uint32_t bdev_share = - div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, + div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, c->cached_dev_sectors); uint64_t cache_dirty_target = @@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t integral_scaled; uint32_t new_rate; + /* + * We need to consider the number of dirty buckets as well + * when calculating the proportional_scaled, Otherwise we might + * have an unreasonable small writeback rate at a highly fragmented situation + * when very few dirty sectors consumed a lot dirty buckets, the + * worst case is when dirty buckets reached cutoff_writeback_sync and + * dirty data is still not even reached to writeback percent, so the rate + * still will be at the minimum value, which will cause the write + * stuck at a non-writeback mode. + */ + struct cache_set *c = dc->disk.c; + + int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; + + if (dc->writeback_consider_fragment && + c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { + int64_t fragment = + div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); + int64_t fp_term; + int64_t fps; + + if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { + fp_term = (int64_t)dc->writeback_rate_fp_term_low * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); + } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { + fp_term = (int64_t)dc->writeback_rate_fp_term_mid * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); + } else { + fp_term = (int64_t)dc->writeback_rate_fp_term_high * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); + } + fps = div_s64(dirty, dirty_buckets) * fp_term; + if (fragment > 3 && fps > proportional_scaled) { + /* Only overrite the p when fragment > 3 */ + proportional_scaled = fps; + } + } + if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), dc->writeback_rate.next + NSEC_PER_MSEC))) { @@ -119,24 +157,65 @@ static void __update_writeback_rate(struct cached_dev *dc) dc->writeback_rate_target = target; } -static bool set_at_max_writeback_rate(struct cache_set *c, - struct cached_dev *dc) +static bool idle_counter_exceeded(struct cache_set *c) { + int counter, dev_nr; + /* - * Idle_counter is increased everytime when update_writeback_rate() is - * called. If all backing devices attached to the same cache set have - * identical dc->writeback_rate_update_seconds values, it is about 6 - * rounds of update_writeback_rate() on each backing device before - * c->at_max_writeback_rate is set to 1, and then max wrteback rate set - * to each dc->writeback_rate.rate. - * In order to avoid extra locking cost for counting exact dirty cached - * devices number, c->attached_dev_nr is used to calculate the idle - * throushold. 
It might be bigger if not all cached device are in write- - * back mode, but it still works well with limited extra rounds of - * update_writeback_rate(). + * If c->idle_counter is overflow (idel for really long time), + * reset as 0 and not set maximum rate this time for code + * simplicity. */ - if (atomic_inc_return(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) + counter = atomic_inc_return(&c->idle_counter); + if (counter <= 0) { + atomic_set(&c->idle_counter, 0); + return false; + } + + dev_nr = atomic_read(&c->attached_dev_nr); + if (dev_nr == 0) + return false; + + /* + * c->idle_counter is increased by writeback thread of all + * attached backing devices, in order to represent a rough + * time period, counter should be divided by dev_nr. + * Otherwise the idle time cannot be larger with more backing + * device attached. + * The following calculation equals to checking + * (counter / dev_nr) < (dev_nr * 6) + */ + if (counter < (dev_nr * dev_nr * 6)) + return false; + + return true; +} + +/* + * Idle_counter is increased every time when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate. + * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). + */ +static bool set_at_max_writeback_rate(struct cache_set *c, + struct cached_dev *dc) +{ + /* Don't sst max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; + + if (!idle_counter_exceeded(c)) return false; if (atomic_read(&c->at_max_writeback_rate) != 1) @@ -150,13 +229,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c, dc->writeback_rate_change = 0; /* - * Check c->idle_counter and c->at_max_writeback_rate agagain in case - * new I/O arrives during before set_at_max_writeback_rate() returns. - * Then the writeback rate is set to 1, and its new value should be - * decided via __update_writeback_rate(). + * In case new I/O arrives during before + * set_at_max_writeback_rate() returns. */ - if ((atomic_read(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) || + if (!idle_counter_exceeded(c) || !atomic_read(&c->at_max_writeback_rate)) return false; @@ -176,7 +252,7 @@ static void update_writeback_rate(struct work_struct *work) */ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -186,23 +262,31 @@ static void update_writeback_rate(struct work_struct *work) test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); return; } - if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { - /* - * If the whole cache set is idle, set_at_max_writeback_rate() - * will set writeback rate to a max number. 
Then it is - * unncessary to update writeback rate for an idle cache set - * in maximum writeback rate number(s). - */ - if (!set_at_max_writeback_rate(c, dc)) { - down_read(&dc->writeback_lock); + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set writeback rate to a max number. Then it is + * unncessary to update writeback rate for an idle cache set + * in maximum writeback rate number(s). + */ + if (atomic_read(&dc->has_dirty) && dc->writeback_percent && + !set_at_max_writeback_rate(c, dc)) { + do { + if (!down_read_trylock((&dc->writeback_lock))) { + dc->rate_update_retry++; + if (dc->rate_update_retry <= + BCH_WBRATE_UPDATE_MAX_SKIPS) + break; + down_read(&dc->writeback_lock); + dc->rate_update_retry = 0; + } __update_writeback_rate(dc); update_gc_after_writeback(c); up_read(&dc->writeback_lock); - } + } while (0); } @@ -222,7 +306,7 @@ static void update_writeback_rate(struct work_struct *work) */ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); } static unsigned int writeback_delay(struct cached_dev *dc, @@ -247,26 +331,26 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, bio->bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)); + bio_init_inline(bio, NULL, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_private = w; bch_bio_map(bio, NULL); } -static void dirty_io_destructor(struct closure *cl) +static CLOSURE_CALLBACK(dirty_io_destructor) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); kfree(io); } -static void write_dirty_finish(struct closure *cl) +static CLOSURE_CALLBACK(write_dirty_finish) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; @@ -316,9 +400,9 @@ static void dirty_endio(struct bio *bio) closure_put(&io->cl); } -static void write_dirty(struct closure *cl) +static CLOSURE_CALLBACK(write_dirty) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; @@ -350,7 +434,7 @@ static void write_dirty(struct closure *cl) */ if (KEY_DIRTY(&w->key)) { dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_opf = REQ_OP_WRITE; io->bio.bi_iter.bi_sector = KEY_START(&w->key); bio_set_dev(&io->bio, io->dc->bdev); io->bio.bi_end_io = dirty_endio; @@ -371,16 +455,16 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; /* is_read = 1 */ - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + bch_count_io_errors(io->dc->disk.c->cache, bio->bi_status, 1, "reading dirty data from cache"); dirty_endio(bio); } -static void read_dirty_submit(struct closure *cl) +static CLOSURE_CALLBACK(read_dirty_submit) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + closure_type(io, struct dirty_io, cl); closure_bio_submit(io->dc->disk.c, &io->bio, cl); @@ -452,11 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(sizeof(struct dirty_io) 
+ - sizeof(struct bio_vec) * - DIV_ROUND_UP(KEY_SIZE(&w->key), - PAGE_SECTORS), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; @@ -465,10 +547,9 @@ static void read_dirty(struct cached_dev *dc) io->sequence = sequence++; dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); + io->bio.bi_opf = REQ_OP_READ; io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - bio_set_dev(&io->bio, - PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); + bio_set_dev(&io->bio, dc->disk.c->cache->bdev); io->bio.bi_end_io = read_dirty_endio; if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) @@ -516,15 +597,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned int stripe_offset, stripe, sectors_dirty; + unsigned int stripe_offset, sectors_dirty; + int stripe; if (!d) return; + stripe = offset_to_stripe(d, offset); + if (stripe < 0) + return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); - stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -539,10 +624,13 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, sectors_dirty = atomic_add_return(s, d->stripe_sectors_dirty + stripe); - if (sectors_dirty == d->stripe_size) - set_bit(stripe, d->full_dirty_stripes); - else - clear_bit(stripe, d->full_dirty_stripes); + if (sectors_dirty == d->stripe_size) { + if (!test_bit(stripe, d->full_dirty_stripes)) + set_bit(stripe, d->full_dirty_stripes); + } else { + if (test_bit(stripe, d->full_dirty_stripes)) + clear_bit(stripe, d->full_dirty_stripes); + } nr_sectors -= s; stripe_offset = 0; @@ -564,12 +652,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static void refill_full_stripes(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - unsigned int start_stripe, stripe, next_stripe; + unsigned int start_stripe, next_stripe; + int stripe; bool wrapped = false; stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - - if (stripe >= dc->disk.nr_stripes) + if (stripe < 0) stripe = 0; start_stripe = stripe; @@ -696,6 +784,15 @@ static int bch_writeback_thread(void *arg) * bch_cached_dev_detach(). */ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + struct closure cl; + + closure_init_stack(&cl); + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + up_write(&dc->writeback_lock); break; } @@ -708,8 +805,7 @@ static int bch_writeback_thread(void *arg) * may set BCH_ENABLE_AUTO_GC via sysfs, then when * BCH_DO_AUTO_GC is set, garbage collection thread * will be wake up here. After moving gc, the shrunk - * btree and discarded free buckets SSD space may be - * helpful for following write requests. + * btree may be helpful for following write requests. 
*/ if (c->gc_after_writeback == (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { @@ -735,6 +831,9 @@ static int bch_writeback_thread(void *arg) } } + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); + cached_dev_put(dc); wait_for_kthread_stop(); @@ -743,13 +842,11 @@ static int bch_writeback_thread(void *arg) /* Init */ #define INIT_KEYS_EACH_TIME 500000 -#define INIT_KEYS_SLEEP_MS 100 struct sectors_dirty_init { struct btree_op op; unsigned int inode; size_t count; - struct bkey start; }; static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, @@ -765,16 +862,15 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, KEY_START(k), KEY_SIZE(k)); op->count++; - if (atomic_read(&b->c->search_inflight) && - !(op->count % INIT_KEYS_EACH_TIME)) { - bkey_copy_key(&op->start, k); - return -EAGAIN; - } + if (!(op->count % INIT_KEYS_EACH_TIME)) + cond_resched(); return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct bcache_device *d) +static int bch_root_node_dirty_init(struct cache_set *c, + struct bcache_device *d, + struct bkey *k) { struct sectors_dirty_init op; int ret; @@ -782,19 +878,170 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_op_init(&op.op, -1); op.inode = d->id; op.count = 0; - op.start = KEY(op.inode, 0, 0); - - do { - ret = bch_btree_map_keys(&op.op, d->c, &op.start, - sectors_dirty_init_fn, 0); - if (ret == -EAGAIN) - schedule_timeout_interruptible( - msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); - else if (ret < 0) { - pr_warn("sectors dirty init failed, ret=%d!", ret); + + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, + 0); + if (ret < 0) + pr_warn("sectors dirty init failed, ret=%d!\n", ret); + + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. 
+ */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op.op)->wait); + + return ret; +} + +static int bch_dirty_init_thread(void *arg) +{ + struct dirty_init_thrd_info *info = arg; + struct bch_dirty_init_state *state = info->state; + struct cache_set *c = state->c; + struct btree_iter_stack iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + prev_idx = 0; + + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + + while (k) { + spin_lock(&state->idx_lock); + cur_idx = state->key_idx; + state->key_idx++; + spin_unlock(&state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter.iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + atomic_set(&state->enough, 1); + /* Update state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + } + + if (p) { + if (bch_root_node_dirty_init(c, state->d, p) < 0) + goto out; + } + + p = NULL; + prev_idx = cur_idx; + } + +out: + /* In order to wake up state->wait in time */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&state->started)) + wake_up(&state->wait); + + return 0; +} + +static int bch_btre_dirty_init_thread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_DIRTY_INIT_THRD_MAX) + n = BCH_DIRTY_INIT_THRD_MAX; + + return n; +} + +void bch_sectors_dirty_init(struct bcache_device *d) +{ + int i; + struct btree *b = NULL; + struct bkey *k = NULL; + struct btree_iter_stack iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; + struct bch_dirty_init_state state; + +retry_lock: + b = c->root; + rw_lock(0, b, b->level); + if (b != c->root) { + rw_unlock(0, b); + goto retry_lock; + } + + /* Just count root keys if no leaf node */ + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) { + if (KEY_INODE(k) != op.inode) + continue; + sectors_dirty_init_fn(&op.op, c->root, k); + } + + rw_unlock(0, b); + return; + } + + memset(&state, 0, sizeof(struct bch_dirty_init_state)); + state.c = c; + state.d = d; + state.total_threads = bch_btre_dirty_init_thread_nr(); + state.key_idx = 0; + spin_lock_init(&state.idx_lock); + atomic_set(&state.started, 0); + atomic_set(&state.enough, 0); + init_waitqueue_head(&state.wait); + + for (i = 0; i < state.total_threads; i++) { + /* Fetch latest state.enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&state.enough)) break; + + atomic_inc(&state.started); + state.infos[i].state = &state; + state.infos[i].thread = + kthread_run(bch_dirty_init_thread, &state.infos[i], + "bch_dirtcnt[%d]", i); + if (IS_ERR(state.infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]\n", i); + atomic_dec(&state.started); + for (--i; i >= 0; i--) + kthread_stop(state.infos[i].thread); + goto out; } - } while (ret == -EAGAIN); + } + +out: + /* Must wait for all threads to stop. 
*/ + wait_event(state.wait, atomic_read(&state.started) == 0); + rw_unlock(0, b); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -805,6 +1052,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_metadata = true; dc->writeback_running = false; + dc->writeback_consider_fragment = true; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -812,8 +1060,14 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_fp_term_low = 1; + dc->writeback_rate_fp_term_mid = 10; + dc->writeback_rate_fp_term_high = 1000; dc->writeback_rate_i_term_inverse = 10000; + /* For dc->writeback_lock contention in update_writeback_rate() */ + dc->rate_update_retry = 0; + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } @@ -821,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!dc->writeback_write_wq) return -ENOMEM; @@ -830,6 +1084,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) { cached_dev_put(dc); + destroy_workqueue(dc->writeback_write_wq); return PTR_ERR(dc->writeback_thread); } dc->writeback_running = true; |
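The largest behavioural change above is the fragmentation-aware proportional term added to __update_writeback_rate(). The following is a minimal standalone sketch of that calculation, not the kernel code itself: frag_proportional_term() and the sample numbers in main() are invented for illustration, and the threshold values (50/57/64) are assumed from the related bcache headers, while the default low/mid/high terms (1/10/1000) match the init code in this diff.

/*
 * Standalone sketch of the fragmentation-aware proportional term added
 * to __update_writeback_rate().  frag_proportional_term() and the
 * sample numbers in main() are illustrative; the thresholds and the
 * default low/mid/high terms are assumed from the related bcache code.
 */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define FRAG_THRESHOLD_LOW	50	/* percent of buckets in use */
#define FRAG_THRESHOLD_MID	57
#define FRAG_THRESHOLD_HIGH	64

static int64_t frag_proportional_term(int64_t dirty_sectors,
				      int64_t dirty_buckets,
				      int64_t bucket_size,
				      int in_use,
				      int64_t fp_term_low,
				      int64_t fp_term_mid,
				      int64_t fp_term_high)
{
	int64_t fragment, fp_term;

	if (dirty_sectors <= 0 || dirty_buckets <= 0 ||
	    in_use <= FRAG_THRESHOLD_LOW)
		return 0;

	/* Cache sectors pinned per dirty sector: large == badly fragmented */
	fragment = (dirty_buckets * bucket_size) / dirty_sectors;

	/* The more of the cache is in use, the steeper the term */
	if (in_use <= FRAG_THRESHOLD_MID)
		fp_term = fp_term_low * (in_use - FRAG_THRESHOLD_LOW);
	else if (in_use <= FRAG_THRESHOLD_HIGH)
		fp_term = fp_term_mid * (in_use - FRAG_THRESHOLD_MID);
	else
		fp_term = fp_term_high * (in_use - FRAG_THRESHOLD_HIGH);

	/* The diff only overrides the normal P term when fragment > 3 */
	if (fragment <= 3)
		return 0;

	return (dirty_sectors / dirty_buckets) * fp_term;
}

int main(void)
{
	/*
	 * 1 GiB of dirty data (2097152 sectors) spread thinly over
	 * 40000 buckets of 512 KiB, with 60% of the cache in use.
	 */
	int64_t fps = frag_proportional_term(2097152, 40000, 1024, 60,
					     1, 10, 1000);

	printf("fragmentation proportional term: %" PRId64 "\n", fps);
	return 0;
}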
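The idle detection is also factored out into idle_counter_exceeded(), with the division-free check counter < dev_nr * dev_nr * 6. Below is a user-space sketch of that logic with plain ints standing in for the kernel's atomic counters; struct fake_cache_set and main() are illustrative only.

/*
 * Sketch of the idle detection factored out into idle_counter_exceeded().
 * Plain ints replace the kernel atomics; struct fake_cache_set and
 * main() are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_cache_set {
	int idle_counter;	/* bumped by every rate-update round */
	int attached_dev_nr;	/* attached backing devices */
};

static bool idle_counter_exceeded(struct fake_cache_set *c)
{
	int counter, dev_nr;

	/* Counter overflowed (idle for a very long time): restart at 0 */
	counter = ++c->idle_counter;
	if (counter <= 0) {
		c->idle_counter = 0;
		return false;
	}

	dev_nr = c->attached_dev_nr;
	if (dev_nr == 0)
		return false;

	/*
	 * Every attached device bumps the counter, so the rough number
	 * of idle rounds is counter / dev_nr.  Checking
	 * counter < dev_nr * dev_nr * 6 is the division-free form of
	 * counter / dev_nr < dev_nr * 6.
	 */
	return counter >= dev_nr * dev_nr * 6;
}

int main(void)
{
	struct fake_cache_set c = { .idle_counter = 0, .attached_dev_nr = 2 };
	int rounds = 1;

	while (!idle_counter_exceeded(&c))
		rounds++;

	/* With two devices this reports 24 rounds (2 * 2 * 6) */
	printf("max writeback rate after %d rounds\n", rounds);
	return 0;
}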
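Finally, update_writeback_rate() now avoids piling up on writeback_lock by skipping a bounded number of update rounds when a trylock fails. Here is a sketch of that retry pattern, with a pthread rwlock standing in for the kernel rw_semaphore; BCH_WBRATE_UPDATE_MAX_SKIPS is assumed to be 15, as it is defined outside this file.

/*
 * Sketch of the writeback_lock contention handling added to
 * update_writeback_rate(): try the lock, skip a bounded number of
 * update rounds while it is busy, then block.  A pthread rwlock
 * stands in for the kernel rw_semaphore; the value of
 * BCH_WBRATE_UPDATE_MAX_SKIPS is assumed.
 */
#include <pthread.h>
#include <stdio.h>

#define BCH_WBRATE_UPDATE_MAX_SKIPS 15

struct fake_cached_dev {
	pthread_rwlock_t writeback_lock;
	int rate_update_retry;
};

/* Returns 1 if the rate was updated, 0 if this round was skipped. */
static int try_update_rate(struct fake_cached_dev *dc)
{
	if (pthread_rwlock_tryrdlock(&dc->writeback_lock) != 0) {
		/* Lock busy: skip this round unless we skipped too often */
		if (++dc->rate_update_retry <= BCH_WBRATE_UPDATE_MAX_SKIPS)
			return 0;
		pthread_rwlock_rdlock(&dc->writeback_lock);
		dc->rate_update_retry = 0;
	}

	/* __update_writeback_rate(dc) would run here */
	pthread_rwlock_unlock(&dc->writeback_lock);
	return 1;
}

int main(void)
{
	struct fake_cached_dev dc = { .rate_update_retry = 0 };

	pthread_rwlock_init(&dc.writeback_lock, NULL);
	printf("updated: %d\n", try_update_rate(&dc));
	pthread_rwlock_destroy(&dc.writeback_lock);
	return 0;
}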