Diffstat (limited to 'drivers/md/bcache/writeback.c')
| mode | path | lines changed |
| -rw-r--r-- | drivers/md/bcache/writeback.c | 1171 |
1 file changed, 928 insertions(+), 243 deletions(-)
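The diff below replaces bcache's old PD writeback-rate controller with a PI controller and reworks the writeback path around a dedicated kthread. As a reading aid, here is a minimal userspace sketch of the arithmetic the new __update_writeback_rate() performs. Only the error/proportional/integral flow, the clamp to NSEC_PER_SEC and the defaults (p inverse 40, i inverse 10000, minimum rate 8) mirror the hunks that follow; the wb_state struct, the main() harness and the 5 second update period are illustrative assumptions, not the kernel code itself.

/* Minimal userspace model of the PI controller introduced below.
 * Values are 512-byte sectors; the returned rate is sectors/sec. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

struct wb_state {
        int64_t p_term_inverse;   /* 40 by default in the patch        */
        int64_t i_term_inverse;   /* 10000 by default                  */
        int64_t update_seconds;   /* assumed 5s update period          */
        int64_t integral;         /* accumulated (integrated) error    */
        int64_t rate_minimum;     /* 8 sectors/s in the patch          */
};

static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/* dirty/target are sector counts; keeping_up stands in for the
 * time_before64() check against the rate limiter's next deadline. */
static int64_t update_rate(struct wb_state *s, int64_t dirty,
                           int64_t target, int keeping_up)
{
        int64_t error = dirty - target;
        int64_t proportional = error / s->p_term_inverse;

        /* Wind the integral up only while the device keeps up, and
         * wind it down only while it is still positive. */
        if ((error < 0 && s->integral > 0) || (error > 0 && keeping_up))
                s->integral += error * s->update_seconds;

        return clamp64(proportional + s->integral / s->i_term_inverse,
                       s->rate_minimum, NSEC_PER_SEC);
}

int main(void)
{
        struct wb_state s = { 40, 10000, 5, 0, 8 };

        /* 2 GiB of dirty data against a 1 GiB target: the p term reacts
         * at once, the i term keeps growing while the error persists. */
        for (int i = 0; i < 5; i++)
                printf("round %d: %lld sectors/s\n", i,
                       (long long)update_rate(&s, 4194304, 2097152, 1));
        return 0;
}

Storing the terms as inverses keeps the kernel code in integer math: the defaults aim to retire the outstanding error in roughly 40 seconds through the p term, while the i term accumulates 1/10000th of the error per second as a slow long-term correction.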
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..4b237074f453 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * background writeback - scan btree for dirty data and write it to the backing * device @@ -9,69 +10,233 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "writeback.h" -static struct workqueue_struct *dirty_wq; +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/sched/clock.h> +#include <trace/events/bcache.h> -static void read_dirty(struct closure *); +static void update_gc_after_writeback(struct cache_set *c) +{ + if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || + c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) + return; -struct dirty_io { - struct closure cl; - struct cached_dev *dc; - struct bio bio; -}; + c->gc_after_writeback |= BCH_DO_AUTO_GC; +} /* Rate limiting */ - -static void __update_writeback_rate(struct cached_dev *dc) +static uint64_t __calc_target_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + + /* + * This is the size of the cache, minus the amount used for + * flash-only devices + */ + uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - + atomic_long_read(&c->flash_dev_dirty_sectors); + + /* + * Unfortunately there is no control of global dirty data. If the + * user states that they want 10% dirty data in the cache, and has, + * e.g., 5 backing volumes of equal size, we try and ensure each + * backing volume uses about 2% of the cache for dirty data. + */ + uint32_t bdev_share = + div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, + c->cached_dev_sectors); + uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), - c->cached_dev_sectors); + /* Ensure each backing dev gets at least one dirty share */ + if (bdev_share < 1) + bdev_share = 1; - /* PD controller */ + return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT; +} - int change = 0; - int64_t error; - int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); - int64_t derivative = dirty - dc->disk.sectors_dirty_last; +static void __update_writeback_rate(struct cached_dev *dc) +{ + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. 
+ */ + int64_t target = __calc_target_rate(dc); + int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled; + uint32_t new_rate; - dc->disk.sectors_dirty_last = dirty; + /* + * We need to consider the number of dirty buckets as well + * when calculating the proportional_scaled, Otherwise we might + * have an unreasonable small writeback rate at a highly fragmented situation + * when very few dirty sectors consumed a lot dirty buckets, the + * worst case is when dirty buckets reached cutoff_writeback_sync and + * dirty data is still not even reached to writeback percent, so the rate + * still will be at the minimum value, which will cause the write + * stuck at a non-writeback mode. + */ + struct cache_set *c = dc->disk.c; - derivative *= dc->writeback_rate_d_term; - derivative = clamp(derivative, -dirty, dirty); + int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; + + if (dc->writeback_consider_fragment && + c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { + int64_t fragment = + div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); + int64_t fp_term; + int64_t fps; + + if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { + fp_term = (int64_t)dc->writeback_rate_fp_term_low * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); + } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { + fp_term = (int64_t)dc->writeback_rate_fp_term_mid * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); + } else { + fp_term = (int64_t)dc->writeback_rate_fp_term_high * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); + } + fps = div_s64(dirty, dirty_buckets) * fp_term; + if (fragment > 3 && fps > proportional_scaled) { + /* Only overrite the p when fragment > 3 */ + proportional_scaled = fps; + } + } - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - dc->writeback_rate_d_smooth, 0); + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. (Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. 
+ */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } - /* Avoid divide by zero */ - if (!target) - goto out; + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); - error = div64_s64((dirty + derivative - target) << 8, target); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); - change = div_s64((dc->writeback_rate.rate * error) >> 8, - dc->writeback_rate_p_term_inverse); + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - + atomic_long_read(&dc->writeback_rate.rate); + atomic_long_set(&dc->writeback_rate.rate, new_rate); + dc->writeback_rate_target = target; +} - /* Don't increase writeback rate if the device isn't keeping up */ - if (change > 0 && - time_after64(local_clock(), - dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) - change = 0; +static bool idle_counter_exceeded(struct cache_set *c) +{ + int counter, dev_nr; - dc->writeback_rate.rate = - clamp_t(int64_t, dc->writeback_rate.rate + change, - 1, NSEC_PER_MSEC); -out: - dc->writeback_rate_derivative = derivative; - dc->writeback_rate_change = change; - dc->writeback_rate_target = target; + /* + * If c->idle_counter is overflow (idel for really long time), + * reset as 0 and not set maximum rate this time for code + * simplicity. + */ + counter = atomic_inc_return(&c->idle_counter); + if (counter <= 0) { + atomic_set(&c->idle_counter, 0); + return false; + } - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); + dev_nr = atomic_read(&c->attached_dev_nr); + if (dev_nr == 0) + return false; + + /* + * c->idle_counter is increased by writeback thread of all + * attached backing devices, in order to represent a rough + * time period, counter should be divided by dev_nr. + * Otherwise the idle time cannot be larger with more backing + * device attached. + * The following calculation equals to checking + * (counter / dev_nr) < (dev_nr * 6) + */ + if (counter < (dev_nr * dev_nr * 6)) + return false; + + return true; +} + +/* + * Idle_counter is increased every time when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate. + * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). 
+ */ +static bool set_at_max_writeback_rate(struct cache_set *c, + struct cached_dev *dc) +{ + /* Don't sst max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; + + if (!idle_counter_exceeded(c)) + return false; + + if (atomic_read(&c->at_max_writeback_rate) != 1) + atomic_set(&c->at_max_writeback_rate, 1); + + atomic_long_set(&dc->writeback_rate.rate, INT_MAX); + + /* keep writeback_rate_target as existing value */ + dc->writeback_rate_proportional = 0; + dc->writeback_rate_integral_scaled = 0; + dc->writeback_rate_change = 0; + + /* + * In case new I/O arrives during before + * set_at_max_writeback_rate() returns. + */ + if (!idle_counter_exceeded(c) || + !atomic_read(&c->at_max_writeback_rate)) + return false; + + return true; } static void update_writeback_rate(struct work_struct *work) @@ -79,336 +244,856 @@ static void update_writeback_rate(struct work_struct *work) struct cached_dev *dc = container_of(to_delayed_work(work), struct cached_dev, writeback_rate_update); + struct cache_set *c = dc->disk.c; + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling + * cancel_delayed_work_sync(). + */ + set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb__after_atomic(); + + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, + * check it here too. + */ + if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb__after_atomic(); + return; + } + + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set writeback rate to a max number. Then it is + * unncessary to update writeback rate for an idle cache set + * in maximum writeback rate number(s). + */ + if (atomic_read(&dc->has_dirty) && dc->writeback_percent && + !set_at_max_writeback_rate(c, dc)) { + do { + if (!down_read_trylock((&dc->writeback_lock))) { + dc->rate_update_retry++; + if (dc->rate_update_retry <= + BCH_WBRATE_UPDATE_MAX_SKIPS) + break; + down_read(&dc->writeback_lock); + dc->rate_update_retry = 0; + } + __update_writeback_rate(dc); + update_gc_after_writeback(c); + up_read(&dc->writeback_lock); + } while (0); + } - down_read(&dc->writeback_lock); - if (atomic_read(&dc->has_dirty) && - dc->writeback_percent) - __update_writeback_rate(dc); + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, + * check it here too. + */ + if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } - up_read(&dc->writeback_lock); + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling + * cancel_delayed_work_sync(). 
+ */ + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb__after_atomic(); } -static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +static unsigned int writeback_delay(struct cached_dev *dc, + unsigned int sectors) { - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; - return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + return bch_next_delay(&dc->writeback_rate, sectors); } -/* Background writeback */ - -static bool dirty_pred(struct keybuf *buf, struct bkey *k) -{ - return KEY_DIRTY(k); -} +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + uint16_t sequence; + struct bio bio; +}; static void dirty_init(struct keybuf_key *w) { struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio); + bio_init_inline(bio, NULL, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - bio->bi_size = KEY_SIZE(&w->key) << 9; - bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); + bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_private = w; - bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } -static void refill_dirty(struct closure *cl) +static CLOSURE_CALLBACK(dirty_io_destructor) { - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; - struct bkey end = MAX_KEY; - SET_KEY_INODE(&end, dc->disk.id); + closure_type(io, struct dirty_io, cl); - if (!atomic_read(&dc->disk.detaching) && - !dc->writeback_running) - closure_return(cl); + kfree(io); +} - down_write(&dc->writeback_lock); +static CLOSURE_CALLBACK(write_dirty_finish) +{ + closure_type(io, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; - if (!atomic_read(&dc->has_dirty)) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); + bio_free_pages(&io->bio); - up_write(&dc->writeback_lock); - closure_return(cl); + /* This is kind of a dumb way of signalling errors. */ + if (KEY_DIRTY(&w->key)) { + int ret; + unsigned int i; + struct keylist keys; + + bch_keylist_init(&keys); + + bkey_copy(keys.top, &w->key); + SET_KEY_DIRTY(keys.top, false); + bch_keylist_push(&keys); + + for (i = 0; i < KEY_PTRS(&w->key); i++) + atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); + + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); + + if (ret) + trace_bcache_writeback_collision(&w->key); + + atomic_long_inc(ret + ? 
&dc->disk.c->writeback_keys_failed + : &dc->disk.c->writeback_keys_done); } - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; + bch_keybuf_del(&dc->writeback_keys, w); + up(&dc->in_flight); + + closure_return_with_destructor(cl, dirty_io_destructor); +} + +static void dirty_endio(struct bio *bio) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + if (bio->bi_status) { + SET_KEY_DIRTY(&w->key, false); + bch_count_backing_io_errors(io->dc, bio); } - bch_refill_keybuf(dc->disk.c, buf, &end); + closure_put(&io->cl); +} - if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { - /* Searched the entire btree - delay awhile */ +static CLOSURE_CALLBACK(write_dirty) +{ + closure_type(io, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; - if (RB_EMPTY_ROOT(&buf->keys)) { - atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); + uint16_t next_sequence; + + if (atomic_read(&dc->writeback_sequence_next) != io->sequence) { + /* Not our turn to write; wait for a write to complete */ + closure_wait(&dc->writeback_ordering_wait, cl); + + if (atomic_read(&dc->writeback_sequence_next) == io->sequence) { + /* + * Edge case-- it happened in indeterminate order + * relative to when we were added to wait list.. + */ + closure_wake_up(&dc->writeback_ordering_wait); } - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); + return; } - up_write(&dc->writeback_lock); + next_sequence = io->sequence + 1; + + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. 
+ */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + io->bio.bi_opf = REQ_OP_WRITE; + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; + + /* I/O request sent to backing device */ + closure_bio_submit(io->dc->disk.c, &io->bio, cl); + } - ratelimit_reset(&dc->writeback_rate); + atomic_set(&dc->writeback_sequence_next, next_sequence); + closure_wake_up(&dc->writeback_ordering_wait); - /* Punt to workqueue only so we don't recurse and blow the stack */ - continue_at(cl, read_dirty, dirty_wq); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } -void bch_writeback_queue(struct cached_dev *dc) +static void read_dirty_endio(struct bio *bio) { - if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; - continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); - } + /* is_read = 1 */ + bch_count_io_errors(io->dc->disk.c->cache, + bio->bi_status, 1, + "reading dirty data from cache"); + + dirty_endio(bio); } -void bch_writeback_add(struct cached_dev *dc, unsigned sectors) +static CLOSURE_CALLBACK(read_dirty_submit) { - atomic_long_add(sectors, &dc->disk.sectors_dirty); + closure_type(io, struct dirty_io, cl); - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); + closure_bio_submit(io->dc->disk.c, &io->bio, cl); - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); +} + +static void read_dirty(struct cached_dev *dc) +{ + unsigned int delay = 0; + struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; + size_t size; + int nk, i; + struct dirty_io *io; + struct closure cl; + uint16_t sequence = 0; + + BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); + atomic_set(&dc->writeback_sequence_next, sequence); + closure_init_stack(&cl); + + /* + * XXX: if we error, background writeback just spins. Should use some + * mempools. + */ + + next = bch_keybuf_next(&dc->writeback_keys); + + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && + next) { + size = 0; + nk = 0; + + do { + BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); + + /* + * Don't combine too many operations, even if they + * are all small. + */ + if (nk >= MAX_WRITEBACKS_IN_PASS) + break; + + /* + * If the current operation is very large, don't + * further combine operations. + */ + if (size >= MAX_WRITESIZE_IN_PASS) + break; + + /* + * Operations are only eligible to be combined + * if they are contiguous. + * + * TODO: add a heuristic willing to fire a + * certain amount of non-contiguous IO per pass, + * so that we can benefit from backing device + * command queueing. + */ + if ((nk != 0) && bkey_cmp(&keys[nk-1]->key, + &START_KEY(&next->key))) + break; + + size += KEY_SIZE(&next->key); + keys[nk++] = next; + } while ((next = bch_keybuf_next(&dc->writeback_keys))); + + /* Now we have gathered a set of 1..5 keys to write back. 
*/ + for (i = 0; i < nk; i++) { + w = keys[i]; + + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->dc = dc; + io->sequence = sequence++; + + dirty_init(w); + io->bio.bi_opf = REQ_OP_READ; + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); + bio_set_dev(&io->bio, dc->disk.c->cache->bdev); + io->bio.bi_end_io = read_dirty_endio; + + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) + goto err_free; + + trace_bcache_writeback(&w->key); + + down(&dc->in_flight); + + /* + * We've acquired a semaphore for the maximum + * simultaneous number of writebacks; from here + * everything happens asynchronously. + */ + closure_call(&io->cl, read_dirty_submit, NULL, &cl); } - bch_writeback_queue(dc); + delay = writeback_delay(dc, size); - if (dc->writeback_percent) - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && + delay) { + schedule_timeout_interruptible(delay); + delay = writeback_delay(dc, 0); + } } -} -/* Background writeback - IO loop */ + if (0) { +err_free: + kfree(w->private); +err: + bch_keybuf_del(&dc->writeback_keys, w); + } -static void dirty_io_destructor(struct closure *cl) -{ - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - kfree(io); + /* + * Wait for outstanding writeback IOs to finish (and keybuf slots to be + * freed) before refilling again + */ + closure_sync(&cl); } -static void write_dirty_finish(struct closure *cl) +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, + uint64_t offset, int nr_sectors) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - struct keybuf_key *w = io->bio.bi_private; - struct cached_dev *dc = io->dc; - struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); + struct bcache_device *d = c->devices[inode]; + unsigned int stripe_offset, sectors_dirty; + int stripe; - while (bv-- != io->bio.bi_io_vec) - __free_page(bv->bv_page); + if (!d) + return; - /* This is kind of a dumb way of signalling errors. */ - if (KEY_DIRTY(&w->key)) { - unsigned i; - struct btree_op op; - bch_btree_op_init_stack(&op); + stripe = offset_to_stripe(d, offset); + if (stripe < 0) + return; - op.type = BTREE_REPLACE; - bkey_copy(&op.replace, &w->key); + if (UUID_FLASH_ONLY(&c->uuids[inode])) + atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); - SET_KEY_DIRTY(&w->key, false); - bch_keylist_add(&op.keys, &w->key); + stripe_offset = offset & (d->stripe_size - 1); - for (i = 0; i < KEY_PTRS(&w->key); i++) - atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); + while (nr_sectors) { + int s = min_t(unsigned int, abs(nr_sectors), + d->stripe_size - stripe_offset); - pr_debug("clearing %s", pkey(&w->key)); - bch_btree_insert(&op, dc->disk.c); - closure_sync(&op.cl); + if (nr_sectors < 0) + s = -s; - atomic_long_inc(op.insert_collision - ? 
&dc->disk.c->writeback_keys_failed - : &dc->disk.c->writeback_keys_done); + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) { + if (!test_bit(stripe, d->full_dirty_stripes)) + set_bit(stripe, d->full_dirty_stripes); + } else { + if (test_bit(stripe, d->full_dirty_stripes)) + clear_bit(stripe, d->full_dirty_stripes); + } + + nr_sectors -= s; + stripe_offset = 0; + stripe++; } +} - bch_keybuf_del(&dc->writeback_keys, w); - atomic_dec_bug(&dc->in_flight); +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + struct cached_dev *dc = container_of(buf, + struct cached_dev, + writeback_keys); - closure_wake_up(&dc->writeback_wait); + BUG_ON(KEY_INODE(k) != dc->disk.id); - closure_return_with_destructor(cl, dirty_io_destructor); + return KEY_DIRTY(k); } -static void dirty_endio(struct bio *bio, int error) +static void refill_full_stripes(struct cached_dev *dc) { - struct keybuf_key *w = bio->bi_private; - struct dirty_io *io = w->private; + struct keybuf *buf = &dc->writeback_keys; + unsigned int start_stripe, next_stripe; + int stripe; + bool wrapped = false; - if (error) - SET_KEY_DIRTY(&w->key, false); + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); + if (stripe < 0) + stripe = 0; - closure_put(&io->cl); + start_stripe = stripe; + + while (1) { + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + if (stripe == dc->disk.nr_stripes) + goto next; + + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; + + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; + } + } } -static void write_dirty(struct closure *cl) +/* + * Returns true if we scanned the entire disk + */ +static bool refill_dirty(struct cached_dev *dc) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - struct keybuf_key *w = io->bio.bi_private; + struct keybuf *buf = &dc->writeback_keys; + struct bkey start = KEY(dc->disk.id, 0, 0); + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + struct bkey start_pos; + + /* + * make sure keybuf pos is inside the range for this disk - at bringup + * we might not be attached yet so this disk's inode nr isn't + * initialized then + */ + if (bkey_cmp(&buf->last_scanned, &start) < 0 || + bkey_cmp(&buf->last_scanned, &end) > 0) + buf->last_scanned = start; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } + + start_pos = buf->last_scanned; + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - dirty_init(w); - io->bio.bi_rw = WRITE; - io->bio.bi_sector = KEY_START(&w->key); - io->bio.bi_bdev = io->dc->bdev; - io->bio.bi_end_io = dirty_endio; + if (bkey_cmp(&buf->last_scanned, &end) < 0) + return false; - trace_bcache_write_dirty(&io->bio); - closure_bio_submit(&io->bio, cl, &io->dc->disk); + /* + * If we get to the end start scanning again from the beginning, and + * only scan up to where we initially started scanning from: + */ + buf->last_scanned = start; + bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); - continue_at(cl, 
write_dirty_finish, dirty_wq); + return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; } -static void read_dirty_endio(struct bio *bio, int error) +static int bch_writeback_thread(void *arg) { - struct keybuf_key *w = bio->bi_private; - struct dirty_io *io = w->private; + struct cached_dev *dc = arg; + struct cache_set *c = dc->disk.c; + bool searched_full_index; + + bch_ratelimit_reset(&dc->writeback_rate); + + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + /* + * If the bache device is detaching, skip here and continue + * to perform writeback. Otherwise, if no dirty data on cache, + * or there is dirty data on cache but writeback is disabled, + * the writeback thread should sleep here and wait for others + * to wake up it. + */ + if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + up_write(&dc->writeback_lock); + + if (kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + set_current_state(TASK_RUNNING); + break; + } + + schedule(); + continue; + } + set_current_state(TASK_RUNNING); + + searched_full_index = refill_dirty(dc); + + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + /* + * If bcache device is detaching via sysfs interface, + * writeback thread should stop after there is no dirty + * data on cache. BCACHE_DEV_DETACHING flag is set in + * bch_cached_dev_detach(). + */ + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + struct closure cl; + + closure_init_stack(&cl); + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + + up_write(&dc->writeback_lock); + break; + } + + /* + * When dirty data rate is high (e.g. 50%+), there might + * be heavy buckets fragmentation after writeback + * finished, which hurts following write performance. + * If users really care about write performance they + * may set BCH_ENABLE_AUTO_GC via sysfs, then when + * BCH_DO_AUTO_GC is set, garbage collection thread + * will be wake up here. After moving gc, the shrunk + * btree may be helpful for following write requests. 
+ */ + if (c->gc_after_writeback == + (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { + c->gc_after_writeback &= ~BCH_DO_AUTO_GC; + force_wake_up_gc(c); + } + } + + up_write(&dc->writeback_lock); + + read_dirty(dc); + + if (searched_full_index) { + unsigned int delay = dc->writeback_delay * HZ; + + while (delay && + !kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); + } + } + + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), - error, "reading dirty data from cache"); + cached_dev_put(dc); + wait_for_kthread_stop(); - dirty_endio(bio, error); + return 0; } -static void read_dirty_submit(struct closure *cl) +/* Init */ +#define INIT_KEYS_EACH_TIME 500000 + +struct sectors_dirty_init { + struct btree_op op; + unsigned int inode; + size_t count; +}; + +static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, + struct bkey *k) { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct sectors_dirty_init *op = container_of(_op, + struct sectors_dirty_init, op); + if (KEY_INODE(k) > op->inode) + return MAP_DONE; - trace_bcache_read_dirty(&io->bio); - closure_bio_submit(&io->bio, cl, &io->dc->disk); + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); - continue_at(cl, write_dirty, dirty_wq); + op->count++; + if (!(op->count % INIT_KEYS_EACH_TIME)) + cond_resched(); + + return MAP_CONTINUE; } -static void read_dirty(struct closure *cl) +static int bch_root_node_dirty_init(struct cache_set *c, + struct bcache_device *d, + struct bkey *k) { - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - unsigned delay = writeback_delay(dc, 0); - struct keybuf_key *w; - struct dirty_io *io; + struct sectors_dirty_init op; + int ret; + + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, + 0); + if (ret < 0) + pr_warn("sectors dirty init failed, ret=%d!\n", ret); /* - * XXX: if we error, background writeback just spins. Should use some - * mempools. + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. 
*/ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op.op)->wait); - while (1) { - w = bch_keybuf_next(&dc->writeback_keys); - if (!w) - break; - - BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - - if (delay > 0 && - (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) { - w->private = NULL; + return ret; +} - closure_delay(&dc->writeback, delay); - continue_at(cl, read_dirty, dirty_wq); +static int bch_dirty_init_thread(void *arg) +{ + struct dirty_init_thrd_info *info = arg; + struct bch_dirty_init_state *state = info->state; + struct cache_set *c = state->c; + struct btree_iter_stack iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + prev_idx = 0; + + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + + while (k) { + spin_lock(&state->idx_lock); + cur_idx = state->key_idx; + state->key_idx++; + spin_unlock(&state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter.iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + atomic_set(&state->enough, 1); + /* Update state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; } - dc->last_read = KEY_OFFSET(&w->key); + if (p) { + if (bch_root_node_dirty_init(c, state->d, p) < 0) + goto out; + } - io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), - GFP_KERNEL); - if (!io) - goto err; + p = NULL; + prev_idx = cur_idx; + } - w->private = io; - io->dc = dc; +out: + /* In order to wake up state->wait in time */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&state->started)) + wake_up(&state->wait); - dirty_init(w); - io->bio.bi_sector = PTR_OFFSET(&w->key, 0); - io->bio.bi_bdev = PTR_CACHE(dc->disk.c, - &w->key, 0)->bdev; - io->bio.bi_rw = READ; - io->bio.bi_end_io = read_dirty_endio; + return 0; +} - if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) - goto err_free; +static int bch_btre_dirty_init_thread_nr(void) +{ + int n = num_online_cpus()/2; - pr_debug("%s", pkey(&w->key)); + if (n == 0) + n = 1; + else if (n > BCH_DIRTY_INIT_THRD_MAX) + n = BCH_DIRTY_INIT_THRD_MAX; - closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); + return n; +} - delay = writeback_delay(dc, KEY_SIZE(&w->key)); +void bch_sectors_dirty_init(struct bcache_device *d) +{ + int i; + struct btree *b = NULL; + struct bkey *k = NULL; + struct btree_iter_stack iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; + struct bch_dirty_init_state state; + +retry_lock: + b = c->root; + rw_lock(0, b, b->level); + if (b != c->root) { + rw_unlock(0, b); + goto retry_lock; + } - atomic_inc(&dc->in_flight); + /* Just count root keys if no leaf node */ + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) { + if (KEY_INODE(k) != op.inode) + continue; + sectors_dirty_init_fn(&op.op, c->root, k); + } - if (!closure_wait_event(&dc->writeback_wait, cl, - atomic_read(&dc->in_flight) < 64)) - continue_at(cl, read_dirty, dirty_wq); + rw_unlock(0, b); + return; } - if (0) { -err_free: - kfree(w->private); -err: - bch_keybuf_del(&dc->writeback_keys, w); + memset(&state, 0, sizeof(struct bch_dirty_init_state)); + state.c = c; + state.d = d; + state.total_threads = bch_btre_dirty_init_thread_nr(); + state.key_idx = 0; + 
spin_lock_init(&state.idx_lock); + atomic_set(&state.started, 0); + atomic_set(&state.enough, 0); + init_waitqueue_head(&state.wait); + + for (i = 0; i < state.total_threads; i++) { + /* Fetch latest state.enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&state.enough)) + break; + + atomic_inc(&state.started); + state.infos[i].state = &state; + state.infos[i].thread = + kthread_run(bch_dirty_init_thread, &state.infos[i], + "bch_dirtcnt[%d]", i); + if (IS_ERR(state.infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]\n", i); + atomic_dec(&state.started); + for (--i; i >= 0; i--) + kthread_stop(state.infos[i].thread); + goto out; + } } - refill_dirty(cl); +out: + /* Must wait for all threads to stop. */ + wait_event(state.wait, atomic_read(&state.started) == 0); + rw_unlock(0, b); } void bch_cached_dev_writeback_init(struct cached_dev *dc) { - closure_init_unlocked(&dc->writeback); + sema_init(&dc->in_flight, 64); init_rwsem(&dc->writeback_lock); - - bch_keybuf_init(&dc->writeback_keys, dirty_pred); + bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; - dc->writeback_running = true; + dc->writeback_running = false; + dc->writeback_consider_fragment = true; dc->writeback_percent = 10; dc->writeback_delay = 30; - dc->writeback_rate.rate = 1024; + atomic_long_set(&dc->writeback_rate.rate, 1024); + dc->writeback_rate_minimum = 8; - dc->writeback_rate_update_seconds = 30; - dc->writeback_rate_d_term = 16; - dc->writeback_rate_p_term_inverse = 64; - dc->writeback_rate_d_smooth = 8; + dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_fp_term_low = 1; + dc->writeback_rate_fp_term_mid = 10; + dc->writeback_rate_fp_term_high = 1000; + dc->writeback_rate_i_term_inverse = 10000; - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -} + /* For dc->writeback_lock contention in update_writeback_rate() */ + dc->rate_update_retry = 0; -void bch_writeback_exit(void) -{ - if (dirty_wq) - destroy_workqueue(dirty_wq); + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } -int __init bch_writeback_init(void) +int bch_cached_dev_writeback_start(struct cached_dev *dc) { - dirty_wq = create_singlethread_workqueue("bcache_writeback"); - if (!dirty_wq) + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); + if (!dc->writeback_write_wq) return -ENOMEM; + cached_dev_get(dc); + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); + if (IS_ERR(dc->writeback_thread)) { + cached_dev_put(dc); + destroy_workqueue(dc->writeback_write_wq); + return PTR_ERR(dc->writeback_thread); + } + dc->writeback_running = true; + + WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + bch_writeback_queue(dc); + return 0; } |
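For reference, the fragmentation-aware boost to the proportional term (driven by the writeback_rate_fp_term_low/mid/high defaults of 1/10/1000 initialised in this patch) can be modelled in isolation. In the sketch below, FRAG_LOW/MID/HIGH stand in for BCH_WRITEBACK_FRAGMENT_THRESHOLD_{LOW,MID,HIGH} from writeback.h; the 50/57/64 values and the harness numbers are assumptions for illustration only.

/* Standalone model of the fragmentation-aware boost applied to the
 * proportional term.  Threshold values are assumptions mirroring
 * BCH_WRITEBACK_FRAGMENT_THRESHOLD_{LOW,MID,HIGH}; the 1/10/1000
 * multipliers are the fp_term defaults set in this patch. */
#include <stdint.h>
#include <stdio.h>

#define FRAG_LOW  50
#define FRAG_MID  57
#define FRAG_HIGH 64

static int64_t boost_proportional(int64_t proportional,
                                  int64_t dirty_sectors,
                                  int64_t dirty_buckets,
                                  int64_t bucket_size,
                                  int in_use_percent)
{
        int64_t fragment, fp_term, fps;

        if (in_use_percent <= FRAG_LOW || dirty_sectors <= 0)
                return proportional;

        /* Bucket-sized space pinned per dirty sector: a rough
         * fragmentation measure. */
        fragment = (dirty_buckets * bucket_size) / dirty_sectors;

        if (in_use_percent <= FRAG_MID)
                fp_term = 1 * (in_use_percent - FRAG_LOW);
        else if (in_use_percent <= FRAG_HIGH)
                fp_term = 10 * (in_use_percent - FRAG_MID);
        else
                fp_term = 1000 * (in_use_percent - FRAG_HIGH);

        fps = (dirty_sectors / dirty_buckets) * fp_term;

        /* Only override the p term when fragmentation is real (> 3). */
        if (fragment > 3 && fps > proportional)
                return fps;

        return proportional;
}

int main(void)
{
        /* 100000 dirty sectors spread over 1000 buckets of 1024 sectors
         * at 60% cache use: a p term of 500 is boosted to 3000. */
        printf("%lld\n",
               (long long)boost_proportional(500, 100000, 1000, 1024, 60));
        return 0;
}

The intent, per the comment in __update_writeback_rate(), is to avoid an unreasonably small writeback rate when few dirty sectors pin many buckets, which would otherwise let dirty buckets reach the writeback cutoff while the rate stays at its minimum.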
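Similarly, the idle heuristic behind set_at_max_writeback_rate() reduces to one comparison. A standalone model follows; the device count and counter values in the harness are purely illustrative.

/* Model of the idle check: every attached backing device bumps a shared
 * counter from update_writeback_rate(), and the cache set jumps to the
 * maximum writeback rate once the counter clears dev_nr * dev_nr * 6. */
#include <stdio.h>

static int idle_counter_exceeded(int counter, int dev_nr)
{
        if (dev_nr == 0)
                return 0;
        /* Equivalent (up to truncation) to (counter / dev_nr) >= dev_nr * 6. */
        return counter >= dev_nr * dev_nr * 6;
}

int main(void)
{
        int dev_nr = 3;   /* threshold is 3 * 3 * 6 = 54 */

        for (int counter = 52; counter <= 56; counter++)
                printf("counter %d: %s\n", counter,
                       idle_counter_exceeded(counter, dev_nr) ?
                       "set max writeback rate" : "keep PI-controlled rate");
        return 0;
}

Writing the threshold as dev_nr * dev_nr * 6 keeps the check a single multiply instead of a division, and, as the in-tree comment above idle_counter_exceeded() notes, uses c->attached_dev_nr as a cheap stand-in for the exact number of write-back devices.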