diff options
Diffstat (limited to 'block/blk-wbt.c')
| -rw-r--r-- | block/blk-wbt.c | 215 |
1 files changed, 135 insertions, 80 deletions
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 68a774d7a7c9..eb8037bae0bd 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,13 +25,79 @@ #include <linux/backing-dev.h> #include <linux/swap.h> +#include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" +#include "blk.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_SWAP = 4, /* write, from swap_writeout() */ + WBT_DISCARD = 8, /* discard */ + + WBT_NR_BITS = 4, /* number of bits */ +}; + +enum { + WBT_RWQ_BG = 0, + WBT_RWQ_SWAP, + WBT_RWQ_DISCARD, + WBT_NUM_RWQ, +}; + +/* + * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other + * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered + * to WBT_STATE_OFF/ON_MANUAL. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. + */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct blk_stat_callback *cb; + + u64 sync_issue; + void *sync_cookie; + + unsigned long last_issue; /* issue time of last read rq */ + unsigned long last_comp; /* completion time of last read rq */ + unsigned long min_lat_nsec; + struct rq_qos rqos; + struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; +}; + +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; @@ -70,8 +136,9 @@ enum { RWB_MIN_WRITE_SAMPLES = 3, /* - * If we have this number of consecutive windows with not enough - * information to scale up or down, scale up. + * If we have this number of consecutive windows without enough + * information to scale up or down, slowly return to center state + * (step == 0). */ RWB_UNKNOWN_BUMP = 5, }; @@ -79,7 +146,7 @@ enum { static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && - rwb->wb_normal != 0; + rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) @@ -98,16 +165,16 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; - return time_before(jiffies, wb->dirty_sleep + HZ); + return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { - if (wb_acct & WBT_KSWAPD) - return &rwb->rq_wait[WBT_RWQ_KSWAPD]; + if (wb_acct & WBT_SWAP) + return &rwb->rq_wait[WBT_RWQ_SWAP]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; @@ -134,22 +201,14 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, inflight = atomic_dec_return(&rqw->inflight); /* - * wbt got disabled with IO in flight. Wake up any potential - * waiters, we don't have to do more than that. - */ - if (unlikely(!rwb_enabled(rwb))) { - rwb_wake_all(rwb); - return; - } - - /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up. */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (rwb->wc && !wb_recent_wait(rwb)) + else if (blk_queue_write_cache(rwb->rqos.disk->queue) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -189,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq) struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { - if (rwb->sync_cookie == rq) { - rwb->sync_issue = 0; - rwb->sync_cookie = NULL; - } + if (wbt_is_read(rq)) { + if (rwb->sync_cookie == rq) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } - if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); + } } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); @@ -217,13 +277,22 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = READ_ONCE(rwb->sync_issue); + u64 issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; - now = ktime_to_ns(ktime_get()); - return now - issue; + return blk_time_get_ns() - issue; +} + +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; } enum { @@ -235,7 +304,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -288,7 +357,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -358,13 +427,12 @@ static void wb_timer_fn(struct blk_stat_callback *cb) unsigned int inflight = wbt_inflight(rwb); int status; - if (!rwb->rqos.q->disk) + if (!rwb->rqos.disk) return; status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, - inflight); + trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, @@ -380,9 +448,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) break; case LAT_UNKNOWN_WRITES: /* - * We started a the center step, but don't have a valid - * read/write sample, but we do have writes going on. - * Allow step to go negative, to increase write perf. + * We don't have a valid read/write sample, but we do have + * writes going on. Allow step to go negative, to increase + * write performance. */ scale_up(rwb); break; @@ -427,8 +495,7 @@ bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); - return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || - RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; + return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) @@ -463,31 +530,24 @@ static bool close_io(struct rq_wb *rwb) time_before(now, rwb->last_comp + HZ / 10); } -#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; - /* - * If we got disabled, just return UINT_MAX. This ensures that - * we'll properly inc a new IO, and dec+wakeup at the end. - */ - if (!rwb_enabled(rwb)) - return UINT_MAX; - if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; /* * At this point we know it's a buffered write. If this is - * kswapd trying to free memory, or REQ_SYNC is set, then + * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ - if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb)) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* @@ -564,8 +624,8 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { - if (current_is_kswapd()) - flags |= WBT_KSWAPD; + if (bio->bi_opf & REQ_SWAP) + flags |= WBT_SWAP; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; @@ -580,11 +640,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) __wbt_done(rqos, flags); } -/* - * May sleep, if we have exceeded the writeback limits. Caller can pass - * in an irq held spinlock, if it holds one when calling this function. - * If we do sleep, we'll release and re-grab it. - */ +/* May sleep, if we have exceeded the writeback limits. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); @@ -640,37 +696,36 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } -void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) -{ - struct rq_qos *rqos = wbt_rq_qos(q); - if (rqos) - RQWB(rqos)->wc = write_cache_on; -} - /* * Enable wbt if defaults are configured that way */ -void wbt_enable_default(struct request_queue *q) +void wbt_enable_default(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_qos *rqos; - bool disable_flag = q->elevator && - test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags); + bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); + + mutex_lock(&disk->rqos_state_mutex); + + if (blk_queue_disable_wbt(q)) + enable = false; /* Throttling already enabled? */ rqos = wbt_rq_qos(q); if (rqos) { - if (!disable_flag && - RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; + mutex_unlock(&disk->rqos_state_mutex); return; } + mutex_unlock(&disk->rqos_state_mutex); /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) return; - if (queue_is_mq(q) && !disable_flag) - wbt_init(q); + if (queue_is_mq(q) && enable) + wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); @@ -701,16 +756,15 @@ static int wbt_data_dir(const struct request *rq) static void wbt_queue_depth_changed(struct rq_qos *rqos) { - RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q); + RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); - struct request_queue *q = rqos->q; - blk_stat_remove_callback(q, rwb->cb); + blk_stat_remove_callback(rqos->disk->queue, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } @@ -718,17 +772,19 @@ static void wbt_exit(struct rq_qos *rqos) /* * Disable wbt, if enabled by default. */ -void wbt_disable_default(struct request_queue *q) +void wbt_disable_default(struct gendisk *disk) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; + mutex_lock(&disk->rqos_state_mutex); rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } + mutex_unlock(&disk->rqos_state_mutex); } EXPORT_SYMBOL_GPL(wbt_disable_default); @@ -820,7 +876,7 @@ static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { }; #endif -static struct rq_qos_ops wbt_rqos_ops = { +static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, @@ -834,8 +890,9 @@ static struct rq_qos_ops wbt_rqos_ops = { #endif }; -int wbt_init(struct request_queue *q) +int wbt_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_wb *rwb; int i; int ret; @@ -853,22 +910,20 @@ int wbt_init(struct request_queue *q) for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); - rwb->rqos.id = RQ_QOS_WBT; - rwb->rqos.ops = &wbt_rqos_ops; - rwb->rqos.q = q; rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); - - wbt_queue_depth_changed(&rwb->rqos); + rwb->rq_depth.queue_depth = blk_queue_depth(q); + wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ - ret = rq_qos_add(q, &rwb->rqos); + mutex_lock(&q->rq_qos_mutex); + ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); + mutex_unlock(&q->rq_qos_mutex); if (ret) goto err_free; |
