Diffstat (limited to 'block/blk-wbt.c')
| -rw-r--r-- | block/blk-wbt.c | 827 |
1 file changed, 504 insertions, 323 deletions
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 6a9a0f03a67b..eb8037bae0bd 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * buffered writeback throttling. loosely based on CoDel. We can't drop
  * packets for IO scheduling, so the logic is something like this:
@@ -24,11 +25,99 @@
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
 
+#include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-rq-qos.h"
+#include "elevator.h"
+#include "blk.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
 
+enum wbt_flags {
+	WBT_TRACKED	= 1,	/* write, tracked for throttling */
+	WBT_READ	= 2,	/* read */
+	WBT_SWAP	= 4,	/* write, from swap_writeout() */
+	WBT_DISCARD	= 8,	/* discard */
+
+	WBT_NR_BITS	= 4,	/* number of bits */
+};
+
+enum {
+	WBT_RWQ_BG	= 0,
+	WBT_RWQ_SWAP,
+	WBT_RWQ_DISCARD,
+	WBT_NUM_RWQ,
+};
+
+/*
+ * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other
+ * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered
+ * to WBT_STATE_OFF/ON_MANUAL.
+ */
+enum {
+	WBT_STATE_ON_DEFAULT	= 1,	/* on by default */
+	WBT_STATE_ON_MANUAL	= 2,	/* on manually by sysfs */
+	WBT_STATE_OFF_DEFAULT	= 3,	/* off by default */
+	WBT_STATE_OFF_MANUAL	= 4,	/* off manually by sysfs */
+};
+
+struct rq_wb {
+	/*
+	 * Settings that govern how we throttle
+	 */
+	unsigned int wb_background;		/* background writeback */
+	unsigned int wb_normal;			/* normal writeback */
+
+	short enable_state;			/* WBT_STATE_* */
+
+	/*
+	 * Number of consecutive periods where we don't have enough
+	 * information to make a firm scale up/down decision.
+	 */
+	unsigned int unknown_cnt;
+
+	u64 win_nsec;				/* default window size */
+	u64 cur_win_nsec;			/* current window size */
+
+	struct blk_stat_callback *cb;
+
+	u64 sync_issue;
+	void *sync_cookie;
+
+	unsigned long last_issue;		/* issue time of last read rq */
+	unsigned long last_comp;		/* completion time of last read rq */
+	unsigned long min_lat_nsec;
+	struct rq_qos rqos;
+	struct rq_wait rq_wait[WBT_NUM_RWQ];
+	struct rq_depth rq_depth;
+};
+
+static inline struct rq_wb *RQWB(struct rq_qos *rqos)
+{
+	return container_of(rqos, struct rq_wb, rqos);
+}
+
+static inline void wbt_clear_state(struct request *rq)
+{
+	rq->wbt_flags = 0;
+}
+
+static inline enum wbt_flags wbt_flags(struct request *rq)
+{
+	return rq->wbt_flags;
+}
+
+static inline bool wbt_is_tracked(struct request *rq)
+{
+	return rq->wbt_flags & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct request *rq)
+{
+	return rq->wbt_flags & WBT_READ;
+}
+
 enum {
 	/*
 	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
@@ -47,37 +136,17 @@ enum {
 	RWB_MIN_WRITE_SAMPLES	= 3,
 
 	/*
-	 * If we have this number of consecutive windows with not enough
-	 * information to scale up or down, scale up.
+	 * If we have this number of consecutive windows without enough
+	 * information to scale up or down, slowly return to center state
+	 * (step == 0).
 	 */
 	RWB_UNKNOWN_BUMP	= 5,
 };
 
 static inline bool rwb_enabled(struct rq_wb *rwb)
 {
-	return rwb && rwb->wb_normal != 0;
-}
-
-/*
- * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
- * false if 'v' + 1 would be bigger than 'below'.
- */
-static bool atomic_inc_below(atomic_t *v, int below)
-{
-	int cur = atomic_read(v);
-
-	for (;;) {
-		int old;
-
-		if (cur >= below)
-			return false;
-		old = atomic_cmpxchg(v, cur, cur + 1);
-		if (old == cur)
-			break;
-		cur = old;
-	}
-
-	return true;
+	return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
+	       rwb->enable_state != WBT_STATE_OFF_MANUAL;
 }
 
 static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
@@ -96,14 +165,20 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  */
 static bool wb_recent_wait(struct rq_wb *rwb)
 {
-	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
+	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
 
-	return time_before(jiffies, wb->dirty_sleep + HZ);
+	return time_before(jiffies, bdi->last_bdp_sleep + HZ);
 }
 
-static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
+					  enum wbt_flags wb_acct)
 {
-	return &rwb->rq_wait[is_kswapd];
+	if (wb_acct & WBT_SWAP)
+		return &rwb->rq_wait[WBT_RWQ_SWAP];
+	else if (wb_acct & WBT_DISCARD)
+		return &rwb->rq_wait[WBT_RWQ_DISCARD];
+
+	return &rwb->rq_wait[WBT_RWQ_BG];
 }
 
 static void rwb_wake_all(struct rq_wb *rwb)
@@ -113,36 +188,27 @@ static void rwb_wake_all(struct rq_wb *rwb)
 	for (i = 0; i < WBT_NUM_RWQ; i++) {
 		struct rq_wait *rqw = &rwb->rq_wait[i];
 
-		if (waitqueue_active(&rqw->wait))
+		if (wq_has_sleeper(&rqw->wait))
 			wake_up_all(&rqw->wait);
 	}
 }
 
-void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
+			 enum wbt_flags wb_acct)
 {
-	struct rq_wait *rqw;
 	int inflight, limit;
 
-	if (!(wb_acct & WBT_TRACKED))
-		return;
-
-	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
 	inflight = atomic_dec_return(&rqw->inflight);
 
 	/*
-	 * wbt got disabled with IO in flight. Wake up any potential
-	 * waiters, we don't have to do more than that.
+	 * For discards, our limit is always the background. For writes, if
+	 * the device does write back caching, drop further down before we
+	 * wake people up.
 	 */
-	if (unlikely(!rwb_enabled(rwb))) {
-		rwb_wake_all(rwb);
-		return;
-	}
-
-	/*
-	 * If the device does write back caching, drop further down
-	 * before we wake people up.
-	 */
-	if (rwb->wc && !wb_recent_wait(rwb))
+	if (wb_acct & WBT_DISCARD)
		limit = rwb->wb_background;
+	else if (blk_queue_write_cache(rwb->rqos.disk->queue) &&
+		 !wb_recent_wait(rwb))
 		limit = 0;
 	else
 		limit = rwb->wb_normal;
@@ -153,7 +219,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 	if (inflight && inflight >= limit)
 		return;
 
-	if (waitqueue_active(&rqw->wait)) {
+	if (wq_has_sleeper(&rqw->wait)) {
 		int diff = limit - inflight;
 
 		if (!inflight || diff >= rwb->wb_background / 2)
@@ -161,90 +227,40 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 	}
 }
 
-/*
- * Called on completion of a request. Note that it's also called when
- * a request is merged, when the request gets freed.
- */
-void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
 {
-	if (!rwb)
-		return;
+	struct rq_wb *rwb = RQWB(rqos);
+	struct rq_wait *rqw;
 
-	if (!wbt_is_tracked(stat)) {
-		if (rwb->sync_cookie == stat) {
-			rwb->sync_issue = 0;
-			rwb->sync_cookie = NULL;
-		}
+	if (!(wb_acct & WBT_TRACKED))
+		return;
 
-		if (wbt_is_read(stat))
-			wb_timestamp(rwb, &rwb->last_comp);
-		wbt_clear_state(stat);
-	} else {
-		WARN_ON_ONCE(stat == rwb->sync_cookie);
-		__wbt_done(rwb, wbt_stat_to_mask(stat));
-		wbt_clear_state(stat);
-	}
+	rqw = get_rq_wait(rwb, wb_acct);
+	wbt_rqw_done(rwb, rqw, wb_acct);
 }
 
 /*
- * Return true, if we can't increase the depth further by scaling
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed.
 */
-static bool calc_wb_limits(struct rq_wb *rwb)
+static void wbt_done(struct rq_qos *rqos, struct request *rq)
 {
-	unsigned int depth;
-	bool ret = false;
+	struct rq_wb *rwb = RQWB(rqos);
 
-	if (!rwb->min_lat_nsec) {
-		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
-		return false;
-	}
+	if (!wbt_is_tracked(rq)) {
+		if (wbt_is_read(rq)) {
+			if (rwb->sync_cookie == rq) {
+				rwb->sync_issue = 0;
+				rwb->sync_cookie = NULL;
+			}
 
-	/*
-	 * For QD=1 devices, this is a special case. It's important for those
-	 * to have one request ready when one completes, so force a depth of
-	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
-	 * since the device can't have more than that in flight. If we're
-	 * scaling down, then keep a setting of 1/1/1.
-	 */
-	if (rwb->queue_depth == 1) {
-		if (rwb->scale_step > 0)
-			rwb->wb_max = rwb->wb_normal = 1;
-		else {
-			rwb->wb_max = rwb->wb_normal = 2;
-			ret = true;
+			wb_timestamp(rwb, &rwb->last_comp);
 		}
-		rwb->wb_background = 1;
 	} else {
-		/*
-		 * scale_step == 0 is our default state. If we have suffered
-		 * latency spikes, step will be > 0, and we shrink the
-		 * allowed write depths. If step is < 0, we're only doing
-		 * writes, and we allow a temporarily higher depth to
-		 * increase performance.
-		 */
-		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
-		if (rwb->scale_step > 0)
-			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
-		else if (rwb->scale_step < 0) {
-			unsigned int maxd = 3 * rwb->queue_depth / 4;
-
-			depth = 1 + ((depth - 1) << -rwb->scale_step);
-			if (depth > maxd) {
-				depth = maxd;
-				ret = true;
-			}
-		}
-
-		/*
-		 * Set our max/normal/bg queue depths based on how far
-		 * we have scaled down (->scale_step).
-		 */
-		rwb->wb_max = depth;
-		rwb->wb_normal = (rwb->wb_max + 1) / 2;
-		rwb->wb_background = (rwb->wb_max + 3) / 4;
+		WARN_ON_ONCE(rq == rwb->sync_cookie);
+		__wbt_done(rqos, wbt_flags(rq));
 	}
-
-	return ret;
+	wbt_clear_state(rq);
 }
 
 static inline bool stat_sample_valid(struct blk_rq_stat *stat)
@@ -261,13 +277,22 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
 
 static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
 {
-	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+	u64 issue = READ_ONCE(rwb->sync_issue);
 
 	if (!issue || !rwb->sync_cookie)
 		return 0;
 
-	now = ktime_to_ns(ktime_get());
-	return now - issue;
+	return blk_time_get_ns() - issue;
+}
+
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+{
+	unsigned int i, ret = 0;
+
+	for (i = 0; i < WBT_NUM_RWQ; i++)
+		ret += atomic_read(&rwb->rq_wait[i].inflight);
+
+	return ret;
 }
 
 enum {
@@ -279,7 +304,8 @@ enum {
 
 static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
-	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
+	struct rq_depth *rqd = &rwb->rq_depth;
 	u64 thislat;
 
 	/*
@@ -323,7 +349,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 		return LAT_EXCEEDED;
 	}
 
-	if (rwb->scale_step)
+	if (rqd->scale_step)
 		trace_wbt_stat(bdi, stat);
 
 	return LAT_OK;
@@ -331,58 +357,50 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
-	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
+	struct rq_depth *rqd = &rwb->rq_depth;
 
-	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
-			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+	trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
+			rwb->wb_background, rwb->wb_normal, rqd->max_depth);
+}
+
+static void calc_wb_limits(struct rq_wb *rwb)
+{
+	if (rwb->min_lat_nsec == 0) {
+		rwb->wb_normal = rwb->wb_background = 0;
+	} else if (rwb->rq_depth.max_depth <= 2) {
+		rwb->wb_normal = rwb->rq_depth.max_depth;
+		rwb->wb_background = 1;
+	} else {
+		rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
+		rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
+	}
 }
 
 static void scale_up(struct rq_wb *rwb)
 {
-	/*
-	 * Hit max in previous round, stop here
-	 */
-	if (rwb->scaled_max)
+	if (!rq_depth_scale_up(&rwb->rq_depth))
 		return;
-
-	rwb->scale_step--;
+	calc_wb_limits(rwb);
 	rwb->unknown_cnt = 0;
-
-	rwb->scaled_max = calc_wb_limits(rwb);
-
 	rwb_wake_all(rwb);
-
-	rwb_trace_step(rwb, "step up");
+	rwb_trace_step(rwb, tracepoint_string("scale up"));
 }
 
-/*
- * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
- */
 static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 {
-	/*
-	 * Stop scaling down when we've hit the limit. This also prevents
-	 * ->scale_step from going to crazy values, if the device can't
-	 * keep up.
-	 */
-	if (rwb->wb_max == 1)
+	if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
 		return;
-
-	if (rwb->scale_step < 0 && hard_throttle)
-		rwb->scale_step = 0;
-	else
-		rwb->scale_step++;
-
-	rwb->scaled_max = false;
-	rwb->unknown_cnt = 0;
 	calc_wb_limits(rwb);
-	rwb_trace_step(rwb, "step down");
+	rwb->unknown_cnt = 0;
+	rwb_trace_step(rwb, tracepoint_string("scale down"));
 }
 
 static void rwb_arm_timer(struct rq_wb *rwb)
 {
-	if (rwb->scale_step > 0) {
+	struct rq_depth *rqd = &rwb->rq_depth;
+
+	if (rqd->scale_step > 0) {
 		/*
 		 * We should speed this up, using some variant of a fast
 		 * integer inverse square root calculation. Since we only do
@@ -390,7 +408,7 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 		 * though.
 		 */
 		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
-					int_sqrt((rwb->scale_step + 1) << 8));
+					int_sqrt((rqd->scale_step + 1) << 8));
 	} else {
 		/*
 		 * For step < 0, we don't want to increase/decrease the
@@ -405,13 +423,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 static void wb_timer_fn(struct blk_stat_callback *cb)
 {
 	struct rq_wb *rwb = cb->data;
+	struct rq_depth *rqd = &rwb->rq_depth;
 	unsigned int inflight = wbt_inflight(rwb);
 	int status;
 
+	if (!rwb->rqos.disk)
+		return;
+
 	status = latency_exceeded(rwb, cb->stat);
 
-	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
-			inflight);
+	trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight);
 
 	/*
 	 * If we exceeded the latency target, step down. If we did not,
@@ -427,9 +448,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
 		break;
 	case LAT_UNKNOWN_WRITES:
 		/*
-		 * We started a the center step, but don't have a valid
-		 * read/write sample, but we do have writes going on.
-		 * Allow step to go negative, to increase write perf.
+		 * We don't have a valid read/write sample, but we do have
+		 * writes going on. Allow step to go negative, to increase
+		 * write performance.
 		 */
 		scale_up(rwb);
 		break;
@@ -441,9 +462,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
 		 * currently don't have a valid read/write sample. For that
 		 * case, slowly return to center state (step == 0).
 		 */
-		if (rwb->scale_step > 0)
+		if (rqd->scale_step > 0)
 			scale_up(rwb);
-		else if (rwb->scale_step < 0)
+		else if (rqd->scale_step < 0)
 			scale_down(rwb, false);
 		break;
 	default:
@@ -453,19 +474,54 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
 	/*
 	 * Re-arm timer, if we have IO in flight
 	 */
-	if (rwb->scale_step || inflight)
+	if (rqd->scale_step || inflight)
 		rwb_arm_timer(rwb);
 }
 
-void wbt_update_limits(struct rq_wb *rwb)
+static void wbt_update_limits(struct rq_wb *rwb)
 {
-	rwb->scale_step = 0;
-	rwb->scaled_max = false;
+	struct rq_depth *rqd = &rwb->rq_depth;
+
+	rqd->scale_step = 0;
+	rqd->scaled_max = false;
+
+	rq_depth_calc_max_depth(rqd);
 	calc_wb_limits(rwb);
 
 	rwb_wake_all(rwb);
 }
 
+bool wbt_disabled(struct request_queue *q)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+
+	return !rqos || !rwb_enabled(RQWB(rqos));
+}
+
+u64 wbt_get_min_lat(struct request_queue *q)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	if (!rqos)
+		return 0;
+	return RQWB(rqos)->min_lat_nsec;
+}
+
+void wbt_set_min_lat(struct request_queue *q, u64 val)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	if (!rqos)
+		return;
+
+	RQWB(rqos)->min_lat_nsec = val;
+	if (val)
+		RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+	else
+		RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;
+
+	wbt_update_limits(RQWB(rqos));
+}
+
+
 static bool close_io(struct rq_wb *rwb)
 {
 	const unsigned long now = jiffies;
@@ -474,23 +530,26 @@ static bool close_io(struct rq_wb *rwb)
 		time_before(now, rwb->last_comp + HZ / 10);
 }
 
-#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)
+#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP)
 
-static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
 {
 	unsigned int limit;
 
+	if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
+		return rwb->wb_background;
+
 	/*
 	 * At this point we know it's a buffered write. If this is
-	 * kswapd trying to free memory, or REQ_SYNC is set, set, then
+	 * swap trying to free memory, or REQ_SYNC is set, then
 	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
 	 * that. If the write is marked as a background write, then use
 	 * the idle limit, or go to normal if we haven't had competing
 	 * IO for a bit.
 	 */
-	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
-		limit = rwb->wb_max;
-	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
+	if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb))
+		limit = rwb->rq_depth.max_depth;
+	else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
 		/*
 		 * If less than 100ms since we completed unrelated IO,
 		 * limit us to half the depth for background writeback.
@@ -502,185 +561,171 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	return limit;
 }
 
-static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
-			     wait_queue_entry_t *wait, unsigned long rw)
-{
-	/*
-	 * inc it here even if disabled, since we'll dec it at completion.
-	 * this only happens if the task was sleeping in __wbt_wait(),
-	 * and someone turned it off at the same time.
-	 */
-	if (!rwb_enabled(rwb)) {
-		atomic_inc(&rqw->inflight);
-		return true;
-	}
+struct wbt_wait_data {
+	struct rq_wb *rwb;
+	enum wbt_flags wb_acct;
+	blk_opf_t opf;
+};
 
-	/*
-	 * If the waitqueue is already active and we are not the next
-	 * in line to be woken up, wait for our turn.
-	 */
-	if (waitqueue_active(&rqw->wait) &&
-	    rqw->wait.head.next != &wait->entry)
-		return false;
+static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
+{
+	struct wbt_wait_data *data = private_data;
+	return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
+}
 
-	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
+static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
+{
+	struct wbt_wait_data *data = private_data;
+	wbt_rqw_done(data->rwb, rqw, data->wb_acct);
 }
 
 /*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
-static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
-	__releases(lock)
-	__acquires(lock)
+static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
+		       blk_opf_t opf)
 {
-	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
-	DEFINE_WAIT(wait);
-
-	if (may_queue(rwb, rqw, &wait, rw))
-		return;
-
-	do {
-		prepare_to_wait_exclusive(&rqw->wait, &wait,
-						TASK_UNINTERRUPTIBLE);
+	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
+	struct wbt_wait_data data = {
+		.rwb = rwb,
+		.wb_acct = wb_acct,
+		.opf = opf,
+	};
 
-		if (may_queue(rwb, rqw, &wait, rw))
-			break;
-
-		if (lock) {
-			spin_unlock_irq(lock);
-			io_schedule();
-			spin_lock_irq(lock);
-		} else
-			io_schedule();
-	} while (1);
-
-	finish_wait(&rqw->wait, &wait);
+	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
 }
 
-static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
+static inline bool wbt_should_throttle(struct bio *bio)
 {
-	const int op = bio_op(bio);
-
-	/*
-	 * If not a WRITE, do nothing
-	 */
-	if (op != REQ_OP_WRITE)
-		return false;
-
-	/*
-	 * Don't throttle WRITE_ODIRECT
-	 */
-	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
+	switch (bio_op(bio)) {
+	case REQ_OP_WRITE:
+		/*
+		 * Don't throttle WRITE_ODIRECT
+		 */
+		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
+		    (REQ_SYNC | REQ_IDLE))
+			return false;
+		fallthrough;
+	case REQ_OP_DISCARD:
+		return true;
+	default:
 		return false;
-
-	return true;
+	}
 }
 
-/*
- * Returns true if the IO request should be accounted, false if not.
- * May sleep, if we have exceeded the writeback limits. Caller can pass
- * in an irq held spinlock, if it holds one when calling this function.
- * If we do sleep, we'll release and re-grab it.
- */
-enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
 {
-	unsigned int ret = 0;
+	enum wbt_flags flags = 0;
 
 	if (!rwb_enabled(rwb))
 		return 0;
 
-	if (bio_op(bio) == REQ_OP_READ)
-		ret = WBT_READ;
+	if (bio_op(bio) == REQ_OP_READ) {
+		flags = WBT_READ;
+	} else if (wbt_should_throttle(bio)) {
+		if (bio->bi_opf & REQ_SWAP)
+			flags |= WBT_SWAP;
+		if (bio_op(bio) == REQ_OP_DISCARD)
+			flags |= WBT_DISCARD;
+		flags |= WBT_TRACKED;
+	}
+	return flags;
+}
+
+static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+	struct rq_wb *rwb = RQWB(rqos);
+	enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
+	__wbt_done(rqos, flags);
+}
+
+/* May sleep, if we have exceeded the writeback limits. */
+static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
+{
+	struct rq_wb *rwb = RQWB(rqos);
+	enum wbt_flags flags;
 
-	if (!wbt_should_throttle(rwb, bio)) {
-		if (ret & WBT_READ)
+	flags = bio_to_wbt_flags(rwb, bio);
+	if (!(flags & WBT_TRACKED)) {
+		if (flags & WBT_READ)
 			wb_timestamp(rwb, &rwb->last_issue);
-		return ret;
+		return;
 	}
 
-	__wbt_wait(rwb, bio->bi_opf, lock);
+	__wbt_wait(rwb, flags, bio->bi_opf);
 
 	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
+}
 
-	if (current_is_kswapd())
-		ret |= WBT_KSWAPD;
-
-	return ret | WBT_TRACKED;
+static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
+{
+	struct rq_wb *rwb = RQWB(rqos);
+	rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
 }
 
-void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static void wbt_issue(struct rq_qos *rqos, struct request *rq)
 {
+	struct rq_wb *rwb = RQWB(rqos);
+
 	if (!rwb_enabled(rwb))
 		return;
 
 	/*
-	 * Track sync issue, in case it takes a long time to complete. Allows
-	 * us to react quicker, if a sync IO takes a long time to complete.
-	 * Note that this is just a hint. 'stat' can go away when the
-	 * request completes, so it's important we never dereference it. We
-	 * only use the address to compare with, which is why we store the
-	 * sync_issue time locally.
+	 * Track sync issue, in case it takes a long time to complete. Allows us
+	 * to react quicker, if a sync IO takes a long time to complete. Note
+	 * that this is just a hint. The request can go away when it completes,
+	 * so it's important we never dereference it. We only use the address to
+	 * compare with, which is why we store the sync_issue time locally.
 	 */
-	if (wbt_is_read(stat) && !rwb->sync_issue) {
-		rwb->sync_cookie = stat;
-		rwb->sync_issue = blk_stat_time(stat);
+	if (wbt_is_read(rq) && !rwb->sync_issue) {
+		rwb->sync_cookie = rq;
+		rwb->sync_issue = rq->io_start_time_ns;
 	}
 }
 
-void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
 {
+	struct rq_wb *rwb = RQWB(rqos);
+
 	if (!rwb_enabled(rwb))
 		return;
 
-	if (stat == rwb->sync_cookie) {
+	if (rq == rwb->sync_cookie) {
 		rwb->sync_issue = 0;
 		rwb->sync_cookie = NULL;
 	}
 }
 
-void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
-{
-	if (rwb) {
-		rwb->queue_depth = depth;
-		wbt_update_limits(rwb);
-	}
-}
-
-void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
-{
-	if (rwb)
-		rwb->wc = write_cache_on;
-}
-
 /*
- * Disable wbt, if enabled by default. Only called from CFQ.
+ * Enable wbt if defaults are configured that way
 */
-void wbt_disable_default(struct request_queue *q)
+void wbt_enable_default(struct gendisk *disk)
 {
-	struct rq_wb *rwb = q->rq_wb;
+	struct request_queue *q = disk->queue;
+	struct rq_qos *rqos;
+	bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
 
-	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
-		wbt_exit(q);
-}
-EXPORT_SYMBOL_GPL(wbt_disable_default);
+	mutex_lock(&disk->rqos_state_mutex);
+
+	if (blk_queue_disable_wbt(q))
+		enable = false;
 
-/*
- * Enable wbt if defaults are configured that way
- */
-void wbt_enable_default(struct request_queue *q)
-{
 	/* Throttling already enabled? */
-	if (q->rq_wb)
+	rqos = wbt_rq_qos(q);
+	if (rqos) {
+		if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
+			RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
+		mutex_unlock(&disk->rqos_state_mutex);
 		return;
+	}
+	mutex_unlock(&disk->rqos_state_mutex);
 
 	/* Queue not registered? Maybe shutting down... */
-	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+	if (!blk_queue_registered(q))
 		return;
 
-	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
-	    (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
-		wbt_init(q);
+	if (queue_is_mq(q) && enable)
+		wbt_init(disk);
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
 
@@ -698,15 +743,159 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-	return rq_data_dir(rq);
+	const enum req_op op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op_is_write(op))
+		return WRITE;
+
+	/* don't account */
+	return -1;
+}
+
+static void wbt_queue_depth_changed(struct rq_qos *rqos)
+{
+	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
+	wbt_update_limits(RQWB(rqos));
 }
 
-int wbt_init(struct request_queue *q)
+static void wbt_exit(struct rq_qos *rqos)
 {
+	struct rq_wb *rwb = RQWB(rqos);
+
+	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+}
+
+/*
+ * Disable wbt, if enabled by default.
+ */
+void wbt_disable_default(struct gendisk *disk)
+{
+	struct rq_qos *rqos = wbt_rq_qos(disk->queue);
 	struct rq_wb *rwb;
+
+	if (!rqos)
+		return;
+	mutex_lock(&disk->rqos_state_mutex);
+	rwb = RQWB(rqos);
+	if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
+		blk_stat_deactivate(rwb->cb);
+		rwb->enable_state = WBT_STATE_OFF_DEFAULT;
+	}
+	mutex_unlock(&disk->rqos_state_mutex);
+}
+EXPORT_SYMBOL_GPL(wbt_disable_default);
+
+#ifdef CONFIG_BLK_DEBUG_FS
+static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%llu\n", rwb->cur_win_nsec);
+	return 0;
+}
+
+static int wbt_enabled_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%d\n", rwb->enable_state);
+	return 0;
+}
+
+static int wbt_id_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+
+	seq_printf(m, "%u\n", rqos->id);
+	return 0;
+}
+
+static int wbt_inflight_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
 	int i;
 
-	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
+	for (i = 0; i < WBT_NUM_RWQ; i++)
+		seq_printf(m, "%d: inflight %d\n", i,
+			   atomic_read(&rwb->rq_wait[i].inflight));
+	return 0;
+}
+
+static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%lu\n", rwb->min_lat_nsec);
+	return 0;
+}
+
+static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%u\n", rwb->unknown_cnt);
+	return 0;
+}
+
+static int wbt_normal_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%u\n", rwb->wb_normal);
+	return 0;
+}
+
+static int wbt_background_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%u\n", rwb->wb_background);
+	return 0;
+}
+
+static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
+	{"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
+	{"enabled", 0400, wbt_enabled_show},
+	{"id", 0400, wbt_id_show},
+	{"inflight", 0400, wbt_inflight_show},
+	{"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
+	{"unknown_cnt", 0400, wbt_unknown_cnt_show},
+	{"wb_normal", 0400, wbt_normal_show},
+	{"wb_background", 0400, wbt_background_show},
+	{},
+};
+#endif
+
+static const struct rq_qos_ops wbt_rqos_ops = {
+	.throttle = wbt_wait,
+	.issue = wbt_issue,
+	.track = wbt_track,
+	.requeue = wbt_requeue,
+	.done = wbt_done,
+	.cleanup = wbt_cleanup,
+	.queue_depth_changed = wbt_queue_depth_changed,
+	.exit = wbt_exit,
+#ifdef CONFIG_BLK_DEBUG_FS
+	.debugfs_attrs = wbt_debugfs_attrs,
+#endif
+};
+
+int wbt_init(struct gendisk *disk)
+{
+	struct request_queue *q = disk->queue;
+	struct rq_wb *rwb;
+	int i;
+	int ret;
 
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
@@ -718,41 +907,33 @@ int wbt_init(struct request_queue *q)
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < WBT_NUM_RWQ; i++) {
-		atomic_set(&rwb->rq_wait[i].inflight, 0);
-		init_waitqueue_head(&rwb->rq_wait[i].wait);
-	}
+	for (i = 0; i < WBT_NUM_RWQ; i++)
+		rq_wait_init(&rwb->rq_wait[i]);
 
-	rwb->wc = 1;
-	rwb->queue_depth = RWB_DEF_DEPTH;
 	rwb->last_comp = rwb->last_issue = jiffies;
-	rwb->queue = q;
 	rwb->win_nsec = RWB_WINDOW_NSEC;
 	rwb->enable_state = WBT_STATE_ON_DEFAULT;
+	rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
+	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+	rwb->rq_depth.queue_depth = blk_queue_depth(q);
 	wbt_update_limits(rwb);
 
 	/*
	 * Assign rwb and add the stats callback.
	 */
-	q->rq_wb = rwb;
-	blk_stat_add_callback(q, rwb->cb);
+	mutex_lock(&q->rq_qos_mutex);
+	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+	mutex_unlock(&q->rq_qos_mutex);
+	if (ret)
+		goto err_free;
 
-	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
-
-	wbt_set_queue_depth(rwb, blk_queue_depth(q));
-	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+	blk_stat_add_callback(q, rwb->cb);
 
 	return 0;
-}
 
-void wbt_exit(struct request_queue *q)
-{
-	struct rq_wb *rwb = q->rq_wb;
+err_free:
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+	return ret;
 
-	if (rwb) {
-		blk_stat_remove_callback(q, rwb->cb);
-		blk_stat_free_callback(rwb->cb);
-		q->rq_wb = NULL;
-		kfree(rwb);
-	}
 }
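
The atomic_inc_below() helper deleted above is not lost: the new wbt_inflight_cb() calls rq_wait_inc_below(), which is the same "increment only if still under the limit" compare-and-swap loop moved into the shared rq-qos code. Below is a minimal user-space sketch of that loop using C11 atomics in place of the kernel's atomic_t API; inc_below() and the values in main() are made up for the demo, not kernel code.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Increment *v only while its current value is still below the limit.
 * Mirrors the CAS loop of the removed atomic_inc_below() and of the
 * rq_wait_inc_below() helper that replaces it. */
static bool inc_below(atomic_int *v, int below)
{
	int cur = atomic_load(v);

	for (;;) {
		if (cur >= below)
			return false;	/* already at the limit, refuse */
		/* On failure 'cur' is refreshed with the current value,
		 * so the limit is re-checked before the next attempt. */
		if (atomic_compare_exchange_weak(v, &cur, cur + 1))
			return true;
	}
}

int main(void)
{
	atomic_int inflight = 6;	/* pretend 6 writes are in flight */

	/* With a limit of 8, two more increments fit; the third fails. */
	printf("%d\n", inc_below(&inflight, 8));	/* 1: 6 -> 7 */
	printf("%d\n", inc_below(&inflight, 8));	/* 1: 7 -> 8 */
	printf("%d\n", inc_below(&inflight, 8));	/* 0: at limit */
	return 0;
}
```

The loop never sleeps and never overshoots the limit, which is exactly what wbt needs on the hot throttling path: a failed attempt simply sends the task into rq_qos_wait() to sleep on the waitqueue.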
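The monitoring-window shrink in rwb_arm_timer() survives the diff unchanged apart from scale_step moving into struct rq_depth. The expression cur_win_nsec = (win_nsec << 4) / int_sqrt((scale_step + 1) << 8) is win_nsec / sqrt(scale_step + 1) in fixed point, so the sampling window shrinks gently as the throttling step climbs. A worked sketch follows, with a naive isqrt() standing in for the kernel's int_sqrt(); the helper and the printed values are illustrative only.

```c
#include <stdint.h>
#include <stdio.h>

/* Naive integer square root; the kernel's int_sqrt() computes the
 * same result, just faster. */
static uint64_t isqrt(uint64_t x)
{
	uint64_t r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	const uint64_t win_nsec = 100000000ULL;	/* 100 ms, the RWB_WINDOW_NSEC default */

	for (int step = 0; step <= 4; step++) {
		/* Same fixed-point arithmetic as rwb_arm_timer(). */
		uint64_t win = (win_nsec << 4) /
			       isqrt(((uint64_t)step + 1) << 8);

		/* step 0 -> 100 ms, step 1 -> ~72 ms, step 3 -> 50 ms */
		printf("scale_step=%d window=%llu ms\n", step,
		       (unsigned long long)(win / 1000000));
	}
	return 0;
}
```

Shrinking the window rather than the target latency means a heavily throttled device gets re-evaluated more often, so wbt can step back up quickly once latencies recover.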