summaryrefslogtreecommitdiff
path: root/block/blk-wbt.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-wbt.c')
-rw-r--r--block/blk-wbt.c215
1 files changed, 135 insertions, 80 deletions
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 68a774d7a7c9..eb8037bae0bd 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -25,13 +25,79 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
+#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"
+#include "blk.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
+enum wbt_flags {
+ WBT_TRACKED = 1, /* write, tracked for throttling */
+ WBT_READ = 2, /* read */
+ WBT_SWAP = 4, /* write, from swap_writeout() */
+ WBT_DISCARD = 8, /* discard */
+
+ WBT_NR_BITS = 4, /* number of bits */
+};
+
+enum {
+ WBT_RWQ_BG = 0,
+ WBT_RWQ_SWAP,
+ WBT_RWQ_DISCARD,
+ WBT_NUM_RWQ,
+};
+
+/*
+ * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other
+ * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered
+ * to WBT_STATE_OFF/ON_MANUAL.
+ */
+enum {
+ WBT_STATE_ON_DEFAULT = 1, /* on by default */
+ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */
+ WBT_STATE_OFF_DEFAULT = 3, /* off by default */
+ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */
+};
+
+struct rq_wb {
+ /*
+ * Settings that govern how we throttle
+ */
+ unsigned int wb_background; /* background writeback */
+ unsigned int wb_normal; /* normal writeback */
+
+ short enable_state; /* WBT_STATE_* */
+
+ /*
+ * Number of consecutive periods where we don't have enough
+ * information to make a firm scale up/down decision.
+ */
+ unsigned int unknown_cnt;
+
+ u64 win_nsec; /* default window size */
+ u64 cur_win_nsec; /* current window size */
+
+ struct blk_stat_callback *cb;
+
+ u64 sync_issue;
+ void *sync_cookie;
+
+ unsigned long last_issue; /* issue time of last read rq */
+ unsigned long last_comp; /* completion time of last read rq */
+ unsigned long min_lat_nsec;
+ struct rq_qos rqos;
+ struct rq_wait rq_wait[WBT_NUM_RWQ];
+ struct rq_depth rq_depth;
+};
+
+static inline struct rq_wb *RQWB(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_wb, rqos);
+}
+
static inline void wbt_clear_state(struct request *rq)
{
rq->wbt_flags = 0;
@@ -70,8 +136,9 @@ enum {
RWB_MIN_WRITE_SAMPLES = 3,
/*
- * If we have this number of consecutive windows with not enough
- * information to scale up or down, scale up.
+ * If we have this number of consecutive windows without enough
+ * information to scale up or down, slowly return to center state
+ * (step == 0).
*/
RWB_UNKNOWN_BUMP = 5,
};
@@ -79,7 +146,7 @@ enum {
static inline bool rwb_enabled(struct rq_wb *rwb)
{
return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
- rwb->wb_normal != 0;
+ rwb->enable_state != WBT_STATE_OFF_MANUAL;
}
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
@@ -98,16 +165,16 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
enum wbt_flags wb_acct)
{
- if (wb_acct & WBT_KSWAPD)
- return &rwb->rq_wait[WBT_RWQ_KSWAPD];
+ if (wb_acct & WBT_SWAP)
+ return &rwb->rq_wait[WBT_RWQ_SWAP];
else if (wb_acct & WBT_DISCARD)
return &rwb->rq_wait[WBT_RWQ_DISCARD];
@@ -134,22 +201,14 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
inflight = atomic_dec_return(&rqw->inflight);
/*
- * wbt got disabled with IO in flight. Wake up any potential
- * waiters, we don't have to do more than that.
- */
- if (unlikely(!rwb_enabled(rwb))) {
- rwb_wake_all(rwb);
- return;
- }
-
- /*
* For discards, our limit is always the background. For writes, if
* the device does write back caching, drop further down before we
* wake people up.
*/
if (wb_acct & WBT_DISCARD)
limit = rwb->wb_background;
- else if (rwb->wc && !wb_recent_wait(rwb))
+ else if (blk_queue_write_cache(rwb->rqos.disk->queue) &&
+ !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -189,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq)
struct rq_wb *rwb = RQWB(rqos);
if (!wbt_is_tracked(rq)) {
- if (rwb->sync_cookie == rq) {
- rwb->sync_issue = 0;
- rwb->sync_cookie = NULL;
- }
+ if (wbt_is_read(rq)) {
+ if (rwb->sync_cookie == rq) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
- if (wbt_is_read(rq))
wb_timestamp(rwb, &rwb->last_comp);
+ }
} else {
WARN_ON_ONCE(rq == rwb->sync_cookie);
__wbt_done(rqos, wbt_flags(rq));
@@ -217,13 +277,22 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
- u64 now, issue = READ_ONCE(rwb->sync_issue);
+ u64 issue = READ_ONCE(rwb->sync_issue);
if (!issue || !rwb->sync_cookie)
return 0;
- now = ktime_to_ns(ktime_get());
- return now - issue;
+ return blk_time_get_ns() - issue;
+}
+
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+{
+ unsigned int i, ret = 0;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++)
+ ret += atomic_read(&rwb->rq_wait[i].inflight);
+
+ return ret;
}
enum {
@@ -235,7 +304,7 @@ enum {
static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
- struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
struct rq_depth *rqd = &rwb->rq_depth;
u64 thislat;
@@ -288,7 +357,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
- struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
struct rq_depth *rqd = &rwb->rq_depth;
trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
@@ -358,13 +427,12 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
unsigned int inflight = wbt_inflight(rwb);
int status;
- if (!rwb->rqos.q->disk)
+ if (!rwb->rqos.disk)
return;
status = latency_exceeded(rwb, cb->stat);
- trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step,
- inflight);
+ trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight);
/*
* If we exceeded the latency target, step down. If we did not,
@@ -380,9 +448,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
break;
case LAT_UNKNOWN_WRITES:
/*
- * We started a the center step, but don't have a valid
- * read/write sample, but we do have writes going on.
- * Allow step to go negative, to increase write perf.
+ * We don't have a valid read/write sample, but we do have
+ * writes going on. Allow step to go negative, to increase
+ * write performance.
*/
scale_up(rwb);
break;
@@ -427,8 +495,7 @@ bool wbt_disabled(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
- return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT ||
- RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL;
+ return !rqos || !rwb_enabled(RQWB(rqos));
}
u64 wbt_get_min_lat(struct request_queue *q)
@@ -463,31 +530,24 @@ static bool close_io(struct rq_wb *rwb)
time_before(now, rwb->last_comp + HZ / 10);
}
-#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
+#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP)
static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
unsigned int limit;
- /*
- * If we got disabled, just return UINT_MAX. This ensures that
- * we'll properly inc a new IO, and dec+wakeup at the end.
- */
- if (!rwb_enabled(rwb))
- return UINT_MAX;
-
if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
return rwb->wb_background;
/*
* At this point we know it's a buffered write. If this is
- * kswapd trying to free memory, or REQ_SYNC is set, then
+ * swap trying to free memory, or REQ_SYNC is set, then
* it's WB_SYNC_ALL writeback, and we'll use the max limit for
* that. If the write is marked as a background write, then use
* the idle limit, or go to normal if we haven't had competing
* IO for a bit.
*/
- if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb))
limit = rwb->rq_depth.max_depth;
else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
/*
@@ -564,8 +624,8 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
if (bio_op(bio) == REQ_OP_READ) {
flags = WBT_READ;
} else if (wbt_should_throttle(bio)) {
- if (current_is_kswapd())
- flags |= WBT_KSWAPD;
+ if (bio->bi_opf & REQ_SWAP)
+ flags |= WBT_SWAP;
if (bio_op(bio) == REQ_OP_DISCARD)
flags |= WBT_DISCARD;
flags |= WBT_TRACKED;
@@ -580,11 +640,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
__wbt_done(rqos, flags);
}
-/*
- * May sleep, if we have exceeded the writeback limits. Caller can pass
- * in an irq held spinlock, if it holds one when calling this function.
- * If we do sleep, we'll release and re-grab it.
- */
+/* May sleep, if we have exceeded the writeback limits. */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
struct rq_wb *rwb = RQWB(rqos);
@@ -640,37 +696,36 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
}
}
-void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (rqos)
- RQWB(rqos)->wc = write_cache_on;
-}
-
/*
* Enable wbt if defaults are configured that way
*/
-void wbt_enable_default(struct request_queue *q)
+void wbt_enable_default(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct rq_qos *rqos;
- bool disable_flag = q->elevator &&
- test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);
+ bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
+
+ mutex_lock(&disk->rqos_state_mutex);
+
+ if (blk_queue_disable_wbt(q))
+ enable = false;
/* Throttling already enabled? */
rqos = wbt_rq_qos(q);
if (rqos) {
- if (!disable_flag &&
- RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
+ if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
+ mutex_unlock(&disk->rqos_state_mutex);
return;
}
+ mutex_unlock(&disk->rqos_state_mutex);
/* Queue not registered? Maybe shutting down... */
if (!blk_queue_registered(q))
return;
- if (queue_is_mq(q) && !disable_flag)
- wbt_init(q);
+ if (queue_is_mq(q) && enable)
+ wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);
@@ -701,16 +756,15 @@ static int wbt_data_dir(const struct request *rq)
static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
- RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
+ RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
wbt_update_limits(RQWB(rqos));
}
static void wbt_exit(struct rq_qos *rqos)
{
struct rq_wb *rwb = RQWB(rqos);
- struct request_queue *q = rqos->q;
- blk_stat_remove_callback(q, rwb->cb);
+ blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
blk_stat_free_callback(rwb->cb);
kfree(rwb);
}
@@ -718,17 +772,19 @@ static void wbt_exit(struct rq_qos *rqos)
/*
* Disable wbt, if enabled by default.
*/
-void wbt_disable_default(struct request_queue *q)
+void wbt_disable_default(struct gendisk *disk)
{
- struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_qos *rqos = wbt_rq_qos(disk->queue);
struct rq_wb *rwb;
if (!rqos)
return;
+ mutex_lock(&disk->rqos_state_mutex);
rwb = RQWB(rqos);
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
blk_stat_deactivate(rwb->cb);
rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
+ mutex_unlock(&disk->rqos_state_mutex);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
@@ -820,7 +876,7 @@ static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
};
#endif
-static struct rq_qos_ops wbt_rqos_ops = {
+static const struct rq_qos_ops wbt_rqos_ops = {
.throttle = wbt_wait,
.issue = wbt_issue,
.track = wbt_track,
@@ -834,8 +890,9 @@ static struct rq_qos_ops wbt_rqos_ops = {
#endif
};
-int wbt_init(struct request_queue *q)
+int wbt_init(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct rq_wb *rwb;
int i;
int ret;
@@ -853,22 +910,20 @@ int wbt_init(struct request_queue *q)
for (i = 0; i < WBT_NUM_RWQ; i++)
rq_wait_init(&rwb->rq_wait[i]);
- rwb->rqos.id = RQ_QOS_WBT;
- rwb->rqos.ops = &wbt_rqos_ops;
- rwb->rqos.q = q;
rwb->last_comp = rwb->last_issue = jiffies;
rwb->win_nsec = RWB_WINDOW_NSEC;
rwb->enable_state = WBT_STATE_ON_DEFAULT;
- rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags);
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
-
- wbt_queue_depth_changed(&rwb->rqos);
+ rwb->rq_depth.queue_depth = blk_queue_depth(q);
+ wbt_update_limits(rwb);
/*
* Assign rwb and add the stats callback.
*/
- ret = rq_qos_add(q, &rwb->rqos);
+ mutex_lock(&q->rq_qos_mutex);
+ ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+ mutex_unlock(&q->rq_qos_mutex);
if (ret)
goto err_free;