Diffstat (limited to 'block/blk-iocost.c')
 -rw-r--r--   block/blk-iocost.c   473
1 file changed, 287 insertions(+), 186 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index c2d6bc88d3f1..a0416927d33d 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -111,7 +111,7 @@
  * busy signal.
  *
  * As devices can have deep queues and be unfair in how the queued commands
- * are executed, soley depending on rq wait may not result in satisfactory
+ * are executed, solely depending on rq wait may not result in satisfactory
  * control quality. For a better control quality, completion latency QoS
  * parameters can be configured so that the device is considered saturated
  * if N'th percentile completion latency rises above the set point.
@@ -178,12 +178,12 @@
 #include <linux/time64.h>
 #include <linux/parser.h>
 #include <linux/sched/signal.h>
-#include <linux/blk-cgroup.h>
 #include <asm/local.h>
 #include <asm/local64.h>
 #include "blk-rq-qos.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-cgroup.h"
 
 #ifdef CONFIG_TRACEPOINTS
@@ -232,7 +232,9 @@ enum {
        /* 1/64k is granular enough and can easily be handled w/ u32 */
        WEIGHT_ONE = 1 << 16,
+};
 
+enum {
        /*
         * As vtime is used to calculate the cost of each IO, it needs to
         * be fairly high precision. For example, it should be able to
@@ -256,6 +258,11 @@ enum {
        VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
        VRATE_CLAMP_ADJ_PCT = 4,
 
+       /* switch iff the conditions are met for longer than this */
+       AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
+};
+
+enum {
        /* if IOs end up waiting for requests, issue less */
        RQ_WAIT_BUSY_PCT = 5,
@@ -294,9 +301,6 @@ enum {
        /* don't let cmds which take a very long time pin lagging for too long */
        MAX_LAGGING_PERIODS = 10,
 
-       /* switch iff the conditions are met for longer than this */
-       AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
-
        /*
         * Count IO size in 4k pages. The 12bit shift helps keeping
         * size-proportional components of cost calculation in closer
@@ -533,8 +537,7 @@ struct ioc_gq {
 
        /* statistics */
        struct iocg_pcpu_stat __percpu *pcpu_stat;
-       struct iocg_stat local_stat;
-       struct iocg_stat desc_stat;
+       struct iocg_stat stat;
        struct iocg_stat last_stat;
        u64 last_stat_abs_vusage;
        u64 usage_delta_us;
@@ -557,7 +560,6 @@ struct ioc_now {
        u64 now_ns;
        u64 now;
        u64 vnow;
-       u64 vrate;
 };
 
 struct iocg_wait {
@@ -646,7 +648,7 @@ static const struct ioc_params autop[] = {
  * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
  * vtime credit shortage and down on device saturation.
  */
-static u32 vrate_adj_pct[] =
+static const u32 vrate_adj_pct[] =
        { 0, 0, 0, 0,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -665,17 +667,13 @@ static struct ioc *q_to_ioc(struct request_queue *q)
        return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 }
 
-static const char *q_name(struct request_queue *q)
-{
-       if (blk_queue_registered(q))
-               return kobject_name(q->kobj.parent);
-       else
-               return "<unknown>";
-}
-
 static const char __maybe_unused *ioc_name(struct ioc *ioc)
 {
-       return q_name(ioc->rqos.q);
+       struct gendisk *disk = ioc->rqos.disk;
+
+       if (!disk)
+               return "<unknown>";
+       return disk->disk_name;
 }
 
 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
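A side note on the two enum hunks above: an initializer like 10LLU * NSEC_PER_SEC does not fit in int, and newer compilers may give the whole enumeration a 64-bit underlying type, silently widening small constants such as WEIGHT_ONE that share the enum. Splitting the enums keeps the wide constant isolated. A minimal standalone sketch of the resulting layout (plain C with simplified values, not kernel code; wide enumerators rely on a common compiler extension):

    #include <stdio.h>

    enum {
            WEIGHT_ONE = 1 << 16,              /* comfortably fits in int */
    };

    enum {
            /* the 64-bit initializer lives in its own enum */
            AUTOP_CYCLE_NSEC = 10LL * 1000000000LL,
    };

    int main(void)
    {
            printf("WEIGHT_ONE=%d AUTOP_CYCLE_NSEC=%lld\n",
                   WEIGHT_ONE, (long long)AUTOP_CYCLE_NSEC);
            return 0;
    }
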
@@ -802,7 +800,11 @@ static void ioc_refresh_period_us(struct ioc *ioc)
        ioc_refresh_margins(ioc);
 }
 
-static int ioc_autop_idx(struct ioc *ioc)
+/*
+ * ioc->rqos.disk isn't initialized when this function is called from
+ * the init path.
+ */
+static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
 {
        int idx = ioc->autop_idx;
        const struct ioc_params *p = &autop[idx];
@@ -810,11 +812,11 @@ static int ioc_autop_idx(struct ioc *ioc)
        u64 now_ns;
 
        /* rotational? */
-       if (!blk_queue_nonrot(ioc->rqos.q))
+       if (!blk_queue_nonrot(disk->queue))
                return AUTOP_HDD;
 
        /* handle SATA SSDs w/ broken NCQ */
-       if (blk_queue_depth(ioc->rqos.q) == 1)
+       if (blk_queue_depth(disk->queue) == 1)
                return AUTOP_SSD_QD1;
 
        /* use one of the normal ssd sets */
@@ -827,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc)
        /* step up/down based on the vrate */
        vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
-       now_ns = ktime_get_ns();
+       now_ns = blk_time_get_ns();
 
        if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
                if (!ioc->autop_too_fast_at)
@@ -870,9 +872,14 @@ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 
        *page = *seqio = *randio = 0;
 
-       if (bps)
-               *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
-                                          DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
+       if (bps) {
+               u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
+
+               if (bps_pages)
+                       *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
+               else
+                       *page = 1;
+       }
 
        if (seqiops) {
                v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
@@ -898,21 +905,28 @@ static void ioc_refresh_lcoefs(struct ioc *ioc)
                    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 }
 
-static bool ioc_refresh_params(struct ioc *ioc, bool force)
+/*
+ * struct gendisk is required as an argument because ioc->rqos.disk
+ * is not properly initialized when called from the init path.
+ */
+static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
+                                   struct gendisk *disk)
 {
        const struct ioc_params *p;
        int idx;
 
        lockdep_assert_held(&ioc->lock);
 
-       idx = ioc_autop_idx(ioc);
+       idx = ioc_autop_idx(ioc, disk);
        p = &autop[idx];
 
        if (idx == ioc->autop_idx && !force)
                return false;
 
-       if (idx != ioc->autop_idx)
+       if (idx != ioc->autop_idx) {
                atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+               ioc->vtime_base_rate = VTIME_PER_USEC;
+       }
 
        ioc->autop_idx = idx;
        ioc->autop_too_fast_at = 0;
@@ -928,12 +942,17 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
 
        ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
                                            VTIME_PER_USEC, MILLION);
-       ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
-                                  VTIME_PER_USEC, MILLION);
+       ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
+                                           VTIME_PER_USEC, MILLION);
 
        return true;
 }
 
+static bool ioc_refresh_params(struct ioc *ioc, bool force)
+{
+       return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
+}
+
 /*
  * When an iocg accumulates too much vtime or gets deactivated, we throw away
  * some vtime, which lowers the overall device utilization. As the exact amount
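The bps_pages guard in calc_lcoefs() above closes an overflow hole: round-up division adds (divisor - 1) before dividing, so a bps value near U64_MAX wraps around and the page-count divisor becomes 0. A standalone sketch that reproduces the wrap (plain C, illustrative values):

    #include <stdint.h>
    #include <stdio.h>

    #define IOC_PAGE_SIZE 4096ULL

    static uint64_t div_round_up_u64(uint64_t n, uint64_t d)
    {
            return (n + d - 1) / d;   /* wraps if n > UINT64_MAX - (d - 1) */
    }

    int main(void)
    {
            uint64_t bps = UINT64_MAX;
            uint64_t pages = div_round_up_u64(bps, IOC_PAGE_SIZE);

            /* pages wrapped to 0 here; dividing by it would crash,
               hence the "if (bps_pages) ... else *page = 1" guard */
            printf("pages = %llu\n", (unsigned long long)pages);
            return 0;
    }
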
@@ -980,7 +999,7 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
 
        if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
                if (ioc->busy_level != prev_busy_level || nr_lagging)
-                       trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+                       trace_iocost_ioc_vrate_adj(ioc, vrate,
                                                   missed_ppm, rq_wait_pct,
                                                   nr_lagging, nr_shortages);
 
@@ -1023,10 +1042,11 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
 {
        unsigned seq;
+       u64 vrate;
 
-       now->now_ns = ktime_get();
+       now->now_ns = blk_time_get_ns();
        now->now = ktime_to_us(now->now_ns);
-       now->vrate = atomic64_read(&ioc->vtime_rate);
+       vrate = atomic64_read(&ioc->vtime_rate);
 
        /*
         * The current vtime is
@@ -1039,7 +1059,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
        do {
                seq = read_seqcount_begin(&ioc->period_seqcount);
                now->vnow = ioc->period_at_vtime +
-                       (now->now - ioc->period_at) * now->vrate;
+                       (now->now - ioc->period_at) * vrate;
        } while (read_seqcount_retry(&ioc->period_seqcount, seq));
 }
 
@@ -1078,7 +1098,14 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
                inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
                                           iocg->child_active_sum);
        } else {
-               inuse = clamp_t(u32, inuse, 1, active);
+               /*
+                * It may be tempting to turn this into a clamp expression with
+                * a lower limit of 1 but active may be 0, which cannot be used
+                * as an upper limit in that situation. This expression allows
+                * active to clamp inuse unless it is 0, in which case inuse
+                * becomes 1.
+                */
+               inuse = min(inuse, active) ?: 1;
        }
 
        iocg->last_inuse = iocg->inuse;
@@ -1241,7 +1268,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
 {
        struct ioc *ioc = iocg->ioc;
-       u64 last_period, cur_period;
+       u64 __maybe_unused last_period, cur_period;
        u64 vtime, vtarget;
        int i;
 
@@ -1327,16 +1354,24 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
 {
        struct ioc *ioc = iocg->ioc;
        struct blkcg_gq *blkg = iocg_to_blkg(iocg);
-       u64 tdelta, delay, new_delay;
+       u64 tdelta, delay, new_delay, shift;
        s64 vover, vover_pct;
        u32 hwa;
 
        lockdep_assert_held(&iocg->waitq.lock);
 
+       /*
+        * If the delay is set by another CPU, we may be in the past. No need to
+        * change anything if so. This avoids decay calculation underflow.
+        */
+       if (time_before64(now->now, iocg->delay_at))
+               return false;
+
        /* calculate the current delay in effect - 1/2 every second */
        tdelta = now->now - iocg->delay_at;
-       if (iocg->delay)
-               delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
+       shift = div64_u64(tdelta, USEC_PER_SEC);
+       if (iocg->delay && shift < BITS_PER_LONG)
+               delay = iocg->delay >> shift;
        else
                delay = 0;
 
@@ -1371,7 +1406,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
                return true;
        } else {
                if (iocg->indelay_since) {
-                       iocg->local_stat.indelay_us += now->now - iocg->indelay_since;
+                       iocg->stat.indelay_us += now->now - iocg->indelay_since;
                        iocg->indelay_since = 0;
                }
                iocg->delay = 0;
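The two new checks in iocg_kick_delay() above guard the same expression from both sides: now->now can lag delay_at when another CPU set it (the subtraction would underflow), and a shift count of BITS_PER_LONG or more is undefined behavior in C. A standalone sketch of the halve-per-second decay with both guards (illustrative names and microsecond units, not kernel API):

    #include <stdint.h>
    #include <stdio.h>

    #define USEC_PER_SEC 1000000ULL

    static uint64_t decayed_delay(uint64_t delay, uint64_t delay_at,
                                  uint64_t now)
    {
            uint64_t shift;

            if (now < delay_at)       /* set "in the future" by another CPU:
                                         bail out instead of underflowing */
                    return delay;

            shift = (now - delay_at) / USEC_PER_SEC;
            if (shift >= 64)          /* oversized shifts are UB in C */
                    return 0;
            return delay >> shift;    /* halves once per elapsed second */
    }

    int main(void)
    {
            printf("%llu\n", (unsigned long long)
                   decayed_delay(4000, 0, 2 * USEC_PER_SEC)); /* prints 1000 */
            return 0;
    }
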
@@ -1411,15 +1446,18 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
        lockdep_assert_held(&iocg->ioc->lock);
        lockdep_assert_held(&iocg->waitq.lock);
 
-       /* make sure that nobody messed with @iocg */
-       WARN_ON_ONCE(list_empty(&iocg->active_list));
+       /*
+        * make sure that nobody messed with @iocg. Check iocg->pd.online
+        * to avoid warn when removing blkcg or disk.
+        */
+       WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online);
        WARN_ON_ONCE(iocg->inuse > 1);
 
        iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
 
        /* if debt is paid in full, restore inuse */
        if (!iocg->abs_vdebt) {
-               iocg->local_stat.indebt_us += now->now - iocg->indebt_since;
+               iocg->stat.indebt_us += now->now - iocg->indebt_since;
                iocg->indebt_since = 0;
 
                propagate_weights(iocg, iocg->active, iocg->last_inuse,
@@ -1431,7 +1469,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
                        int flags, void *key)
 {
        struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
-       struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
+       struct iocg_wake_ctx *ctx = key;
        u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
 
        ctx->vbudget -= cost;
@@ -1440,16 +1478,17 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
                return -1;
 
        iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
+       wait->committed = true;
 
        /*
         * autoremove_wake_function() removes the wait entry only when it
-        * actually changed the task state. We want the wait always
-        * removed. Remove explicitly and use default_wake_function().
+        * actually changed the task state. We want the wait always removed.
+        * Remove explicitly and use default_wake_function(). Note that the
+        * order of operations is important as finish_wait() tests whether
+        * @wq_entry is removed without grabbing the lock.
         */
-       list_del_init(&wq_entry->entry);
-       wait->committed = true;
-       default_wake_function(wq_entry, mode, flags, key);
+       list_del_init_careful(&wq_entry->entry);
 
        return 0;
 }
@@ -1512,7 +1551,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
 
        if (!waitqueue_active(&iocg->waitq)) {
                if (iocg->wait_since) {
-                       iocg->local_stat.wait_us += now->now - iocg->wait_since;
+                       iocg->stat.wait_us += now->now - iocg->wait_since;
                        iocg->wait_since = 0;
                }
                return;
@@ -1640,11 +1679,30 @@ static void iocg_build_inner_walk(struct ioc_gq *iocg,
        }
 }
 
+/* propagate the deltas to the parent */
+static void iocg_flush_stat_upward(struct ioc_gq *iocg)
+{
+       if (iocg->level > 0) {
+               struct iocg_stat *parent_stat =
+                       &iocg->ancestors[iocg->level - 1]->stat;
+
+               parent_stat->usage_us +=
+                       iocg->stat.usage_us - iocg->last_stat.usage_us;
+               parent_stat->wait_us +=
+                       iocg->stat.wait_us - iocg->last_stat.wait_us;
+               parent_stat->indebt_us +=
+                       iocg->stat.indebt_us - iocg->last_stat.indebt_us;
+               parent_stat->indelay_us +=
+                       iocg->stat.indelay_us - iocg->last_stat.indelay_us;
+       }
+
+       iocg->last_stat = iocg->stat;
+}
+
 /* collect per-cpu counters and propagate the deltas to the parent */
-static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
+static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
 {
        struct ioc *ioc = iocg->ioc;
-       struct iocg_stat new_stat;
        u64 abs_vusage = 0;
        u64 vusage_delta;
        int cpu;
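iocg_flush_stat_upward() above uses a snapshot-and-delta pattern: each node remembers the totals it last reported (last_stat) and forwards only the difference to its parent, so flushing the same node repeatedly never double counts. A minimal standalone sketch with one counter instead of four (plain C, illustrative):

    #include <stdint.h>
    #include <stdio.h>

    struct stat_node {
            uint64_t usage_us;        /* running total, including children */
            uint64_t last_usage_us;   /* snapshot taken at the last flush */
            struct stat_node *parent;
    };

    static void flush_upward(struct stat_node *n)
    {
            if (n->parent)
                    n->parent->usage_us += n->usage_us - n->last_usage_us;
            n->last_usage_us = n->usage_us;
    }

    int main(void)
    {
            struct stat_node root = { 0 };
            struct stat_node leaf = { .parent = &root };

            leaf.usage_us += 100;
            flush_upward(&leaf);
            flush_upward(&leaf);      /* second flush propagates nothing new */
            printf("root = %llu\n", (unsigned long long)root.usage_us); /* 100 */
            return 0;
    }
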
@@ -1660,34 +1718,9 @@
        iocg->last_stat_abs_vusage = abs_vusage;
 
        iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
-       iocg->local_stat.usage_us += iocg->usage_delta_us;
-
-       /* propagate upwards */
-       new_stat.usage_us =
-               iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
-       new_stat.wait_us =
-               iocg->local_stat.wait_us + iocg->desc_stat.wait_us;
-       new_stat.indebt_us =
-               iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us;
-       new_stat.indelay_us =
-               iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us;
-
-       /* propagate the deltas to the parent */
-       if (iocg->level > 0) {
-               struct iocg_stat *parent_stat =
-                       &iocg->ancestors[iocg->level - 1]->desc_stat;
-
-               parent_stat->usage_us +=
-                       new_stat.usage_us - iocg->last_stat.usage_us;
-               parent_stat->wait_us +=
-                       new_stat.wait_us - iocg->last_stat.wait_us;
-               parent_stat->indebt_us +=
-                       new_stat.indebt_us - iocg->last_stat.indebt_us;
-               parent_stat->indelay_us +=
-                       new_stat.indelay_us - iocg->last_stat.indelay_us;
-       }
+       iocg->stat.usage_us += iocg->usage_delta_us;
 
-       iocg->last_stat = new_stat;
+       iocg_flush_stat_upward(iocg);
 }
 
 /* get stat counters ready for reading on all active iocgs */
@@ -1698,13 +1731,13 @@ static void iocg_flush_stat(struct list_head *target_iocgs,
 
        /* flush leaves and build inner node walk list */
        list_for_each_entry(iocg, target_iocgs, active_list) {
-               iocg_flush_stat_one(iocg, now);
+               iocg_flush_stat_leaf(iocg, now);
                iocg_build_inner_walk(iocg, &inner_walk);
        }
 
        /* keep flushing upwards by walking the inner list backwards */
        list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
-               iocg_flush_stat_one(iocg, now);
+               iocg_flush_stat_upward(iocg);
                list_del_init(&iocg->walk_list);
        }
 }
@@ -2050,7 +2083,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
                              struct ioc_now *now)
 {
        struct ioc_gq *iocg;
-       u64 dur, usage_pct, nr_cycles;
+       u64 dur, usage_pct, nr_cycles, nr_cycles_shift;
 
        /* if no debtor, reset the cycle */
        if (!nr_debtors) {
@@ -2112,10 +2145,12 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
                old_debt = iocg->abs_vdebt;
                old_delay = iocg->delay;
 
+               nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
                if (iocg->abs_vdebt)
-                       iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
+                       iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
+
                if (iocg->delay)
-                       iocg->delay = iocg->delay >> nr_cycles ?: 1;
+                       iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;
 
                iocg_kick_waitq(iocg, true, now);
@@ -2151,16 +2186,16 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
 
                /* flush wait and indebt stat deltas */
                if (iocg->wait_since) {
-                       iocg->local_stat.wait_us += now->now - iocg->wait_since;
+                       iocg->stat.wait_us += now->now - iocg->wait_since;
                        iocg->wait_since = now->now;
                }
                if (iocg->indebt_since) {
-                       iocg->local_stat.indebt_us +=
+                       iocg->stat.indebt_us +=
                                now->now - iocg->indebt_since;
                        iocg->indebt_since = now->now;
                }
                if (iocg->indelay_since) {
-                       iocg->local_stat.indelay_us +=
+                       iocg->stat.indelay_us +=
                                now->now - iocg->indelay_since;
                        iocg->indelay_since = now->now;
                }
@@ -2213,8 +2248,8 @@ static void ioc_timer_fn(struct timer_list *timer)
        LIST_HEAD(surpluses);
        int nr_debtors, nr_shortages = 0, nr_lagging = 0;
        u64 usage_us_sum = 0;
-       u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
-       u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+       u32 ppm_rthr;
+       u32 ppm_wthr;
        u32 missed_ppm[2], rq_wait_pct;
        u64 period_vtime;
        int prev_busy_level;
@@ -2225,6 +2260,8 @@ static void ioc_timer_fn(struct timer_list *timer)
        /* take care of active iocgs */
        spin_lock_irq(&ioc->lock);
 
+       ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+       ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
        ioc_now(ioc, &now);
 
        period_vtime = now.vnow - ioc->period_at_vtime;
@@ -2297,10 +2334,8 @@ static void ioc_timer_fn(struct timer_list *timer)
                else
                        usage_dur = max_t(u64, now.now - ioc->period_at, 1);
 
-               usage = clamp_t(u32,
-                               DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
-                                                  usage_dur),
-                               1, WEIGHT_ONE);
+               usage = clamp(DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur),
+                             1, WEIGHT_ONE);
 
                /*
                 * Already donating or accumulated enough to start.
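The nr_cycles_shift clamp in ioc_forgive_debts() above has the same motivation as the iocg_kick_delay() fix: a right shift by BITS_PER_LONG or more is undefined. The `x >> shift ?: 1` idiom (a GCC extension) additionally floors the result at 1 so forgiveness alone never zeroes a debtor's state. A standalone sketch (plain C, illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define BITS_PER_LONG_SK 64

    static uint64_t forgive(uint64_t debt, uint64_t nr_cycles)
    {
            uint64_t shift = nr_cycles < BITS_PER_LONG_SK - 1 ?
                             nr_cycles : BITS_PER_LONG_SK - 1;
            uint64_t v = debt >> shift;   /* halve once per cycle */

            return v ? v : 1;             /* the kernel spells this "v ?: 1" */
    }

    int main(void)
    {
            printf("%llu\n", (unsigned long long)forgive(1000, 3));   /* 125 */
            printf("%llu\n", (unsigned long long)forgive(1000, 200)); /* 1, no UB */
            return 0;
    }
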
@@ -2310,11 +2345,28 @@ static void ioc_timer_fn(struct timer_list *timer)
                        hwm = current_hweight_max(iocg);
                        new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
                                                         usage, &now);
-                       if (new_hwi < hwm) {
+                       /*
+                        * Donation calculation assumes hweight_after_donation
+                        * to be positive, a condition that a donor w/ hwa < 2
+                        * can't meet. Don't bother with donation if hwa is
+                        * below 2. It's not gonna make a meaningful difference
+                        * anyway.
+                        */
+                       if (new_hwi < hwm && hwa >= 2) {
                                iocg->hweight_donating = hwa;
                                iocg->hweight_after_donation = new_hwi;
                                list_add(&iocg->surplus_list, &surpluses);
-                       } else {
+                       } else if (!iocg->abs_vdebt) {
+                               /*
+                                * @iocg doesn't have enough to donate. Reset
+                                * its inuse to active.
+                                *
+                                * Don't reset debtors as their inuse's are
+                                * owned by debt handling. This shouldn't affect
+                                * donation calculuation in any meaningful way
+                                * as @iocg doesn't have a meaningful amount of
+                                * share anyway.
+                                */
                                TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
                                                iocg->inuse, iocg->active,
                                                iocg->hweight_inuse, new_hwi);
@@ -2421,6 +2473,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
        u32 hwi, adj_step;
        s64 margin;
        u64 cost, new_inuse;
+       unsigned long flags;
 
        current_hweight(iocg, NULL, &hwi);
        old_hwi = hwi;
@@ -2439,11 +2492,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
            iocg->inuse == iocg->active)
                return cost;
 
-       spin_lock_irq(&ioc->lock);
+       spin_lock_irqsave(&ioc->lock, flags);
 
        /* we own inuse only when @iocg is in the normal active state */
        if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
-               spin_unlock_irq(&ioc->lock);
+               spin_unlock_irqrestore(&ioc->lock, flags);
                return cost;
        }
@@ -2464,7 +2517,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
        } while (time_after64(vtime + cost, now->vnow) &&
                 iocg->inuse != iocg->active);
 
-       spin_unlock_irq(&ioc->lock);
+       spin_unlock_irqrestore(&ioc->lock, flags);
 
        TRACE_IOCG_PATH(inuse_adjust, iocg, now,
                        old_inuse, iocg->inuse, old_hwi, hwi);
@@ -2481,6 +2534,10 @@ static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
        u64 seek_pages = 0;
        u64 cost = 0;
 
+       /* Can't calculate cost for empty bio */
+       if (!bio->bi_iter.bi_size)
+               goto out;
+
        switch (bio_op(bio)) {
        case REQ_OP_READ:
                coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
@@ -2629,7 +2686,7 @@ retry_lock:
        if (use_debt) {
                iocg_incur_debt(iocg, abs_cost, &now);
                if (iocg_kick_delay(iocg, &now))
-                       blkcg_schedule_throttle(rqos->q,
+                       blkcg_schedule_throttle(rqos->disk,
                                                (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
                iocg_unlock(iocg, ioc_locked, &flags);
                return;
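In the adjust_inuse_and_calc_cost() hunks above, a loop grows the group's inuse share in steps until the bio's scaled cost fits the remaining vtime budget; the cost scales inversely with the share. A much-simplified standalone sketch of that loop (flat percent shares instead of hierarchical hweights, illustrative values only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t abs_cost = 800;    /* device-absolute cost of the bio */
            uint64_t budget = 2000;     /* vtime available until now->vnow */
            uint32_t inuse = 10, active = 100, adj_step = 10;
            uint64_t cost;

            do {
                    inuse = inuse + adj_step < active ?
                            inuse + adj_step : active;
                    /* a larger share makes the same absolute cost consume
                       proportionally less of the group's vtime budget */
                    cost = abs_cost * active / inuse;
            } while (cost > budget && inuse != active);

            printf("inuse=%u cost=%llu\n", inuse, (unsigned long long)cost);
            return 0;   /* prints inuse=40 cost=2000 with these values */
    }
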
@@ -2659,8 +2716,7 @@ retry_lock:
         * All waiters are on iocg->waitq and the wait states are
         * synchronized using waitq.lock.
         */
-       init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
-       wait.wait.private = current;
+       init_wait_func(&wait.wait, iocg_wake_fn);
        wait.bio = bio;
        wait.abs_cost = abs_cost;
        wait.committed = false; /* will be set true by waker */
@@ -2730,7 +2786,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
        if (likely(!list_empty(&iocg->active_list))) {
                iocg_incur_debt(iocg, abs_cost, &now);
                if (iocg_kick_delay(iocg, &now))
-                       blkcg_schedule_throttle(rqos->q,
+                       blkcg_schedule_throttle(rqos->disk,
                                                (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
        } else {
                iocg_commit_bio(iocg, bio, abs_cost, cost);
        }
@@ -2758,7 +2814,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
        if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
                return;
 
-       switch (req_op(rq) & REQ_OP_MASK) {
+       switch (req_op(rq)) {
        case REQ_OP_READ:
                pidx = QOS_RLAT;
                rw = READ;
@@ -2771,7 +2827,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
                return;
        }
 
-       on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
+       on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
        rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
        size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
@@ -2801,18 +2857,18 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
 {
        struct ioc *ioc = rqos_to_ioc(rqos);
 
-       blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
+       blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
 
        spin_lock_irq(&ioc->lock);
        ioc->running = IOC_STOP;
        spin_unlock_irq(&ioc->lock);
 
-       del_timer_sync(&ioc->timer);
+       timer_shutdown_sync(&ioc->timer);
        free_percpu(ioc->pcpu_stat);
        kfree(ioc);
 }
 
-static struct rq_qos_ops ioc_rqos_ops = {
+static const struct rq_qos_ops ioc_rqos_ops = {
        .throttle = ioc_rqos_throttle,
        .merge = ioc_rqos_merge,
        .done_bio = ioc_rqos_done_bio,
@@ -2821,10 +2877,9 @@ static struct rq_qos_ops ioc_rqos_ops = {
        .exit = ioc_rqos_exit,
 };
 
-static int blk_iocost_init(struct request_queue *q)
+static int blk_iocost_init(struct gendisk *disk)
 {
        struct ioc *ioc;
-       struct rq_qos *rqos;
        int i, cpu, ret;
@@ -2847,11 +2902,6 @@ static int blk_iocost_init(struct request_queue *q)
                local64_set(&ccs->rq_wait_ns, 0);
        }
 
-       rqos = &ioc->rqos;
-       rqos->id = RQ_QOS_COST;
-       rqos->ops = &ioc_rqos_ops;
-       rqos->q = q;
-
        spin_lock_init(&ioc->lock);
        timer_setup(&ioc->timer, ioc_timer_fn, 0);
        INIT_LIST_HEAD(&ioc->active_iocgs);
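On the del_timer_sync() to timer_shutdown_sync() change in ioc_rqos_exit() above: as I understand the shutdown variant, it not only waits for a running callback but also marks the timer so later re-arm attempts become no-ops, which matters when ioc_timer_fn could otherwise be re-queued during teardown. A toy model of the distinction (plain C flags, not the kernel timer API):

    #include <stdbool.h>
    #include <stdio.h>

    struct sk_timer {
            bool pending;
            bool shutdown;
    };

    static bool sk_timer_arm(struct sk_timer *t)
    {
            if (t->shutdown)
                    return false;  /* re-arm refused after shutdown */
            t->pending = true;
            return true;
    }

    static void sk_timer_shutdown(struct sk_timer *t)
    {
            t->pending = false;    /* a plain delete only stops it once... */
            t->shutdown = true;    /* ...shutdown also forbids re-arming */
    }

    int main(void)
    {
            struct sk_timer t = { 0 };

            sk_timer_arm(&t);
            sk_timer_shutdown(&t);
            printf("re-arm allowed: %d\n", sk_timer_arm(&t)); /* prints 0 */
            return 0;
    }
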
@@ -2860,30 +2910,36 @@ static int blk_iocost_init(struct request_queue *q)
        ioc->vtime_base_rate = VTIME_PER_USEC;
        atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
        seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
-       ioc->period_at = ktime_to_us(ktime_get());
+       ioc->period_at = ktime_to_us(blk_time_get());
        atomic64_set(&ioc->cur_period, 0);
        atomic_set(&ioc->hweight_gen, 0);
 
        spin_lock_irq(&ioc->lock);
        ioc->autop_idx = AUTOP_INVALID;
-       ioc_refresh_params(ioc, true);
+       ioc_refresh_params_disk(ioc, true, disk);
        spin_unlock_irq(&ioc->lock);
 
        /*
-        * rqos must be added before activation to allow iocg_pd_init() to
+        * rqos must be added before activation to allow ioc_pd_init() to
         * lookup the ioc from q. This means that the rqos methods may get
         * called before policy activation completion, can't assume that the
         * target bio has an iocg associated and need to test for NULL iocg.
         */
-       rq_qos_add(q, rqos);
-       ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
-       if (ret) {
-               rq_qos_del(q, rqos);
-               free_percpu(ioc->pcpu_stat);
-               kfree(ioc);
-               return ret;
-       }
+       ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+       if (ret)
+               goto err_free_ioc;
+
+       ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
+       if (ret)
+               goto err_del_qos;
 
        return 0;
+
+err_del_qos:
+       rq_qos_del(&ioc->rqos);
+err_free_ioc:
+       free_percpu(ioc->pcpu_stat);
+       kfree(ioc);
+       return ret;
 }
 
 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
@@ -2903,13 +2959,14 @@ static void ioc_cpd_free(struct blkcg_policy_data *cpd)
        kfree(container_of(cpd, struct ioc_cgrp, cpd));
 }
 
-static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
-                                            struct blkcg *blkcg)
+static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
+                                            struct blkcg *blkcg, gfp_t gfp)
 {
        int levels = blkcg->css.cgroup->level + 1;
        struct ioc_gq *iocg;
 
-       iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
+       iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
+                           disk->node_id);
        if (!iocg)
                return NULL;
@@ -2944,8 +3001,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
        iocg->hweight_inuse = WEIGHT_ONE;
 
        init_waitqueue_head(&iocg->waitq);
-       hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-       iocg->waitq_timer.function = iocg_waitq_timer_fn;
+       hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 
        iocg->level = blkg->blkcg->css.cgroup->level;
@@ -2987,34 +3043,28 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
        kfree(iocg);
 }
 
-static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
+static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
 {
        struct ioc_gq *iocg = pd_to_iocg(pd);
        struct ioc *ioc = iocg->ioc;
-       size_t pos = 0;
 
        if (!ioc->enabled)
-               return 0;
+               return;
 
        if (iocg->level == 0) {
                unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
                        ioc->vtime_base_rate * 10000,
                        VTIME_PER_USEC);
-               pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
-                                vp10k / 100, vp10k % 100);
+               seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
        }
 
-       pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
-                        iocg->last_stat.usage_us);
+       seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
 
        if (blkcg_debug_stats)
-               pos += scnprintf(buf + pos, size - pos,
-                                " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
-                                iocg->last_stat.wait_us,
-                                iocg->last_stat.indebt_us,
-                                iocg->last_stat.indelay_us);
-
-       return pos;
+               seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
+                          iocg->last_stat.wait_us,
+                          iocg->last_stat.indebt_us,
+                          iocg->last_stat.indelay_us);
 }
 
 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
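blk_iocost_init() above now unwinds with the usual kernel goto-label ladder: each failure jumps to the label that undoes exactly the steps that already succeeded, in reverse order. A standalone sketch of the shape (malloc stands in for the hypothetical setup steps):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int setup(bool fail_activation)
    {
            char *qos, *policy;

            qos = malloc(16);                /* stands in for rq_qos_add() */
            if (!qos)
                    goto err;
            policy = fail_activation ? NULL : malloc(16); /* activation step */
            if (!policy)
                    goto err_del_qos;

            free(policy);
            free(qos);
            return 0;

    err_del_qos:
            free(qos);                       /* undo only what succeeded */
    err:
            return -1;
    }

    int main(void)
    {
            printf("ok=%d failed=%d\n", setup(false), setup(true));
            return 0;
    }
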
@@ -3060,26 +3110,28 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
        if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
                return -EINVAL;
 
-       spin_lock(&blkcg->lock);
+       spin_lock_irq(&blkcg->lock);
        iocc->dfl_weight = v * WEIGHT_ONE;
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                struct ioc_gq *iocg = blkg_to_iocg(blkg);
 
                if (iocg) {
-                       spin_lock_irq(&iocg->ioc->lock);
+                       spin_lock(&iocg->ioc->lock);
                        ioc_now(iocg->ioc, &now);
                        weight_updated(iocg, &now);
-                       spin_unlock_irq(&iocg->ioc->lock);
+                       spin_unlock(&iocg->ioc->lock);
                }
        }
-       spin_unlock(&blkcg->lock);
+       spin_unlock_irq(&blkcg->lock);
 
        return nbytes;
        }
 
-       ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
+       blkg_conf_init(&ctx, buf);
+
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
        if (ret)
-               return ret;
+               goto err;
 
        iocg = blkg_to_iocg(ctx.blkg);
@@ -3098,12 +3150,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
        weight_updated(iocg, &now);
        spin_unlock(&iocg->ioc->lock);
 
-       blkg_conf_finish(&ctx);
+       blkg_conf_exit(&ctx);
        return nbytes;
 
 einval:
-       blkg_conf_finish(&ctx);
-       return -EINVAL;
+       ret = -EINVAL;
+err:
+       blkg_conf_exit(&ctx);
+       return ret;
 }
@@ -3115,6 +3169,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
        if (!dname)
                return 0;
 
+       spin_lock(&ioc->lock);
        seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
                   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
                   ioc->params.qos[QOS_RPPM] / 10000,
@@ -3127,6 +3182,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
                   ioc->params.qos[QOS_MIN] % 10000 / 100,
                   ioc->params.qos[QOS_MAX] / 10000,
                   ioc->params.qos[QOS_MAX] % 10000 / 100);
+       spin_unlock(&ioc->lock);
        return 0;
 }
@@ -3158,32 +3214,46 @@ static const match_table_t qos_tokens = {
 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
                             size_t nbytes, loff_t off)
 {
-       struct block_device *bdev;
+       struct blkg_conf_ctx ctx;
+       struct gendisk *disk;
        struct ioc *ioc;
        u32 qos[NR_QOS_PARAMS];
        bool enable, user;
-       char *p;
+       char *body, *p;
+       unsigned long memflags;
        int ret;
 
-       bdev = blkcg_conf_open_bdev(&input);
-       if (IS_ERR(bdev))
-               return PTR_ERR(bdev);
+       blkg_conf_init(&ctx, input);
+
+       memflags = blkg_conf_open_bdev_frozen(&ctx);
+       if (IS_ERR_VALUE(memflags)) {
+               ret = memflags;
+               goto err;
+       }
+
+       body = ctx.body;
+       disk = ctx.bdev->bd_disk;
+       if (!queue_is_mq(disk->queue)) {
+               ret = -EOPNOTSUPP;
+               goto err;
+       }
 
-       ioc = q_to_ioc(bdev->bd_disk->queue);
+       ioc = q_to_ioc(disk->queue);
        if (!ioc) {
-               ret = blk_iocost_init(bdev->bd_disk->queue);
+               ret = blk_iocost_init(disk);
                if (ret)
                        goto err;
-               ioc = q_to_ioc(bdev->bd_disk->queue);
+               ioc = q_to_ioc(disk->queue);
        }
 
+       blk_mq_quiesce_queue(disk->queue);
+
        spin_lock_irq(&ioc->lock);
        memcpy(qos, ioc->params.qos, sizeof(qos));
        enable = ioc->enabled;
        user = ioc->user_qos_params;
-       spin_unlock_irq(&ioc->lock);
 
-       while ((p = strsep(&input, " \t\n"))) {
+       while ((p = strsep(&body, " \t\n"))) {
                substring_t args[MAX_OPT_ARGS];
                char buf[32];
                int tok;
@@ -3194,7 +3264,8 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 
                switch (match_token(p, qos_ctrl_tokens, args)) {
                case QOS_ENABLE:
-                       match_u64(&args[0], &v);
+                       if (match_u64(&args[0], &v))
+                               goto einval;
                        enable = v;
                        continue;
                case QOS_CTRL:
@@ -3248,14 +3319,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
        if (qos[QOS_MIN] > qos[QOS_MAX])
                goto einval;
 
-       spin_lock_irq(&ioc->lock);
-
-       if (enable) {
-               blk_stat_enable_accounting(ioc->rqos.q);
-               blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+       if (enable && !ioc->enabled) {
+               blk_stat_enable_accounting(disk->queue);
+               blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
                ioc->enabled = true;
-       } else {
-               blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+       } else if (!enable && ioc->enabled) {
+               blk_stat_disable_accounting(disk->queue);
+               blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
                ioc->enabled = false;
        }
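The qos and cost.model writers above tokenize their input with strsep() and now reject bad values (note the new match_u64() return check, which turns a silently ignored parse failure into -EINVAL). A standalone userspace sketch of the same split-then-parse shape (illustrative, not the kernel's match_token() machinery):

    #define _DEFAULT_SOURCE
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char input[] = "enable=1 ctrl=user min=25";
            char *body = input, *p;

            while ((p = strsep(&body, " \t\n"))) {
                    unsigned long long v;
                    char key[16];

                    if (!*p)
                            continue;  /* skip empty tokens */
                    /* check the parse result instead of ignoring it,
                       mirroring the match_u64() fix above */
                    if (sscanf(p, "%15[^=]=%llu", key, &v) == 2)
                            printf("%s -> %llu\n", key, v);
                    else
                            printf("%s -> (non-numeric)\n", p);
            }
            return 0;
    }
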
@@ -3269,12 +3339,21 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
        ioc_refresh_params(ioc, true);
        spin_unlock_irq(&ioc->lock);
 
-       blkdev_put_no_open(bdev);
+       if (enable)
+               wbt_disable_default(disk);
+       else
+               wbt_enable_default(disk);
+
+       blk_mq_unquiesce_queue(disk->queue);
+
+       blkg_conf_exit_frozen(&ctx, memflags);
        return nbytes;
 
 einval:
+       spin_unlock_irq(&ioc->lock);
+       blk_mq_unquiesce_queue(disk->queue);
        ret = -EINVAL;
 err:
-       blkdev_put_no_open(bdev);
+       blkg_conf_exit_frozen(&ctx, memflags);
        return ret;
 }
@@ -3288,12 +3367,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf,
        if (!dname)
                return 0;
 
+       spin_lock(&ioc->lock);
        seq_printf(sf, "%s ctrl=%s model=linear "
                   "rbps=%llu rseqiops=%llu rrandiops=%llu "
                   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
                   dname, ioc->user_cost_model ? "user" : "auto",
                   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
                   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
+       spin_unlock(&ioc->lock);
        return 0;
 }
@@ -3325,31 +3406,44 @@ static const match_table_t i_lcoef_tokens = {
 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
                                    size_t nbytes, loff_t off)
 {
-       struct block_device *bdev;
+       struct blkg_conf_ctx ctx;
+       struct request_queue *q;
+       unsigned int memflags;
        struct ioc *ioc;
        u64 u[NR_I_LCOEFS];
        bool user;
-       char *p;
+       char *body, *p;
        int ret;
 
-       bdev = blkcg_conf_open_bdev(&input);
-       if (IS_ERR(bdev))
-               return PTR_ERR(bdev);
+       blkg_conf_init(&ctx, input);
+
+       ret = blkg_conf_open_bdev(&ctx);
+       if (ret)
+               goto err;
+
+       body = ctx.body;
+       q = bdev_get_queue(ctx.bdev);
+       if (!queue_is_mq(q)) {
+               ret = -EOPNOTSUPP;
+               goto err;
+       }
 
-       ioc = q_to_ioc(bdev->bd_disk->queue);
+       ioc = q_to_ioc(q);
        if (!ioc) {
-               ret = blk_iocost_init(bdev->bd_disk->queue);
+               ret = blk_iocost_init(ctx.bdev->bd_disk);
                if (ret)
                        goto err;
-               ioc = q_to_ioc(bdev->bd_disk->queue);
+               ioc = q_to_ioc(q);
        }
 
+       memflags = blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
+
        spin_lock_irq(&ioc->lock);
        memcpy(u, ioc->params.i_lcoefs, sizeof(u));
        user = ioc->user_cost_model;
-       spin_unlock_irq(&ioc->lock);
 
-       while ((p = strsep(&input, " \t\n"))) {
+       while ((p = strsep(&body, " \t\n"))) {
                substring_t args[MAX_OPT_ARGS];
                char buf[32];
                int tok;
@@ -3384,7 +3478,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
                user = true;
        }
 
-       spin_lock_irq(&ioc->lock);
        if (user) {
                memcpy(ioc->params.i_lcoefs, u, sizeof(u));
                ioc->user_cost_model = true;
@@ -3394,13 +3487,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
        ioc_refresh_params(ioc, true);
        spin_unlock_irq(&ioc->lock);
 
-       blkdev_put_no_open(bdev);
+       blk_mq_unquiesce_queue(q);
+       blk_mq_unfreeze_queue(q, memflags);
+
+       blkg_conf_exit(&ctx);
        return nbytes;
 
 einval:
+       spin_unlock_irq(&ioc->lock);
+
+       blk_mq_unquiesce_queue(q);
+       blk_mq_unfreeze_queue(q, memflags);
+
        ret = -EINVAL;
 err:
-       blkdev_put_no_open(bdev);
+       blkg_conf_exit(&ctx);
        return ret;
 }
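The writers above now freeze and/or quiesce the queue so no request is in flight while qos parameters or cost coefficients change, and the prfill readers take ioc->lock so they cannot observe a half-written parameter set. A loose userspace analogy of that exclude-readers-while-updating discipline (a pthread rwlock standing in for freeze/quiesce plus ioc->lock, illustrative only; build with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t cfg_lock = PTHREAD_RWLOCK_INITIALIZER;
    static unsigned long long cfg_rbps = 125000000ULL;

    static unsigned long long read_cfg(void)
    {
            unsigned long long v;

            pthread_rwlock_rdlock(&cfg_lock);  /* like an in-flight request */
            v = cfg_rbps;
            pthread_rwlock_unlock(&cfg_lock);
            return v;
    }

    static void update_cfg(unsigned long long v)
    {
            pthread_rwlock_wrlock(&cfg_lock);  /* like freeze + quiesce */
            cfg_rbps = v;
            pthread_rwlock_unlock(&cfg_lock);  /* like unquiesce + unfreeze */
    }

    int main(void)
    {
            update_cfg(250000000ULL);
            printf("rbps=%llu\n", read_cfg());
            return 0;
    }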
