diff options
Diffstat (limited to 'drivers/cpuidle/governors/menu.c')
-rw-r--r-- | drivers/cpuidle/governors/menu.c | 212 |
1 files changed, 76 insertions, 136 deletions
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b96e3da0fedd..52d5d26fc7c6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -14,14 +14,12 @@ #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/sched.h> -#include <linux/sched/loadavg.h> #include <linux/sched/stat.h> #include <linux/math64.h> #include "gov.h" -#define BUCKETS 12 +#define BUCKETS 6 #define INTERVAL_SHIFT 3 #define INTERVALS (1UL << INTERVAL_SHIFT) #define RESOLUTION 1024 @@ -31,12 +29,11 @@ /* * Concepts and ideas behind the menu governor * - * For the menu governor, there are 3 decision factors for picking a C + * For the menu governor, there are 2 decision factors for picking a C * state: * 1) Energy break even point - * 2) Performance impact - * 3) Latency tolerance (from pmqos infrastructure) - * These three factors are treated independently. + * 2) Latency tolerance (from pmqos infrastructure) + * These two factors are treated independently. * * Energy break even point * ----------------------- @@ -44,7 +41,7 @@ * the C state is required to actually break even on this cost. CPUIDLE * provides us this duration in the "target_residency" field. So all that we * need is a good prediction of how long we'll be idle. Like the traditional - * menu governor, we start with the actual known "next timer event" time. + * menu governor, we take the actual known "next timer event" time. * * Since there are other source of wakeups (interrupts for example) than * the next timer event, this estimation is rather optimistic. To get a @@ -53,59 +50,21 @@ * duration always was 50% of the next timer tick, the correction factor will * be 0.5. * - * menu uses a running average for this correction factor, however it uses a - * set of factors, not just a single factor. This stems from the realization - * that the ratio is dependent on the order of magnitude of the expected - * duration; if we expect 500 milliseconds of idle time the likelihood of - * getting an interrupt very early is much higher than if we expect 50 micro - * seconds of idle time. A second independent factor that has big impact on - * the actual factor is if there is (disk) IO outstanding or not. - * (as a special twist, we consider every sleep longer than 50 milliseconds - * as perfect; there are no power gains for sleeping longer than this) - * - * For these two reasons we keep an array of 12 independent factors, that gets - * indexed based on the magnitude of the expected duration as well as the - * "is IO outstanding" property. + * menu uses a running average for this correction factor, but it uses a set of + * factors, not just a single factor. This stems from the realization that the + * ratio is dependent on the order of magnitude of the expected duration; if we + * expect 500 milliseconds of idle time the likelihood of getting an interrupt + * very early is much higher than if we expect 50 micro seconds of idle time. + * For this reason, menu keeps an array of 6 independent factors, that gets + * indexed based on the magnitude of the expected duration. * * Repeatable-interval-detector * ---------------------------- * There are some cases where "next timer" is a completely unusable predictor: * Those cases where the interval is fixed, for example due to hardware - * interrupt mitigation, but also due to fixed transfer rate devices such as - * mice. + * interrupt mitigation, but also due to fixed transfer rate devices like mice. * For this, we use a different predictor: We track the duration of the last 8 - * intervals and if the stand deviation of these 8 intervals is below a - * threshold value, we use the average of these intervals as prediction. - * - * Limiting Performance Impact - * --------------------------- - * C states, especially those with large exit latencies, can have a real - * noticeable impact on workloads, which is not acceptable for most sysadmins, - * and in addition, less performance has a power price of its own. - * - * As a general rule of thumb, menu assumes that the following heuristic - * holds: - * The busier the system, the less impact of C states is acceptable - * - * This rule-of-thumb is implemented using a performance-multiplier: - * If the exit latency times the performance multiplier is longer than - * the predicted duration, the C state is not considered a candidate - * for selection due to a too high performance impact. So the higher - * this multiplier is, the longer we need to be idle to pick a deep C - * state, and thus the less likely a busy CPU will hit such a deep - * C state. - * - * Two factors are used in determing this multiplier: - * a value of 10 is added for each point of "per cpu load average" we have. - * a value of 5 points is added for each process that is waiting for - * IO on this CPU. - * (these values are experimentally determined) - * - * The load average factor gives a longer term (few seconds) input to the - * decision, while the iowait value gives a cpu local instantanious input. - * The iowait factor may look low, but realize that this is also already - * represented in the system load average. - * + * intervals and use them to estimate the duration of the next one. */ struct menu_device { @@ -119,19 +78,10 @@ struct menu_device { int interval_ptr; }; -static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) +static inline int which_bucket(u64 duration_ns) { int bucket = 0; - /* - * We keep two groups of stats; one with no - * IO pending, one without. - * This allows us to calculate - * E(duration)|iowait - */ - if (nr_iowaiters) - bucket = BUCKETS/2; - if (duration_ns < 10ULL * NSEC_PER_USEC) return bucket; if (duration_ns < 100ULL * NSEC_PER_USEC) @@ -145,19 +95,6 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) return bucket + 5; } -/* - * Return a multiplier for the exit latency that is intended - * to take performance requirements into account. - * The more performance critical we estimate the system - * to be, the higher this multiplier, and thus the higher - * the barrier to go to an expensive C state. - */ -static inline int performance_multiplier(unsigned int nr_iowaiters) -{ - /* for IO wait tasks (per cpu!) we add 10x each */ - return 1 + 10 * nr_iowaiters; -} - static DEFINE_PER_CPU(struct menu_device, menu_devices); static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); @@ -170,53 +107,52 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); */ static unsigned int get_typical_interval(struct menu_device *data) { - int i, divisor; - unsigned int min, max, thresh, avg; - uint64_t sum, variance; - - thresh = INT_MAX; /* Discard outliers above this value */ + s64 value, min_thresh = -1, max_thresh = UINT_MAX; + unsigned int max, min, divisor; + u64 avg, variance, avg_sq; + int i; again: - - /* First calculate the average of past intervals */ - min = UINT_MAX; + /* Compute the average and variance of past intervals. */ max = 0; - sum = 0; + min = UINT_MAX; + avg = 0; + variance = 0; divisor = 0; for (i = 0; i < INTERVALS; i++) { - unsigned int value = data->intervals[i]; - if (value <= thresh) { - sum += value; - divisor++; - if (value > max) - max = value; - - if (value < min) - min = value; - } + value = data->intervals[i]; + /* + * Discard the samples outside the interval between the min and + * max thresholds. + */ + if (value <= min_thresh || value >= max_thresh) + continue; + + divisor++; + + avg += value; + variance += value * value; + + if (value > max) + max = value; + + if (value < min) + min = value; } if (!max) return UINT_MAX; - if (divisor == INTERVALS) - avg = sum >> INTERVAL_SHIFT; - else - avg = div_u64(sum, divisor); - - /* Then try to determine variance */ - variance = 0; - for (i = 0; i < INTERVALS; i++) { - unsigned int value = data->intervals[i]; - if (value <= thresh) { - int64_t diff = (int64_t)value - avg; - variance += diff * diff; - } - } - if (divisor == INTERVALS) + if (divisor == INTERVALS) { + avg >>= INTERVAL_SHIFT; variance >>= INTERVAL_SHIFT; - else + } else { + do_div(avg, divisor); do_div(variance, divisor); + } + + avg_sq = avg * avg; + variance -= avg_sq; /* * The typical interval is obtained when standard deviation is @@ -231,25 +167,40 @@ again: * Use this result only if there is no timer to wake us up sooner. */ if (likely(variance <= U64_MAX/36)) { - if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) - || variance <= 400) { + if ((avg_sq > variance * 36 && divisor * 4 >= INTERVALS * 3) || + variance <= 400) return avg; - } } /* - * If we have outliers to the upside in our distribution, discard - * those by setting the threshold to exclude these outliers, then + * If there are outliers, discard them by setting thresholds to exclude + * data points at a large enough distance from the average, then * calculate the average and standard deviation again. Once we get - * down to the bottom 3/4 of our samples, stop excluding samples. + * down to the last 3/4 of our samples, stop excluding samples. * * This can deal with workloads that have long pauses interspersed * with sporadic activity with a bunch of short pauses. */ - if ((divisor * 4) <= INTERVALS * 3) + if (divisor * 4 <= INTERVALS * 3) { + /* + * If there are sufficiently many data points still under + * consideration after the outliers have been eliminated, + * returning without a prediction would be a mistake because it + * is likely that the next interval will not exceed the current + * maximum, so return the latter in that case. + */ + if (divisor >= INTERVALS / 2) + return max; + return UINT_MAX; + } + + /* Update the thresholds for the next round. */ + if (avg - min > max - avg) + min_thresh = min; + else + max_thresh = max; - thresh = max - 1; goto again; } @@ -265,8 +216,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct menu_device *data = this_cpu_ptr(&menu_devices); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); u64 predicted_ns; - u64 interactivity_req; - unsigned int nr_iowaiters; ktime_t delta, delta_tick; int i, idx; @@ -275,8 +224,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->needs_update = 0; } - nr_iowaiters = nr_iowait_cpu(dev->cpu); - /* Find the shortest expected idle interval. */ predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; if (predicted_ns > RESIDENCY_THRESHOLD_NS) { @@ -290,7 +237,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } data->next_timer_ns = delta; - data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); + data->bucket = which_bucket(data->next_timer_ns); /* Round up the result for half microseconds. */ timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + @@ -308,7 +255,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ data->next_timer_ns = KTIME_MAX; delta_tick = TICK_NSEC / 2; - data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); + data->bucket = BUCKETS - 1; } if (unlikely(drv->state_count <= 1 || latency_req == 0) || @@ -335,15 +282,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ if (predicted_ns < TICK_NSEC) predicted_ns = data->next_timer_ns; - } else { - /* - * Use the performance multiplier and the user-configurable - * latency_req to determine the maximum exit latency. - */ - interactivity_req = div64_u64(predicted_ns, - performance_multiplier(nr_iowaiters)); - if (latency_req > interactivity_req) - latency_req = interactivity_req; + } else if (latency_req > predicted_ns) { + latency_req = predicted_ns; } /* |