From a5a0809bc58e133d674e45175b052c9bdf002f1d Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Sun, 23 Jul 2017 08:54:25 -0700
Subject: cpufreq: schedutil: Make iowait boost more energy efficient

Currently the iowait_boost feature in schedutil makes the frequency
go to max on iowait wakeups.  This feature was added to handle a case
that Peter described where the throughput of operations involving
continuous I/O requests [1] is reduced due to running at a lower
frequency, however the lower throughput itself causes utilization to
be low and hence causing frequency to be low hence its "stuck".

Instead of going to max, its also possible to achieve the same effect
by ramping up to max if there are repeated in_iowait wakeups
happening. This patch is an attempt to do that. We start from a lower
frequency (policy->min) and double the boost for every consecutive
iowait update until we reach the maximum iowait boost frequency
(iowait_boost_max).

I ran a synthetic test (continuous O_DIRECT writes in a loop) on an
x86 machine with intel_pstate in passive mode using schedutil.  In
this test the iowait_boost value ramped from 800MHz to 4GHz in 60ms.
The patch achieves the desired improved throughput as the existing
behavior.

[1] https://patchwork.kernel.org/patch/9735885/

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

(limited to 'kernel/sched/cpufreq_schedutil.c')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 29a397067ffa..148844a995a8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,6 +53,7 @@ struct sugov_cpu {
 	struct update_util_data update_util;
 	struct sugov_policy *sg_policy;
 
+	bool iowait_boost_pending;
 	unsigned long iowait_boost;
 	unsigned long iowait_boost_max;
 	u64 last_update;
@@ -169,30 +170,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
 				   unsigned int flags)
 {
 	if (flags & SCHED_CPUFREQ_IOWAIT) {
-		sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+		if (sg_cpu->iowait_boost_pending)
+			return;
+
+		sg_cpu->iowait_boost_pending = true;
+
+		if (sg_cpu->iowait_boost) {
+			sg_cpu->iowait_boost <<= 1;
+			if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+				sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+		} else {
+			sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+		}
 	} else if (sg_cpu->iowait_boost) {
 		s64 delta_ns = time - sg_cpu->last_update;
 
 		/* Clear iowait_boost if the CPU apprears to have been idle. */
-		if (delta_ns > TICK_NSEC)
+		if (delta_ns > TICK_NSEC) {
 			sg_cpu->iowait_boost = 0;
+			sg_cpu->iowait_boost_pending = false;
+		}
 	}
 }
 
 static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
 			       unsigned long *max)
 {
-	unsigned long boost_util = sg_cpu->iowait_boost;
-	unsigned long boost_max = sg_cpu->iowait_boost_max;
+	unsigned long boost_util, boost_max;
 
-	if (!boost_util)
+	if (!sg_cpu->iowait_boost)
 		return;
 
+	if (sg_cpu->iowait_boost_pending) {
+		sg_cpu->iowait_boost_pending = false;
+	} else {
+		sg_cpu->iowait_boost >>= 1;
+		if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+			sg_cpu->iowait_boost = 0;
+			return;
+		}
+	}
+
+	boost_util = sg_cpu->iowait_boost;
+	boost_max = sg_cpu->iowait_boost_max;
+
 	if (*util * boost_max < *max * boost_util) {
 		*util = boost_util;
 		*max = boost_max;
 	}
-	sg_cpu->iowait_boost >>= 1;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -264,6 +289,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 		delta_ns = time - j_sg_cpu->last_update;
 		if (delta_ns > TICK_NSEC) {
 			j_sg_cpu->iowait_boost = 0;
+			j_sg_cpu->iowait_boost_pending = false;
 			continue;
 		}
 		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
-- 
cgit 


From 251accf98591d7f59f7a2bac2e05c66d16bf2811 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Sun, 23 Jul 2017 08:54:26 -0700
Subject: cpufreq: schedutil: Use unsigned int for iowait boost

Make iowait_boost and iowait_boost_max as unsigned int since its unit
is kHz and this is consistent with struct cpufreq_policy.  Also change
the local variables in sugov_iowait_boost() to match this.

Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/cpufreq_schedutil.c')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 148844a995a8..ddd385f2a985 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -54,8 +54,8 @@ struct sugov_cpu {
 	struct sugov_policy *sg_policy;
 
 	bool iowait_boost_pending;
-	unsigned long iowait_boost;
-	unsigned long iowait_boost_max;
+	unsigned int iowait_boost;
+	unsigned int iowait_boost_max;
 	u64 last_update;
 
 	/* The fields below are only needed when sharing a policy. */
@@ -196,7 +196,7 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
 static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
 			       unsigned long *max)
 {
-	unsigned long boost_util, boost_max;
+	unsigned int boost_util, boost_max;
 
 	if (!sg_cpu->iowait_boost)
 		return;
-- 
cgit 


From 674e75411fc260b0d4532701228cfe12fc090da8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 28 Jul 2017 12:16:38 +0530
Subject: sched: cpufreq: Allow remote cpufreq callbacks

With Android UI and benchmarks the latency of cpufreq response to
certain scheduling events can become very critical. Currently, callbacks
into cpufreq governors are only made from the scheduler if the target
CPU of the event is the same as the current CPU. This means there are
certain situations where a target CPU may not run the cpufreq governor
for some time.

One testcase to show this behavior is where a task starts running on
CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the
system is configured such that the new tasks should receive maximum
demand initially, this should result in CPU0 increasing frequency
immediately. But because of the above mentioned limitation though, this
does not occur.

This patch updates the scheduler core to call the cpufreq callbacks for
remote CPUs as well.

The schedutil, ondemand and conservative governors are updated to
process cpufreq utilization update hooks called for remote CPUs where
the remote CPU is managed by the cpufreq policy of the local CPU.

The intel_pstate driver is updated to always reject remote callbacks.

This is tested with couple of usecases (Android: hackbench, recentfling,
galleryfling, vellamo, Ubuntu: hackbench) on ARM hikey board (64 bit
octa-core, single policy). Only galleryfling showed minor improvements,
while others didn't had much deviation.

The reason being that this patch only targets a corner case, where
following are required to be true to improve performance and that
doesn't happen too often with these tests:

- Task is migrated to another CPU.
- The task has high demand, and should take the target CPU to higher
  OPPs.
- And the target CPU doesn't call into the cpufreq governor until the
  next tick.

Based on initial work from Steve Muckle.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Saravana Kannan <skannan@codeaurora.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

(limited to 'kernel/sched/cpufreq_schedutil.c')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index ddd385f2a985..7dbc76801f86 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -52,6 +52,7 @@ struct sugov_policy {
 struct sugov_cpu {
 	struct update_util_data update_util;
 	struct sugov_policy *sg_policy;
+	unsigned int cpu;
 
 	bool iowait_boost_pending;
 	unsigned int iowait_boost;
@@ -77,6 +78,21 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 {
 	s64 delta_ns;
 
+	/*
+	 * Since cpufreq_update_util() is called with rq->lock held for
+	 * the @target_cpu, our per-cpu data is fully serialized.
+	 *
+	 * However, drivers cannot in general deal with cross-cpu
+	 * requests, so while get_next_freq() will work, our
+	 * sugov_update_commit() call may not.
+	 *
+	 * Hence stop here for remote requests if they aren't supported
+	 * by the hardware, as calculating the frequency is pointless if
+	 * we cannot in fact act on it.
+	 */
+	if (!cpufreq_can_do_remote_dvfs(sg_policy->policy))
+		return false;
+
 	if (sg_policy->work_in_progress)
 		return false;
 
@@ -155,12 +171,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
 {
-	struct rq *rq = this_rq();
+	struct rq *rq = cpu_rq(cpu);
 	unsigned long cfs_max;
 
-	cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+	cfs_max = arch_scale_cpu_capacity(NULL, cpu);
 
 	*util = min(rq->cfs.avg.util_avg, cfs_max);
 	*max = cfs_max;
@@ -254,7 +270,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 	if (flags & SCHED_CPUFREQ_RT_DL) {
 		next_f = policy->cpuinfo.max_freq;
 	} else {
-		sugov_get_util(&util, &max);
+		sugov_get_util(&util, &max, sg_cpu->cpu);
 		sugov_iowait_boost(sg_cpu, &util, &max);
 		next_f = get_next_freq(sg_policy, util, max);
 		/*
@@ -316,7 +332,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 	unsigned long util, max;
 	unsigned int next_f;
 
-	sugov_get_util(&util, &max);
+	sugov_get_util(&util, &max, sg_cpu->cpu);
 
 	raw_spin_lock(&sg_policy->update_lock);
 
@@ -697,6 +713,11 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 
 static int __init sugov_register(void)
 {
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu(sugov_cpu, cpu).cpu = cpu;
+
 	return cpufreq_register_governor(&schedutil_gov);
 }
 fs_initcall(sugov_register);
-- 
cgit 


From 209887e6b974c22328487b55d0f390522b014b03 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 9 Aug 2017 10:21:46 +0530
Subject: cpufreq: Return 0 from ->fast_switch() on errors

CPUFREQ_ENTRY_INVALID is a special symbol which is used to specify that
an entry in the cpufreq table is invalid. But using it outside of the
scope of the cpufreq table looks a bit incorrect.

We can represent an invalid frequency by writing it as 0 instead if we
need. Note that it is already done that way for the return value of the
->get() callback.

Lets do the same for ->fast_switch() and not use CPUFREQ_ENTRY_INVALID
outside of the scope of cpufreq table.

Also update the comment over cpufreq_driver_fast_switch() to clearly
mention what this returns.

None of the drivers return CPUFREQ_ENTRY_INVALID as of now from
->fast_switch() callback and so we don't need to update any of those.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/cpufreq_schedutil.c')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7dbc76801f86..2ba04bb3182a 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -123,7 +123,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
 
 	if (policy->fast_switch_enabled) {
 		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
-		if (next_freq == CPUFREQ_ENTRY_INVALID)
+		if (!next_freq)
 			return;
 
 		policy->cur = next_freq;
-- 
cgit 


From e2cabe48c20efb174ce0c01190f8b9c5f3ea1d13 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 10 Aug 2017 09:50:55 +0530
Subject: cpufreq: schedutil: Don't restrict kthread to related_cpus
 unnecessarily

Utilization update callbacks are now processed remotely, even on the
CPUs that don't share cpufreq policy with the target CPU (if
dvfs_possible_from_any_cpu flag is set).

But in non-fast switch paths, the frequency is changed only from one of
policy->related_cpus. This happens because the kthread which does the
actual update is bound to a subset of CPUs (i.e. related_cpus).

Allow frequency to be remotely updated as well (i.e. call
__cpufreq_driver_target()) if dvfs_possible_from_any_cpu flag is set.

Reported-by: Pavan Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/cpufreq_schedutil.c')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2ba04bb3182a..69571ee6a175 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -487,7 +487,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 	}
 
 	sg_policy->thread = thread;
-	kthread_bind_mask(thread, policy->related_cpus);
+
+	/* Kthread is bound to all CPUs by default */
+	if (!policy->dvfs_possible_from_any_cpu)
+		kthread_bind_mask(thread, policy->related_cpus);
+
 	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
 	mutex_init(&sg_policy->work_lock);
 
-- 
cgit 


From c49cbc19b31e069cb344921c7286d7549767d10e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 14 Aug 2017 14:50:16 +0530
Subject: cpufreq: schedutil: Always process remote callback with slow
 switching

The frequency update from the utilization update handlers can be divided
into two parts:

(A) Finding the next frequency
(B) Updating the frequency

While any CPU can do (A), (B) can be restricted to a group of CPUs only,
depending on the current platform.

For platforms where fast cpufreq switching is possible, both (A) and (B)
are always done from the same CPU and that CPU should be capable of
changing the frequency of the target CPU.

But for platforms where fast cpufreq switching isn't possible, after
doing (A) we wake up a kthread which will eventually do (B). This
kthread is already bound to the right set of CPUs, i.e. only those which
can change the frequency of CPUs of a cpufreq policy. And so any CPU
can actually do (A) in this case, as the frequency is updated from the
right set of CPUs only.

Check cpufreq_can_do_remote_dvfs() only for the fast switching case.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/cpufreq_schedutil.c')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69571ee6a175..a07f17a5f38f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -84,13 +84,18 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 	 *
 	 * However, drivers cannot in general deal with cross-cpu
 	 * requests, so while get_next_freq() will work, our
-	 * sugov_update_commit() call may not.
+	 * sugov_update_commit() call may not for the fast switching platforms.
 	 *
 	 * Hence stop here for remote requests if they aren't supported
 	 * by the hardware, as calculating the frequency is pointless if
 	 * we cannot in fact act on it.
+	 *
+	 * For the slow switching platforms, the kthread is always scheduled on
+	 * the right set of CPUs and any CPU can find the next frequency and
+	 * schedule the kthread.
 	 */
-	if (!cpufreq_can_do_remote_dvfs(sg_policy->policy))
+	if (sg_policy->policy->fast_switch_enabled &&
+	    !cpufreq_can_do_remote_dvfs(sg_policy->policy))
 		return false;
 
 	if (sg_policy->work_in_progress)
-- 
cgit