summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 14:26:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 14:26:28 -0700
commit02b82b02c34321dde10d003aafcd831a769b2a8a (patch)
treeead9a121b6fb2d37bbcc054f711b09d5bd788518 /drivers
parent242ba6656d604aa8dc87451fc08143cb28d5a587 (diff)
parentec3d8b8365e9865b43099e943ec5f0bc12f28f96 (diff)
Merge tag 'pm-5.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull power management updates from Rafael Wysocki: "These are mostly fixes and cleanups all over the code and a new piece of documentation for Intel uncore frequency scaling. Functionality-wise, the intel_idle driver will support Sapphire Rapids Xeons natively now (with some extra facilities for controlling C-states more precisely on those systems), virtual guests will take the ACPI S4 hardware signature into account by default, the intel_pstate driver will take the defualt EPP value from the firmware, cpupower utility will support the AMD P-state driver added in the previous cycle, and there is a new tracer utility for that driver. Specifics: - Allow device_pm_check_callbacks() to be called from interrupt context without issues (Dmitry Baryshkov). - Modify devm_pm_runtime_enable() to automatically handle pm_runtime_dont_use_autosuspend() at driver exit time (Douglas Anderson). - Make the schedutil cpufreq governor use to_gov_attr_set() instead of open coding it (Kevin Hao). - Replace acpi_bus_get_device() with acpi_fetch_acpi_dev() in the cpufreq longhaul driver (Rafael Wysocki). - Unify show() and store() naming in cpufreq and make it use __ATTR_XX (Lianjie Zhang). - Make the intel_pstate driver use the EPP value set by the firmware by default (Srinivas Pandruvada). - Re-order the init checks in the powernow-k8 cpufreq driver (Mario Limonciello). - Make the ACPI processor idle driver check for architectural support for LPI to avoid using it on x86 by mistake (Mario Limonciello). - Add Sapphire Rapids Xeon support to the intel_idle driver (Artem Bityutskiy). - Add 'preferred_cstates' module argument to the intel_idle driver to work around C1 and C1E handling issue on Sapphire Rapids (Artem Bityutskiy). - Add core C6 optimization on Sapphire Rapids to the intel_idle driver (Artem Bityutskiy). - Optimize the haltpoll cpuidle driver a bit (Li RongQing). - Remove leftover text from intel_idle() kerneldoc comment and fix up white space in intel_idle (Rafael Wysocki). - Fix load_image_and_restore() error path (Ye Bin). - Fix typos in comments in the system wakeup hadling code (Tom Rix). - Clean up non-kernel-doc comments in hibernation code (Jiapeng Chong). - Fix __setup handler error handling in system-wide suspend and hibernation core code (Randy Dunlap). - Add device name to suspend_report_result() (Youngjin Jang). - Make virtual guests honour ACPI S4 hardware signature by default (David Woodhouse). - Block power off of a parent PM domain unless child is in deepest state (Ulf Hansson). - Use dev_err_probe() to simplify error handling for generic PM domains (Ahmad Fatoum). - Fix sleep-in-atomic bug caused by genpd_debug_remove() (Shawn Guo). - Document Intel uncore frequency scaling (Srinivas Pandruvada). - Add DTPM hierarchy description (Daniel Lezcano). - Change the locking scheme in DTPM (Daniel Lezcano). - Fix dtpm_cpu cleanup at exit time and missing virtual DTPM pointer release (Daniel Lezcano). - Make dtpm_node_callback[] static (kernel test robot). - Fix spelling mistake "initialze" -> "initialize" in dtpm_create_hierarchy() (Colin Ian King). - Add tracer tool for the amd-pstate driver (Jinzhou Su). - Fix PC6 displaying in turbostat on some systems (Artem Bityutskiy). - Add AMD P-State support to the cpupower utility (Huang Rui)" * tag 'pm-5.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm: (58 commits) cpufreq: powernow-k8: Re-order the init checks cpuidle: intel_idle: Drop redundant backslash at line end cpuidle: intel_idle: Update intel_idle() kerneldoc comment PM: hibernate: Honour ACPI hardware signature by default for virtual guests cpufreq: intel_pstate: Use firmware default EPP cpufreq: unify show() and store() naming and use __ATTR_XX PM: core: keep irq flags in device_pm_check_callbacks() cpuidle: haltpoll: Call cpuidle_poll_state_init() later Documentation: amd-pstate: add tracer tool introduction tools/power/x86/amd_pstate_tracer: Add tracer tool for AMD P-state tools/power/x86/intel_pstate_tracer: make tracer as a module cpufreq: amd-pstate: Add more tracepoint for AMD P-State module PM: sleep: Add device name to suspend_report_result() turbostat: fix PC6 displaying on some systems intel_idle: add core C6 optimization for SPR intel_idle: add 'preferred_cstates' module argument intel_idle: add SPR support PM: runtime: Have devm_pm_runtime_enable() handle pm_runtime_dont_use_autosuspend() ACPI: processor idle: Check for architectural support for LPI cpuidle: PSCI: Move the `has_lpi` check to the beginning of the function ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/processor_idle.c15
-rw-r--r--drivers/acpi/sleep.c11
-rw-r--r--drivers/base/power/domain.c42
-rw-r--r--drivers/base/power/main.c16
-rw-r--r--drivers/base/power/runtime.c5
-rw-r--r--drivers/base/power/wakeirq.c2
-rw-r--r--drivers/base/power/wakeup.c4
-rw-r--r--drivers/cpufreq/amd-pstate-trace.h22
-rw-r--r--drivers/cpufreq/amd-pstate.c59
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c10
-rw-r--r--drivers/cpufreq/cpufreq_governor.c6
-rw-r--r--drivers/cpufreq/cpufreq_governor.h12
-rw-r--r--drivers/cpufreq/cpufreq_governor_attr_set.c5
-rw-r--r--drivers/cpufreq/cpufreq_ondemand.c10
-rw-r--r--drivers/cpufreq/intel_pstate.c38
-rw-r--r--drivers/cpufreq/longhaul.c4
-rw-r--r--drivers/cpufreq/powernow-k8.c6
-rw-r--r--drivers/cpuidle/cpuidle-haltpoll.c4
-rw-r--r--drivers/idle/intel_idle.c111
-rw-r--r--drivers/pci/pci-driver.c14
-rw-r--r--drivers/pnp/driver.c2
-rw-r--r--drivers/powercap/Kconfig8
-rw-r--r--drivers/powercap/Makefile1
-rw-r--r--drivers/powercap/dtpm.c333
-rw-r--r--drivers/powercap/dtpm_cpu.c55
-rw-r--r--drivers/powercap/dtpm_devfreq.c203
-rw-r--r--drivers/powercap/dtpm_subsys.h22
-rw-r--r--drivers/soc/rockchip/Kconfig8
-rw-r--r--drivers/soc/rockchip/Makefile1
-rw-r--r--drivers/soc/rockchip/dtpm.c65
-rw-r--r--drivers/usb/core/hcd-pci.c4
31 files changed, 925 insertions, 173 deletions
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index f8e9fa82cb9b..32b20efff5f8 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1080,6 +1080,11 @@ static int flatten_lpi_states(struct acpi_processor *pr,
return 0;
}
+int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu)
+{
+ return -EOPNOTSUPP;
+}
+
static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
{
int ret, i;
@@ -1088,6 +1093,11 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
struct acpi_device *d = NULL;
struct acpi_lpi_states_array info[2], *tmp, *prev, *curr;
+ /* make sure our architecture has support */
+ ret = acpi_processor_ffh_lpi_probe(pr->id);
+ if (ret == -EOPNOTSUPP)
+ return ret;
+
if (!osc_pc_lpi_support_confirmed)
return -EOPNOTSUPP;
@@ -1139,11 +1149,6 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
return 0;
}
-int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu)
-{
- return -ENODEV;
-}
-
int __weak acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
{
return -ENODEV;
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index d068ff42fce4..c992e57b2c79 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -871,12 +871,7 @@ static inline void acpi_sleep_syscore_init(void) {}
#ifdef CONFIG_HIBERNATION
static unsigned long s4_hardware_signature;
static struct acpi_table_facs *facs;
-static int sigcheck = -1; /* Default behaviour is just to warn */
-
-void __init acpi_check_s4_hw_signature(int check)
-{
- sigcheck = check;
-}
+int acpi_check_s4_hw_signature = -1; /* Default behaviour is just to warn */
static int acpi_hibernation_begin(pm_message_t stage)
{
@@ -1001,7 +996,7 @@ static void acpi_sleep_hibernate_setup(void)
hibernation_set_ops(old_suspend_ordering ?
&acpi_hibernation_ops_old : &acpi_hibernation_ops);
sleep_states[ACPI_STATE_S4] = 1;
- if (!sigcheck)
+ if (!acpi_check_s4_hw_signature)
return;
acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs);
@@ -1013,7 +1008,7 @@ static void acpi_sleep_hibernate_setup(void)
*/
s4_hardware_signature = facs->hardware_signature;
- if (sigcheck > 0) {
+ if (acpi_check_s4_hw_signature > 0) {
/*
* If we're actually obeying the ACPI specification
* then the signature is written out as part of the
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 5db704f02e71..1ee878d126fd 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -636,6 +636,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on,
atomic_read(&genpd->sd_count) > 0)
return -EBUSY;
+ /*
+ * The children must be in their deepest (powered-off) states to allow
+ * the parent to be powered off. Note that, there's no need for
+ * additional locking, as powering on a child, requires the parent's
+ * lock to be acquired first.
+ */
+ list_for_each_entry(link, &genpd->parent_links, parent_node) {
+ struct generic_pm_domain *child = link->child;
+ if (child->state_idx < child->state_count - 1)
+ return -EBUSY;
+ }
+
list_for_each_entry(pdd, &genpd->dev_list, list_node) {
enum pm_qos_flags_status stat;
@@ -1073,6 +1085,13 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock,
|| atomic_read(&genpd->sd_count) > 0)
return;
+ /* Check that the children are in their deepest (powered-off) state. */
+ list_for_each_entry(link, &genpd->parent_links, parent_node) {
+ struct generic_pm_domain *child = link->child;
+ if (child->state_idx < child->state_count - 1)
+ return;
+ }
+
/* Choose the deepest state when suspending */
genpd->state_idx = genpd->state_count - 1;
if (_genpd_power_off(genpd, false))
@@ -2058,9 +2077,9 @@ static int genpd_remove(struct generic_pm_domain *genpd)
kfree(link);
}
- genpd_debug_remove(genpd);
list_del(&genpd->gpd_list_node);
genpd_unlock(genpd);
+ genpd_debug_remove(genpd);
cancel_work_sync(&genpd->power_off_work);
if (genpd_is_cpu_domain(genpd))
free_cpumask_var(genpd->cpus);
@@ -2248,12 +2267,8 @@ int of_genpd_add_provider_simple(struct device_node *np,
/* Parse genpd OPP table */
if (genpd->set_performance_state) {
ret = dev_pm_opp_of_add_table(&genpd->dev);
- if (ret) {
- if (ret != -EPROBE_DEFER)
- dev_err(&genpd->dev, "Failed to add OPP table: %d\n",
- ret);
- return ret;
- }
+ if (ret)
+ return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n");
/*
* Save table for faster processing while setting performance
@@ -2312,9 +2327,8 @@ int of_genpd_add_provider_onecell(struct device_node *np,
if (genpd->set_performance_state) {
ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i);
if (ret) {
- if (ret != -EPROBE_DEFER)
- dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n",
- i, ret);
+ dev_err_probe(&genpd->dev, ret,
+ "Failed to add OPP table for index %d\n", i);
goto error;
}
@@ -2672,12 +2686,8 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
ret = genpd_add_device(pd, dev, base_dev);
mutex_unlock(&gpd_list_lock);
- if (ret < 0) {
- if (ret != -EPROBE_DEFER)
- dev_err(dev, "failed to add to PM domain %s: %d",
- pd->name, ret);
- return ret;
- }
+ if (ret < 0)
+ return dev_err_probe(dev, ret, "failed to add to PM domain %s\n", pd->name);
dev->pm_domain->detach = genpd_dev_pm_detach;
dev->pm_domain->sync = genpd_dev_pm_sync;
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 04ea92cbd9cf..c50139207794 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -485,7 +485,7 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev,
trace_device_pm_callback_start(dev, info, state.event);
error = cb(dev);
trace_device_pm_callback_end(dev, error);
- suspend_report_result(cb, error);
+ suspend_report_result(dev, cb, error);
initcall_debug_report(dev, calltime, cb, error);
@@ -1568,7 +1568,7 @@ static int legacy_suspend(struct device *dev, pm_message_t state,
trace_device_pm_callback_start(dev, info, state.event);
error = cb(dev, state);
trace_device_pm_callback_end(dev, error);
- suspend_report_result(cb, error);
+ suspend_report_result(dev, cb, error);
initcall_debug_report(dev, calltime, cb, error);
@@ -1855,7 +1855,7 @@ unlock:
device_unlock(dev);
if (ret < 0) {
- suspend_report_result(callback, ret);
+ suspend_report_result(dev, callback, ret);
pm_runtime_put(dev);
return ret;
}
@@ -1960,10 +1960,10 @@ int dpm_suspend_start(pm_message_t state)
}
EXPORT_SYMBOL_GPL(dpm_suspend_start);
-void __suspend_report_result(const char *function, void *fn, int ret)
+void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret)
{
if (ret)
- pr_err("%s(): %pS returns %d\n", function, fn, ret);
+ dev_err(dev, "%s(): %pS returns %d\n", function, fn, ret);
}
EXPORT_SYMBOL_GPL(__suspend_report_result);
@@ -2018,7 +2018,9 @@ static bool pm_ops_is_empty(const struct dev_pm_ops *ops)
void device_pm_check_callbacks(struct device *dev)
{
- spin_lock_irq(&dev->power.lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->power.lock, flags);
dev->power.no_pm_callbacks =
(!dev->bus || (pm_ops_is_empty(dev->bus->pm) &&
!dev->bus->suspend && !dev->bus->resume)) &&
@@ -2027,7 +2029,7 @@ void device_pm_check_callbacks(struct device *dev)
(!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) &&
(!dev->driver || (pm_ops_is_empty(dev->driver->pm) &&
!dev->driver->suspend && !dev->driver->resume));
- spin_unlock_irq(&dev->power.lock);
+ spin_unlock_irqrestore(&dev->power.lock, flags);
}
bool dev_pm_skip_suspend(struct device *dev)
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 2f3cce17219b..d4059e6ffeae 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1476,11 +1476,16 @@ EXPORT_SYMBOL_GPL(pm_runtime_enable);
static void pm_runtime_disable_action(void *data)
{
+ pm_runtime_dont_use_autosuspend(data);
pm_runtime_disable(data);
}
/**
* devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable.
+ *
+ * NOTE: this will also handle calling pm_runtime_dont_use_autosuspend() for
+ * you at driver exit time if needed.
+ *
* @dev: Device to handle.
*/
int devm_pm_runtime_enable(struct device *dev)
diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c
index 0004db4a9d3b..d487a6bac630 100644
--- a/drivers/base/power/wakeirq.c
+++ b/drivers/base/power/wakeirq.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq);
*
* Enables wakeirq conditionally. We need to enable wake-up interrupt
* lazily on the first rpm_suspend(). This is needed as the consumer device
- * starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would
+ * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would
* otherwise try to disable already disabled wakeirq. The wake-up interrupt
* starts disabled with IRQ_NOAUTOEN set.
*
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 8666590201c9..a57d469676ca 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -587,7 +587,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws)
* @ws: Wakeup source to handle.
*
* Update the @ws' statistics and, if @ws has just been activated, notify the PM
- * core of the event by incrementing the counter of of wakeup events being
+ * core of the event by incrementing the counter of the wakeup events being
* processed.
*/
static void wakeup_source_activate(struct wakeup_source *ws)
@@ -733,7 +733,7 @@ static void wakeup_source_deactivate(struct wakeup_source *ws)
/*
* Increment the counter of registered wakeup events and decrement the
- * couter of wakeup events in progress simultaneously.
+ * counter of wakeup events in progress simultaneously.
*/
cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count);
trace_wakeup_source_deactivate(ws->name, cec);
diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h
index 647505957d4f..35f38ae67fb1 100644
--- a/drivers/cpufreq/amd-pstate-trace.h
+++ b/drivers/cpufreq/amd-pstate-trace.h
@@ -27,6 +27,10 @@ TRACE_EVENT(amd_pstate_perf,
TP_PROTO(unsigned long min_perf,
unsigned long target_perf,
unsigned long capacity,
+ u64 freq,
+ u64 mperf,
+ u64 aperf,
+ u64 tsc,
unsigned int cpu_id,
bool changed,
bool fast_switch
@@ -35,6 +39,10 @@ TRACE_EVENT(amd_pstate_perf,
TP_ARGS(min_perf,
target_perf,
capacity,
+ freq,
+ mperf,
+ aperf,
+ tsc,
cpu_id,
changed,
fast_switch
@@ -44,6 +52,10 @@ TRACE_EVENT(amd_pstate_perf,
__field(unsigned long, min_perf)
__field(unsigned long, target_perf)
__field(unsigned long, capacity)
+ __field(unsigned long long, freq)
+ __field(unsigned long long, mperf)
+ __field(unsigned long long, aperf)
+ __field(unsigned long long, tsc)
__field(unsigned int, cpu_id)
__field(bool, changed)
__field(bool, fast_switch)
@@ -53,15 +65,23 @@ TRACE_EVENT(amd_pstate_perf,
__entry->min_perf = min_perf;
__entry->target_perf = target_perf;
__entry->capacity = capacity;
+ __entry->freq = freq;
+ __entry->mperf = mperf;
+ __entry->aperf = aperf;
+ __entry->tsc = tsc;
__entry->cpu_id = cpu_id;
__entry->changed = changed;
__entry->fast_switch = fast_switch;
),
- TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu cpu_id=%u changed=%s fast_switch=%s",
+ TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u changed=%s fast_switch=%s",
(unsigned long)__entry->min_perf,
(unsigned long)__entry->target_perf,
(unsigned long)__entry->capacity,
+ (unsigned long long)__entry->freq,
+ (unsigned long long)__entry->mperf,
+ (unsigned long long)__entry->aperf,
+ (unsigned long long)__entry->tsc,
(unsigned int)__entry->cpu_id,
(__entry->changed) ? "true" : "false",
(__entry->fast_switch) ? "true" : "false"
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 9ce75ed11f8e..7be38bc6a673 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -66,6 +66,18 @@ MODULE_PARM_DESC(shared_mem,
static struct cpufreq_driver amd_pstate_driver;
/**
+ * struct amd_aperf_mperf
+ * @aperf: actual performance frequency clock count
+ * @mperf: maximum performance frequency clock count
+ * @tsc: time stamp counter
+ */
+struct amd_aperf_mperf {
+ u64 aperf;
+ u64 mperf;
+ u64 tsc;
+};
+
+/**
* struct amd_cpudata - private CPU data for AMD P-State
* @cpu: CPU number
* @req: constraint request to apply
@@ -81,6 +93,9 @@ static struct cpufreq_driver amd_pstate_driver;
* @min_freq: the frequency that mapped to lowest_perf
* @nominal_freq: the frequency that mapped to nominal_perf
* @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf
+ * @cur: Difference of Aperf/Mperf/tsc count between last and current sample
+ * @prev: Last Aperf/Mperf/tsc count value read from register
+ * @freq: current cpu frequency value
* @boost_supported: check whether the Processor or SBIOS supports boost mode
*
* The amd_cpudata is key private data for each CPU thread in AMD P-State, and
@@ -102,6 +117,10 @@ struct amd_cpudata {
u32 nominal_freq;
u32 lowest_nonlinear_freq;
+ struct amd_aperf_mperf cur;
+ struct amd_aperf_mperf prev;
+
+ u64 freq;
bool boost_supported;
};
@@ -211,6 +230,39 @@ static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata,
max_perf, fast_switch);
}
+static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)
+{
+ u64 aperf, mperf, tsc;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ tsc = rdtsc();
+
+ if (cpudata->prev.mperf == mperf || cpudata->prev.tsc == tsc) {
+ local_irq_restore(flags);
+ return false;
+ }
+
+ local_irq_restore(flags);
+
+ cpudata->cur.aperf = aperf;
+ cpudata->cur.mperf = mperf;
+ cpudata->cur.tsc = tsc;
+ cpudata->cur.aperf -= cpudata->prev.aperf;
+ cpudata->cur.mperf -= cpudata->prev.mperf;
+ cpudata->cur.tsc -= cpudata->prev.tsc;
+
+ cpudata->prev.aperf = aperf;
+ cpudata->prev.mperf = mperf;
+ cpudata->prev.tsc = tsc;
+
+ cpudata->freq = div64_u64((cpudata->cur.aperf * cpu_khz), cpudata->cur.mperf);
+
+ return true;
+}
+
static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
u32 des_perf, u32 max_perf, bool fast_switch)
{
@@ -226,8 +278,11 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
value &= ~AMD_CPPC_MAX_PERF(~0L);
value |= AMD_CPPC_MAX_PERF(max_perf);
- trace_amd_pstate_perf(min_perf, des_perf, max_perf,
- cpudata->cpu, (value != prev), fast_switch);
+ if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) {
+ trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
+ cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc,
+ cpudata->cpu, (value != prev), fast_switch);
+ }
if (value == prev)
return;
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 08515f7e515f..b6bd0ff35323 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -146,7 +146,7 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
/************************** sysfs interface ************************/
-static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
+static ssize_t sampling_down_factor_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -161,7 +161,7 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
return count;
}
-static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
+static ssize_t up_threshold_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -177,7 +177,7 @@ static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
return count;
}
-static ssize_t store_down_threshold(struct gov_attr_set *attr_set,
+static ssize_t down_threshold_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -195,7 +195,7 @@ static ssize_t store_down_threshold(struct gov_attr_set *attr_set,
return count;
}
-static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
+static ssize_t ignore_nice_load_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -220,7 +220,7 @@ static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
return count;
}
-static ssize_t store_freq_step(struct gov_attr_set *attr_set, const char *buf,
+static ssize_t freq_step_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 63f7c219062b..0d42cf8b88d8 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(gov_dbs_data_mutex);
/* Common sysfs tunables */
/*
- * store_sampling_rate - update sampling rate effective immediately if needed.
+ * sampling_rate_store - update sampling rate effective immediately if needed.
*
* If new rate is smaller than the old, simply updating
* dbs.sampling_rate might not be appropriate. For example, if the
@@ -41,7 +41,7 @@ static DEFINE_MUTEX(gov_dbs_data_mutex);
* This must be called with dbs_data->mutex held, otherwise traversing
* policy_dbs_list isn't safe.
*/
-ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
+ssize_t sampling_rate_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -80,7 +80,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
return count;
}
-EXPORT_SYMBOL_GPL(store_sampling_rate);
+EXPORT_SYMBOL_GPL(sampling_rate_store);
/**
* gov_update_cpu_data - Update CPU load data.
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index bab8e6140377..a5a0bc3cc23e 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -51,7 +51,7 @@ static inline struct dbs_data *to_dbs_data(struct gov_attr_set *attr_set)
}
#define gov_show_one(_gov, file_name) \
-static ssize_t show_##file_name \
+static ssize_t file_name##_show \
(struct gov_attr_set *attr_set, char *buf) \
{ \
struct dbs_data *dbs_data = to_dbs_data(attr_set); \
@@ -60,7 +60,7 @@ static ssize_t show_##file_name \
}
#define gov_show_one_common(file_name) \
-static ssize_t show_##file_name \
+static ssize_t file_name##_show \
(struct gov_attr_set *attr_set, char *buf) \
{ \
struct dbs_data *dbs_data = to_dbs_data(attr_set); \
@@ -68,12 +68,10 @@ static ssize_t show_##file_name \
}
#define gov_attr_ro(_name) \
-static struct governor_attr _name = \
-__ATTR(_name, 0444, show_##_name, NULL)
+static struct governor_attr _name = __ATTR_RO(_name)
#define gov_attr_rw(_name) \
-static struct governor_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
+static struct governor_attr _name = __ATTR_RW(_name)
/* Common to all CPUs of a policy */
struct policy_dbs_info {
@@ -176,7 +174,7 @@ void od_register_powersave_bias_handler(unsigned int (*f)
(struct cpufreq_policy *, unsigned int, unsigned int),
unsigned int powersave_bias);
void od_unregister_powersave_bias_handler(void);
-ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
+ssize_t sampling_rate_store(struct gov_attr_set *attr_set, const char *buf,
size_t count);
void gov_update_cpu_data(struct dbs_data *dbs_data);
#endif /* _CPUFREQ_GOVERNOR_H */
diff --git a/drivers/cpufreq/cpufreq_governor_attr_set.c b/drivers/cpufreq/cpufreq_governor_attr_set.c
index a6f365b9cc1a..771770ea0ed0 100644
--- a/drivers/cpufreq/cpufreq_governor_attr_set.c
+++ b/drivers/cpufreq/cpufreq_governor_attr_set.c
@@ -8,11 +8,6 @@
#include "cpufreq_governor.h"
-static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj)
-{
- return container_of(kobj, struct gov_attr_set, kobj);
-}
-
static inline struct governor_attr *to_gov_attr(struct attribute *attr)
{
return container_of(attr, struct governor_attr, attr);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 6a41ea4729b8..e8fbf970ff07 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -202,7 +202,7 @@ static unsigned int od_dbs_update(struct cpufreq_policy *policy)
/************************** sysfs interface ************************/
static struct dbs_governor od_dbs_gov;
-static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf,
+static ssize_t io_is_busy_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -220,7 +220,7 @@ static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf,
return count;
}
-static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
+static ssize_t up_threshold_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -237,7 +237,7 @@ static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
return count;
}
-static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
+static ssize_t sampling_down_factor_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -265,7 +265,7 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
return count;
}
-static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
+static ssize_t ignore_nice_load_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@@ -290,7 +290,7 @@ static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
return count;
}
-static ssize_t store_powersave_bias(struct gov_attr_set *attr_set,
+static ssize_t powersave_bias_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index bc7f7e6759bd..846bb3a78788 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1692,6 +1692,37 @@ static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata)
}
}
+static void intel_pstate_update_epp_defaults(struct cpudata *cpudata)
+{
+ cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
+
+ /*
+ * If this CPU gen doesn't call for change in balance_perf
+ * EPP return.
+ */
+ if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE)
+ return;
+
+ /*
+ * If powerup EPP is something other than chipset default 0x80 and
+ * - is more performance oriented than 0x80 (default balance_perf EPP)
+ * - But less performance oriented than performance EPP
+ * then use this as new balance_perf EPP.
+ */
+ if (cpudata->epp_default < HWP_EPP_BALANCE_PERFORMANCE &&
+ cpudata->epp_default > HWP_EPP_PERFORMANCE) {
+ epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = cpudata->epp_default;
+ return;
+ }
+
+ /*
+ * Use hard coded value per gen to update the balance_perf
+ * and default EPP.
+ */
+ cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE];
+ intel_pstate_set_epp(cpudata, cpudata->epp_default);
+}
+
static void intel_pstate_hwp_enable(struct cpudata *cpudata)
{
/* First disable HWP notification interrupt till we activate again */
@@ -1705,12 +1736,7 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
if (cpudata->epp_default >= 0)
return;
- if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) {
- cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
- } else {
- cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE];
- intel_pstate_set_epp(cpudata, cpudata->epp_default);
- }
+ intel_pstate_update_epp_defaults(cpudata);
}
static int atom_get_min_pstate(void)
diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c
index c538a153ee82..3e000e1a75c6 100644
--- a/drivers/cpufreq/longhaul.c
+++ b/drivers/cpufreq/longhaul.c
@@ -668,9 +668,9 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
u32 nesting_level,
void *context, void **return_value)
{
- struct acpi_device *d;
+ struct acpi_device *d = acpi_fetch_acpi_dev(obj_handle);
- if (acpi_bus_get_device(obj_handle, &d))
+ if (!d)
return 0;
*return_value = acpi_driver_data(d);
diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c
index 12ab4014af71..d289036beff2 100644
--- a/drivers/cpufreq/powernow-k8.c
+++ b/drivers/cpufreq/powernow-k8.c
@@ -1172,14 +1172,14 @@ static int powernowk8_init(void)
unsigned int i, supported_cpus = 0;
int ret;
+ if (!x86_match_cpu(powernow_k8_ids))
+ return -ENODEV;
+
if (boot_cpu_has(X86_FEATURE_HW_PSTATE)) {
__request_acpi_cpufreq();
return -ENODEV;
}
- if (!x86_match_cpu(powernow_k8_ids))
- return -ENODEV;
-
cpus_read_lock();
for_each_online_cpu(i) {
smp_call_function_single(i, check_supported_cpu, &ret, 1);
diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c
index fcc53215bac8..3a39a7f48b77 100644
--- a/drivers/cpuidle/cpuidle-haltpoll.c
+++ b/drivers/cpuidle/cpuidle-haltpoll.c
@@ -108,11 +108,11 @@ static int __init haltpoll_init(void)
if (boot_option_idle_override != IDLE_NO_OVERRIDE)
return -ENODEV;
- cpuidle_poll_state_init(drv);
-
if (!kvm_para_available() || !haltpoll_want())
return -ENODEV;
+ cpuidle_poll_state_init(drv);
+
ret = cpuidle_register_driver(drv);
if (ret < 0)
return ret;
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 0b66e25c0e2d..b7640cfe0020 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -64,6 +64,7 @@ static struct cpuidle_driver intel_idle_driver = {
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = CPUIDLE_STATE_MAX - 1;
static unsigned int disabled_states_mask;
+static unsigned int preferred_states_mask;
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
@@ -121,9 +122,6 @@ static unsigned int mwait_substates __initdata;
* If the local APIC timer is not known to be reliable in the target idle state,
* enable one-shot tick broadcasting for the target CPU before executing MWAIT.
*
- * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to
- * flushing user TLBs.
- *
* Must be called under local_irq_disable().
*/
static __cpuidle int intel_idle(struct cpuidle_device *dev,
@@ -761,6 +759,46 @@ static struct cpuidle_state icx_cstates[] __initdata = {
.enter = NULL }
};
+/*
+ * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
+ * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
+ * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1
+ * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
+ * both C1 and C1E requests end up with C1, so there is effectively no C1E.
+ *
+ * By default we enable C1 and disable C1E by marking it with
+ * 'CPUIDLE_FLAG_UNUSABLE'.
+ */
+static struct cpuidle_state spr_cstates[] __initdata = {
+ {
+ .name = "C1",
+ .desc = "MWAIT 0x00",
+ .flags = MWAIT2flg(0x00),
+ .exit_latency = 1,
+ .target_residency = 1,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C1E",
+ .desc = "MWAIT 0x01",
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE |
+ CPUIDLE_FLAG_UNUSABLE,
+ .exit_latency = 2,
+ .target_residency = 4,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C6",
+ .desc = "MWAIT 0x20",
+ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 290,
+ .target_residency = 800,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .enter = NULL }
+};
+
static struct cpuidle_state atom_cstates[] __initdata = {
{
.name = "C1E",
@@ -1104,6 +1142,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = {
.use_acpi = true,
};
+static const struct idle_cpu idle_cpu_spr __initconst = {
+ .state_table = spr_cstates,
+ .disable_promotion_to_c1e = true,
+ .use_acpi = true,
+};
+
static const struct idle_cpu idle_cpu_avn __initconst = {
.state_table = avn_cstates,
.disable_promotion_to_c1e = true,
@@ -1166,6 +1210,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt),
@@ -1353,6 +1398,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
+static void c1e_promotion_enable(void);
+
/**
* ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
*
@@ -1523,6 +1570,41 @@ static void __init skx_idle_state_table_update(void)
}
}
+/**
+ * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
+ */
+static void __init spr_idle_state_table_update(void)
+{
+ unsigned long long msr;
+
+ /* Check if user prefers C1E over C1. */
+ if (preferred_states_mask & BIT(2)) {
+ if (preferred_states_mask & BIT(1))
+ /* Both can't be enabled, stick to the defaults. */
+ return;
+
+ spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
+ spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
+
+ /* Enable C1E using the "C1E promotion" bit. */
+ c1e_promotion_enable();
+ disable_promotion_to_c1e = false;
+ }
+
+ /*
+ * By default, the C6 state assumes the worst-case scenario of package
+ * C6. However, if PC6 is disabled, we update the numbers to match
+ * core C6.
+ */
+ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
+
+ /* Limit value 2 and above allow for PC6. */
+ if ((msr & 0x7) < 2) {
+ spr_cstates[2].exit_latency = 190;
+ spr_cstates[2].target_residency = 600;
+ }
+}
+
static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
{
unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
@@ -1557,6 +1639,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
case INTEL_FAM6_SKYLAKE_X:
skx_idle_state_table_update();
break;
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ spr_idle_state_table_update();
+ break;
}
for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
@@ -1629,6 +1714,15 @@ static void auto_demotion_disable(void)
wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
}
+static void c1e_promotion_enable(void)
+{
+ unsigned long long msr_bits;
+
+ rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
+ msr_bits |= 0x2;
+ wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
+}
+
static void c1e_promotion_disable(void)
{
unsigned long long msr_bits;
@@ -1798,3 +1892,14 @@ module_param(max_cstate, int, 0444);
*/
module_param_named(states_off, disabled_states_mask, uint, 0444);
MODULE_PARM_DESC(states_off, "Mask of disabled idle states");
+/*
+ * Some platforms come with mutually exclusive C-states, so that if one is
+ * enabled, the other C-states must not be used. Example: C1 and C1E on
+ * Sapphire Rapids platform. This parameter allows for selecting the
+ * preferred C-states among the groups of mutually exclusive C-states - the
+ * selected C-states will be registered, the other C-states from the mutually
+ * exclusive group won't be registered. If the platform has no mutually
+ * exclusive C-states, this parameter has no effect.
+ */
+module_param_named(preferred_cstates, preferred_states_mask, uint, 0444);
+MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 588588cfda48..415f7664b010 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -596,7 +596,7 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state)
int error;
error = drv->suspend(pci_dev, state);
- suspend_report_result(drv->suspend, error);
+ suspend_report_result(dev, drv->suspend, error);
if (error)
return error;
@@ -775,7 +775,7 @@ static int pci_pm_suspend(struct device *dev)
int error;
error = pm->suspend(dev);
- suspend_report_result(pm->suspend, error);
+ suspend_report_result(dev, pm->suspend, error);
if (error)
return error;
@@ -821,7 +821,7 @@ static int pci_pm_suspend_noirq(struct device *dev)
int error;
error = pm->suspend_noirq(dev);
- suspend_report_result(pm->suspend_noirq, error);
+ suspend_report_result(dev, pm->suspend_noirq, error);
if (error)
return error;
@@ -1010,7 +1010,7 @@ static int pci_pm_freeze(struct device *dev)
int error;
error = pm->freeze(dev);
- suspend_report_result(pm->freeze, error);
+ suspend_report_result(dev, pm->freeze, error);
if (error)
return error;
}
@@ -1030,7 +1030,7 @@ static int pci_pm_freeze_noirq(struct device *dev)
int error;
error = pm->freeze_noirq(dev);
- suspend_report_result(pm->freeze_noirq, error);
+ suspend_report_result(dev, pm->freeze_noirq, error);
if (error)
return error;
}
@@ -1116,7 +1116,7 @@ static int pci_pm_poweroff(struct device *dev)
int error;
error = pm->poweroff(dev);
- suspend_report_result(pm->poweroff, error);
+ suspend_report_result(dev, pm->poweroff, error);
if (error)
return error;
}
@@ -1154,7 +1154,7 @@ static int pci_pm_poweroff_noirq(struct device *dev)
int error;
error = pm->poweroff_noirq(dev);
- suspend_report_result(pm->poweroff_noirq, error);
+ suspend_report_result(dev, pm->poweroff_noirq, error);
if (error)
return error;
}
diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index cc6757dfa3f1..c02e7bf643a6 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -171,7 +171,7 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state)
if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) {
error = pnp_drv->driver.pm->suspend(dev);
- suspend_report_result(pnp_drv->driver.pm->suspend, error);
+ suspend_report_result(dev, pnp_drv->driver.pm->suspend, error);
if (error)
return error;
}
diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index 8242e8c5ed77..515e3ceb3393 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -46,6 +46,7 @@ config IDLE_INJECT
config DTPM
bool "Power capping for Dynamic Thermal Power Management (EXPERIMENTAL)"
+ depends on OF
help
This enables support for the power capping for the dynamic
thermal power management userspace engine.
@@ -56,4 +57,11 @@ config DTPM_CPU
help
This enables support for CPU power limitation based on
energy model.
+
+config DTPM_DEVFREQ
+ bool "Add device power capping based on the energy model"
+ depends on DTPM && ENERGY_MODEL
+ help
+ This enables support for device power limitation based on
+ energy model.
endif
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index fabcf388a8d3..494617cdad88 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_DTPM) += dtpm.o
obj-$(CONFIG_DTPM_CPU) += dtpm_cpu.o
+obj-$(CONFIG_DTPM_DEVFREQ) += dtpm_devfreq.o
obj-$(CONFIG_POWERCAP) += powercap_sys.o
obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o
obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o
diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c
index 8cb45f2d3d78..ce920f17f45f 100644
--- a/drivers/powercap/dtpm.c
+++ b/drivers/powercap/dtpm.c
@@ -23,6 +23,9 @@
#include <linux/powercap.h>
#include <linux/slab.h>
#include <linux/mutex.h>
+#include <linux/of.h>
+
+#include "dtpm_subsys.h"
#define DTPM_POWER_LIMIT_FLAG 0
@@ -48,9 +51,7 @@ static int get_max_power_range_uw(struct powercap_zone *pcz, u64 *max_power_uw)
{
struct dtpm *dtpm = to_dtpm(pcz);
- mutex_lock(&dtpm_lock);
*max_power_uw = dtpm->power_max - dtpm->power_min;
- mutex_unlock(&dtpm_lock);
return 0;
}
@@ -80,14 +81,7 @@ static int __get_power_uw(struct dtpm *dtpm, u64 *power_uw)
static int get_power_uw(struct powercap_zone *pcz, u64 *power_uw)
{
- struct dtpm *dtpm = to_dtpm(pcz);
- int ret;
-
- mutex_lock(&dtpm_lock);
- ret = __get_power_uw(dtpm, power_uw);
- mutex_unlock(&dtpm_lock);
-
- return ret;
+ return __get_power_uw(to_dtpm(pcz), power_uw);
}
static void __dtpm_rebalance_weight(struct dtpm *dtpm)
@@ -130,7 +124,16 @@ static void __dtpm_add_power(struct dtpm *dtpm)
}
}
-static int __dtpm_update_power(struct dtpm *dtpm)
+/**
+ * dtpm_update_power - Update the power on the dtpm
+ * @dtpm: a pointer to a dtpm structure to update
+ *
+ * Function to update the power values of the dtpm node specified in
+ * parameter. These new values will be propagated to the tree.
+ *
+ * Return: zero on success, -EINVAL if the values are inconsistent
+ */
+int dtpm_update_power(struct dtpm *dtpm)
{
int ret;
@@ -153,26 +156,6 @@ static int __dtpm_update_power(struct dtpm *dtpm)
}
/**
- * dtpm_update_power - Update the power on the dtpm
- * @dtpm: a pointer to a dtpm structure to update
- *
- * Function to update the power values of the dtpm node specified in
- * parameter. These new values will be propagated to the tree.
- *
- * Return: zero on success, -EINVAL if the values are inconsistent
- */
-int dtpm_update_power(struct dtpm *dtpm)
-{
- int ret;
-
- mutex_lock(&dtpm_lock);
- ret = __dtpm_update_power(dtpm);
- mutex_unlock(&dtpm_lock);
-
- return ret;
-}
-
-/**
* dtpm_release_zone - Cleanup when the node is released
* @pcz: a pointer to a powercap_zone structure
*
@@ -188,48 +171,28 @@ int dtpm_release_zone(struct powercap_zone *pcz)
struct dtpm *dtpm = to_dtpm(pcz);
struct dtpm *parent = dtpm->parent;
- mutex_lock(&dtpm_lock);
-
- if (!list_empty(&dtpm->children)) {
- mutex_unlock(&dtpm_lock);
+ if (!list_empty(&dtpm->children))
return -EBUSY;
- }
if (parent)
list_del(&dtpm->sibling);
__dtpm_sub_power(dtpm);
- mutex_unlock(&dtpm_lock);
-
if (dtpm->ops)
dtpm->ops->release(dtpm);
+ else
+ kfree(dtpm);
- if (root == dtpm)
- root = NULL;
-
- kfree(dtpm);
-
- return 0;
-}
-
-static int __get_power_limit_uw(struct dtpm *dtpm, int cid, u64 *power_limit)
-{
- *power_limit = dtpm->power_limit;
return 0;
}
static int get_power_limit_uw(struct powercap_zone *pcz,
int cid, u64 *power_limit)
{
- struct dtpm *dtpm = to_dtpm(pcz);
- int ret;
-
- mutex_lock(&dtpm_lock);
- ret = __get_power_limit_uw(dtpm, cid, power_limit);
- mutex_unlock(&dtpm_lock);
-
- return ret;
+ *power_limit = to_dtpm(pcz)->power_limit;
+
+ return 0;
}
/*
@@ -289,7 +252,7 @@ static int __set_power_limit_uw(struct dtpm *dtpm, int cid, u64 power_limit)
ret = __set_power_limit_uw(child, cid, power);
if (!ret)
- ret = __get_power_limit_uw(child, cid, &power);
+ ret = get_power_limit_uw(&child->zone, cid, &power);
if (ret)
break;
@@ -307,8 +270,6 @@ static int set_power_limit_uw(struct powercap_zone *pcz,
struct dtpm *dtpm = to_dtpm(pcz);
int ret;
- mutex_lock(&dtpm_lock);
-
/*
* Don't allow values outside of the power range previously
* set when initializing the power numbers.
@@ -320,8 +281,6 @@ static int set_power_limit_uw(struct powercap_zone *pcz,
pr_debug("%s: power limit: %llu uW, power max: %llu uW\n",
dtpm->zone.name, dtpm->power_limit, dtpm->power_max);
- mutex_unlock(&dtpm_lock);
-
return ret;
}
@@ -332,11 +291,7 @@ static const char *get_constraint_name(struct powercap_zone *pcz, int cid)
static int get_max_power_uw(struct powercap_zone *pcz, int id, u64 *max_power)
{
- struct dtpm *dtpm = to_dtpm(pcz);
-
- mutex_lock(&dtpm_lock);
- *max_power = dtpm->power_max;
- mutex_unlock(&dtpm_lock);
+ *max_power = to_dtpm(pcz)->power_max;
return 0;
}
@@ -439,8 +394,6 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent)
if (IS_ERR(pcz))
return PTR_ERR(pcz);
- mutex_lock(&dtpm_lock);
-
if (parent) {
list_add_tail(&dtpm->sibling, &parent->children);
dtpm->parent = parent;
@@ -456,19 +409,253 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent)
pr_debug("Registered dtpm node '%s' / %llu-%llu uW, \n",
dtpm->zone.name, dtpm->power_min, dtpm->power_max);
- mutex_unlock(&dtpm_lock);
+ return 0;
+}
+
+static struct dtpm *dtpm_setup_virtual(const struct dtpm_node *hierarchy,
+ struct dtpm *parent)
+{
+ struct dtpm *dtpm;
+ int ret;
+
+ dtpm = kzalloc(sizeof(*dtpm), GFP_KERNEL);
+ if (!dtpm)
+ return ERR_PTR(-ENOMEM);
+ dtpm_init(dtpm, NULL);
+
+ ret = dtpm_register(hierarchy->name, dtpm, parent);
+ if (ret) {
+ pr_err("Failed to register dtpm node '%s': %d\n",
+ hierarchy->name, ret);
+ kfree(dtpm);
+ return ERR_PTR(ret);
+ }
+
+ return dtpm;
+}
+
+static struct dtpm *dtpm_setup_dt(const struct dtpm_node *hierarchy,
+ struct dtpm *parent)
+{
+ struct device_node *np;
+ int i, ret;
+
+ np = of_find_node_by_path(hierarchy->name);
+ if (!np) {
+ pr_err("Failed to find '%s'\n", hierarchy->name);
+ return ERR_PTR(-ENXIO);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(dtpm_subsys); i++) {
+
+ if (!dtpm_subsys[i]->setup)
+ continue;
+
+ ret = dtpm_subsys[i]->setup(parent, np);
+ if (ret) {
+ pr_err("Failed to setup '%s': %d\n", dtpm_subsys[i]->name, ret);
+ of_node_put(np);
+ return ERR_PTR(ret);
+ }
+ }
+
+ of_node_put(np);
+
+ /*
+ * By returning a NULL pointer, we let know the caller there
+ * is no child for us as we are a leaf of the tree
+ */
+ return NULL;
+}
+
+typedef struct dtpm * (*dtpm_node_callback_t)(const struct dtpm_node *, struct dtpm *);
+
+static dtpm_node_callback_t dtpm_node_callback[] = {
+ [DTPM_NODE_VIRTUAL] = dtpm_setup_virtual,
+ [DTPM_NODE_DT] = dtpm_setup_dt,
+};
+
+static int dtpm_for_each_child(const struct dtpm_node *hierarchy,
+ const struct dtpm_node *it, struct dtpm *parent)
+{
+ struct dtpm *dtpm;
+ int i, ret;
+
+ for (i = 0; hierarchy[i].name; i++) {
+
+ if (hierarchy[i].parent != it)
+ continue;
+
+ dtpm = dtpm_node_callback[hierarchy[i].type](&hierarchy[i], parent);
+
+ /*
+ * A NULL pointer means there is no children, hence we
+ * continue without going deeper in the recursivity.
+ */
+ if (!dtpm)
+ continue;
+
+ /*
+ * There are multiple reasons why the callback could
+ * fail. The generic glue is abstracting the backend
+ * and therefore it is not possible to report back or
+ * take a decision based on the error. In any case,
+ * if this call fails, it is not critical in the
+ * hierarchy creation, we can assume the underlying
+ * service is not found, so we continue without this
+ * branch in the tree but with a warning to log the
+ * information the node was not created.
+ */
+ if (IS_ERR(dtpm)) {
+ pr_warn("Failed to create '%s' in the hierarchy\n",
+ hierarchy[i].name);
+ continue;
+ }
+
+ ret = dtpm_for_each_child(hierarchy, &hierarchy[i], dtpm);
+ if (ret)
+ return ret;
+ }
return 0;
}
-static int __init init_dtpm(void)
+/**
+ * dtpm_create_hierarchy - Create the dtpm hierarchy
+ * @hierarchy: An array of struct dtpm_node describing the hierarchy
+ *
+ * The function is called by the platform specific code with the
+ * description of the different node in the hierarchy. It creates the
+ * tree in the sysfs filesystem under the powercap dtpm entry.
+ *
+ * The expected tree has the format:
+ *
+ * struct dtpm_node hierarchy[] = {
+ * [0] { .name = "topmost", type = DTPM_NODE_VIRTUAL },
+ * [1] { .name = "package", .type = DTPM_NODE_VIRTUAL, .parent = &hierarchy[0] },
+ * [2] { .name = "/cpus/cpu0", .type = DTPM_NODE_DT, .parent = &hierarchy[1] },
+ * [3] { .name = "/cpus/cpu1", .type = DTPM_NODE_DT, .parent = &hierarchy[1] },
+ * [4] { .name = "/cpus/cpu2", .type = DTPM_NODE_DT, .parent = &hierarchy[1] },
+ * [5] { .name = "/cpus/cpu3", .type = DTPM_NODE_DT, .parent = &hierarchy[1] },
+ * [6] { }
+ * };
+ *
+ * The last element is always an empty one and marks the end of the
+ * array.
+ *
+ * Return: zero on success, a negative value in case of error. Errors
+ * are reported back from the underlying functions.
+ */
+int dtpm_create_hierarchy(struct of_device_id *dtpm_match_table)
{
+ const struct of_device_id *match;
+ const struct dtpm_node *hierarchy;
+ struct device_node *np;
+ int i, ret;
+
+ mutex_lock(&dtpm_lock);
+
+ if (pct) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
pct = powercap_register_control_type(NULL, "dtpm", NULL);
if (IS_ERR(pct)) {
pr_err("Failed to register control type\n");
- return PTR_ERR(pct);
+ ret = PTR_ERR(pct);
+ goto out_pct;
}
+ ret = -ENODEV;
+ np = of_find_node_by_path("/");
+ if (!np)
+ goto out_err;
+
+ match = of_match_node(dtpm_match_table, np);
+
+ of_node_put(np);
+
+ if (!match)
+ goto out_err;
+
+ hierarchy = match->data;
+ if (!hierarchy) {
+ ret = -EFAULT;
+ goto out_err;
+ }
+
+ ret = dtpm_for_each_child(hierarchy, NULL, NULL);
+ if (ret)
+ goto out_err;
+
+ for (i = 0; i < ARRAY_SIZE(dtpm_subsys); i++) {
+
+ if (!dtpm_subsys[i]->init)
+ continue;
+
+ ret = dtpm_subsys[i]->init();
+ if (ret)
+ pr_info("Failed to initialize '%s': %d",
+ dtpm_subsys[i]->name, ret);
+ }
+
+ mutex_unlock(&dtpm_lock);
+
return 0;
+
+out_err:
+ powercap_unregister_control_type(pct);
+out_pct:
+ pct = NULL;
+out_unlock:
+ mutex_unlock(&dtpm_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dtpm_create_hierarchy);
+
+static void __dtpm_destroy_hierarchy(struct dtpm *dtpm)
+{
+ struct dtpm *child, *aux;
+
+ list_for_each_entry_safe(child, aux, &dtpm->children, sibling)
+ __dtpm_destroy_hierarchy(child);
+
+ /*
+ * At this point, we know all children were removed from the
+ * recursive call before
+ */
+ dtpm_unregister(dtpm);
+}
+
+void dtpm_destroy_hierarchy(void)
+{
+ int i;
+
+ mutex_lock(&dtpm_lock);
+
+ if (!pct)
+ goto out_unlock;
+
+ __dtpm_destroy_hierarchy(root);
+
+
+ for (i = 0; i < ARRAY_SIZE(dtpm_subsys); i++) {
+
+ if (!dtpm_subsys[i]->exit)
+ continue;
+
+ dtpm_subsys[i]->exit();
+ }
+
+ powercap_unregister_control_type(pct);
+
+ pct = NULL;
+
+ root = NULL;
+
+out_unlock:
+ mutex_unlock(&dtpm_lock);
}
-late_initcall(init_dtpm);
+EXPORT_SYMBOL_GPL(dtpm_destroy_hierarchy);
diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
index b740866b228d..bca2f912d349 100644
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -21,6 +21,7 @@
#include <linux/cpuhotplug.h>
#include <linux/dtpm.h>
#include <linux/energy_model.h>
+#include <linux/of.h>
#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/units.h>
@@ -150,10 +151,17 @@ static int update_pd_power_uw(struct dtpm *dtpm)
static void pd_release(struct dtpm *dtpm)
{
struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm);
+ struct cpufreq_policy *policy;
if (freq_qos_request_active(&dtpm_cpu->qos_req))
freq_qos_remove_request(&dtpm_cpu->qos_req);
+ policy = cpufreq_cpu_get(dtpm_cpu->cpu);
+ if (policy) {
+ for_each_cpu(dtpm_cpu->cpu, policy->related_cpus)
+ per_cpu(dtpm_per_cpu, dtpm_cpu->cpu) = NULL;
+ }
+
kfree(dtpm_cpu);
}
@@ -178,11 +186,26 @@ static int cpuhp_dtpm_cpu_offline(unsigned int cpu)
static int cpuhp_dtpm_cpu_online(unsigned int cpu)
{
struct dtpm_cpu *dtpm_cpu;
+
+ dtpm_cpu = per_cpu(dtpm_per_cpu, cpu);
+ if (dtpm_cpu)
+ return dtpm_update_power(&dtpm_cpu->dtpm);
+
+ return 0;
+}
+
+static int __dtpm_cpu_setup(int cpu, struct dtpm *parent)
+{
+ struct dtpm_cpu *dtpm_cpu;
struct cpufreq_policy *policy;
struct em_perf_domain *pd;
char name[CPUFREQ_NAME_LEN];
int ret = -ENOMEM;
+ dtpm_cpu = per_cpu(dtpm_per_cpu, cpu);
+ if (dtpm_cpu)
+ return 0;
+
policy = cpufreq_cpu_get(cpu);
if (!policy)
return 0;
@@ -191,10 +214,6 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu)
if (!pd)
return -EINVAL;
- dtpm_cpu = per_cpu(dtpm_per_cpu, cpu);
- if (dtpm_cpu)
- return dtpm_update_power(&dtpm_cpu->dtpm);
-
dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL);
if (!dtpm_cpu)
return -ENOMEM;
@@ -207,7 +226,7 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu)
snprintf(name, sizeof(name), "cpu%d-cpufreq", dtpm_cpu->cpu);
- ret = dtpm_register(name, &dtpm_cpu->dtpm, NULL);
+ ret = dtpm_register(name, &dtpm_cpu->dtpm, parent);
if (ret)
goto out_kfree_dtpm_cpu;
@@ -231,7 +250,18 @@ out_kfree_dtpm_cpu:
return ret;
}
-static int __init dtpm_cpu_init(void)
+static int dtpm_cpu_setup(struct dtpm *dtpm, struct device_node *np)
+{
+ int cpu;
+
+ cpu = of_cpu_node_to_id(np);
+ if (cpu < 0)
+ return 0;
+
+ return __dtpm_cpu_setup(cpu, dtpm);
+}
+
+static int dtpm_cpu_init(void)
{
int ret;
@@ -269,4 +299,15 @@ static int __init dtpm_cpu_init(void)
return 0;
}
-DTPM_DECLARE(dtpm_cpu, dtpm_cpu_init);
+static void dtpm_cpu_exit(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_AP_ONLINE_DYN);
+ cpuhp_remove_state_nocalls(CPUHP_AP_DTPM_CPU_DEAD);
+}
+
+struct dtpm_subsys_ops dtpm_cpu_ops = {
+ .name = KBUILD_MODNAME,
+ .init = dtpm_cpu_init,
+ .exit = dtpm_cpu_exit,
+ .setup = dtpm_cpu_setup,
+};
diff --git a/drivers/powercap/dtpm_devfreq.c b/drivers/powercap/dtpm_devfreq.c
new file mode 100644
index 000000000000..91276761a31d
--- /dev/null
+++ b/drivers/powercap/dtpm_devfreq.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * The devfreq device combined with the energy model and the load can
+ * give an estimation of the power consumption as well as limiting the
+ * power.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpumask.h>
+#include <linux/devfreq.h>
+#include <linux/dtpm.h>
+#include <linux/energy_model.h>
+#include <linux/of.h>
+#include <linux/pm_qos.h>
+#include <linux/slab.h>
+#include <linux/units.h>
+
+struct dtpm_devfreq {
+ struct dtpm dtpm;
+ struct dev_pm_qos_request qos_req;
+ struct devfreq *devfreq;
+};
+
+static struct dtpm_devfreq *to_dtpm_devfreq(struct dtpm *dtpm)
+{
+ return container_of(dtpm, struct dtpm_devfreq, dtpm);
+}
+
+static int update_pd_power_uw(struct dtpm *dtpm)
+{
+ struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm);
+ struct devfreq *devfreq = dtpm_devfreq->devfreq;
+ struct device *dev = devfreq->dev.parent;
+ struct em_perf_domain *pd = em_pd_get(dev);
+
+ dtpm->power_min = pd->table[0].power;
+ dtpm->power_min *= MICROWATT_PER_MILLIWATT;
+
+ dtpm->power_max = pd->table[pd->nr_perf_states - 1].power;
+ dtpm->power_max *= MICROWATT_PER_MILLIWATT;
+
+ return 0;
+}
+
+static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
+{
+ struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm);
+ struct devfreq *devfreq = dtpm_devfreq->devfreq;
+ struct device *dev = devfreq->dev.parent;
+ struct em_perf_domain *pd = em_pd_get(dev);
+ unsigned long freq;
+ u64 power;
+ int i;
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+
+ power = pd->table[i].power * MICROWATT_PER_MILLIWATT;
+ if (power > power_limit)
+ break;
+ }
+
+ freq = pd->table[i - 1].frequency;
+
+ dev_pm_qos_update_request(&dtpm_devfreq->qos_req, freq);
+
+ power_limit = pd->table[i - 1].power * MICROWATT_PER_MILLIWATT;
+
+ return power_limit;
+}
+
+static void _normalize_load(struct devfreq_dev_status *status)
+{
+ if (status->total_time > 0xfffff) {
+ status->total_time >>= 10;
+ status->busy_time >>= 10;
+ }
+
+ status->busy_time <<= 10;
+ status->busy_time /= status->total_time ? : 1;
+
+ status->busy_time = status->busy_time ? : 1;
+ status->total_time = 1024;
+}
+
+static u64 get_pd_power_uw(struct dtpm *dtpm)
+{
+ struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm);
+ struct devfreq *devfreq = dtpm_devfreq->devfreq;
+ struct device *dev = devfreq->dev.parent;
+ struct em_perf_domain *pd = em_pd_get(dev);
+ struct devfreq_dev_status status;
+ unsigned long freq;
+ u64 power;
+ int i;
+
+ mutex_lock(&devfreq->lock);
+ status = devfreq->last_status;
+ mutex_unlock(&devfreq->lock);
+
+ freq = DIV_ROUND_UP(status.current_frequency, HZ_PER_KHZ);
+ _normalize_load(&status);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+
+ if (pd->table[i].frequency < freq)
+ continue;
+
+ power = pd->table[i].power * MICROWATT_PER_MILLIWATT;
+ power *= status.busy_time;
+ power >>= 10;
+
+ return power;
+ }
+
+ return 0;
+}
+
+static void pd_release(struct dtpm *dtpm)
+{
+ struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm);
+
+ if (dev_pm_qos_request_active(&dtpm_devfreq->qos_req))
+ dev_pm_qos_remove_request(&dtpm_devfreq->qos_req);
+
+ kfree(dtpm_devfreq);
+}
+
+static struct dtpm_ops dtpm_ops = {
+ .set_power_uw = set_pd_power_limit,
+ .get_power_uw = get_pd_power_uw,
+ .update_power_uw = update_pd_power_uw,
+ .release = pd_release,
+};
+
+static int __dtpm_devfreq_setup(struct devfreq *devfreq, struct dtpm *parent)
+{
+ struct device *dev = devfreq->dev.parent;
+ struct dtpm_devfreq *dtpm_devfreq;
+ struct em_perf_domain *pd;
+ int ret = -ENOMEM;
+
+ pd = em_pd_get(dev);
+ if (!pd) {
+ ret = dev_pm_opp_of_register_em(dev, NULL);
+ if (ret) {
+ pr_err("No energy model available for '%s'\n", dev_name(dev));
+ return -EINVAL;
+ }
+ }
+
+ dtpm_devfreq = kzalloc(sizeof(*dtpm_devfreq), GFP_KERNEL);
+ if (!dtpm_devfreq)
+ return -ENOMEM;
+
+ dtpm_init(&dtpm_devfreq->dtpm, &dtpm_ops);
+
+ dtpm_devfreq->devfreq = devfreq;
+
+ ret = dtpm_register(dev_name(dev), &dtpm_devfreq->dtpm, parent);
+ if (ret) {
+ pr_err("Failed to register '%s': %d\n", dev_name(dev), ret);
+ kfree(dtpm_devfreq);
+ return ret;
+ }
+
+ ret = dev_pm_qos_add_request(dev, &dtpm_devfreq->qos_req,
+ DEV_PM_QOS_MAX_FREQUENCY,
+ PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
+ if (ret) {
+ pr_err("Failed to add QoS request: %d\n", ret);
+ goto out_dtpm_unregister;
+ }
+
+ dtpm_update_power(&dtpm_devfreq->dtpm);
+
+ return 0;
+
+out_dtpm_unregister:
+ dtpm_unregister(&dtpm_devfreq->dtpm);
+
+ return ret;
+}
+
+static int dtpm_devfreq_setup(struct dtpm *dtpm, struct device_node *np)
+{
+ struct devfreq *devfreq;
+
+ devfreq = devfreq_get_devfreq_by_node(np);
+ if (IS_ERR(devfreq))
+ return 0;
+
+ return __dtpm_devfreq_setup(devfreq, dtpm);
+}
+
+struct dtpm_subsys_ops dtpm_devfreq_ops = {
+ .name = KBUILD_MODNAME,
+ .setup = dtpm_devfreq_setup,
+};
diff --git a/drivers/powercap/dtpm_subsys.h b/drivers/powercap/dtpm_subsys.h
new file mode 100644
index 000000000000..db1712938a96
--- /dev/null
+++ b/drivers/powercap/dtpm_subsys.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Linaro Ltd
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+#ifndef ___DTPM_SUBSYS_H__
+#define ___DTPM_SUBSYS_H__
+
+extern struct dtpm_subsys_ops dtpm_cpu_ops;
+extern struct dtpm_subsys_ops dtpm_devfreq_ops;
+
+struct dtpm_subsys_ops *dtpm_subsys[] = {
+#ifdef CONFIG_DTPM_CPU
+ &dtpm_cpu_ops,
+#endif
+#ifdef CONFIG_DTPM_DEVFREQ
+ &dtpm_devfreq_ops,
+#endif
+};
+
+#endif
diff --git a/drivers/soc/rockchip/Kconfig b/drivers/soc/rockchip/Kconfig
index 25eb2c1e31bb..156ac0e0c8fe 100644
--- a/drivers/soc/rockchip/Kconfig
+++ b/drivers/soc/rockchip/Kconfig
@@ -34,4 +34,12 @@ config ROCKCHIP_PM_DOMAINS
If unsure, say N.
+config ROCKCHIP_DTPM
+ tristate "Rockchip DTPM hierarchy"
+ depends on DTPM && m
+ help
+ Describe the hierarchy for the Dynamic Thermal Power
+ Management tree on this platform. That will create all the
+ power capping capable devices.
+
endif
diff --git a/drivers/soc/rockchip/Makefile b/drivers/soc/rockchip/Makefile
index 875032f7344e..05f31a4e743c 100644
--- a/drivers/soc/rockchip/Makefile
+++ b/drivers/soc/rockchip/Makefile
@@ -5,3 +5,4 @@
obj-$(CONFIG_ROCKCHIP_GRF) += grf.o
obj-$(CONFIG_ROCKCHIP_IODOMAIN) += io-domain.o
obj-$(CONFIG_ROCKCHIP_PM_DOMAINS) += pm_domains.o
+obj-$(CONFIG_ROCKCHIP_DTPM) += dtpm.o
diff --git a/drivers/soc/rockchip/dtpm.c b/drivers/soc/rockchip/dtpm.c
new file mode 100644
index 000000000000..5a23784b5221
--- /dev/null
+++ b/drivers/soc/rockchip/dtpm.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * DTPM hierarchy description
+ */
+#include <linux/dtpm.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
+static struct dtpm_node __initdata rk3399_hierarchy[] = {
+ [0]{ .name = "rk3399",
+ .type = DTPM_NODE_VIRTUAL },
+ [1]{ .name = "package",
+ .type = DTPM_NODE_VIRTUAL,
+ .parent = &rk3399_hierarchy[0] },
+ [2]{ .name = "/cpus/cpu@0",
+ .type = DTPM_NODE_DT,
+ .parent = &rk3399_hierarchy[1] },
+ [3]{ .name = "/cpus/cpu@1",
+ .type = DTPM_NODE_DT,
+ .parent = &rk3399_hierarchy[1] },
+ [4]{ .name = "/cpus/cpu@2",
+ .type = DTPM_NODE_DT,
+ .parent = &rk3399_hierarchy[1] },
+ [5]{ .name = "/cpus/cpu@3",
+ .type = DTPM_NODE_DT,
+ .parent = &rk3399_hierarchy[1] },
+ [6]{ .name = "/cpus/cpu@100",
+ .type = DTPM_NODE_DT,
+ .parent = &rk3399_hierarchy[1] },
+ [7]{ .name = "/cpus/cpu@101",
+ .type = DTPM_NODE_DT,
+ .parent = &rk3399_hierarchy[1] },
+ [8]{ .name = "/gpu@ff9a0000",
+ .type = DTPM_NODE_DT,
+ .parent = &rk3399_hierarchy[1] },
+ [9]{ /* sentinel */ }
+};
+
+static struct of_device_id __initdata rockchip_dtpm_match_table[] = {
+ { .compatible = "rockchip,rk3399", .data = rk3399_hierarchy },
+ {},
+};
+
+static int __init rockchip_dtpm_init(void)
+{
+ return dtpm_create_hierarchy(rockchip_dtpm_match_table);
+}
+module_init(rockchip_dtpm_init);
+
+static void __exit rockchip_dtpm_exit(void)
+{
+ return dtpm_destroy_hierarchy();
+}
+module_exit(rockchip_dtpm_exit);
+
+MODULE_SOFTDEP("pre: panfrost cpufreq-dt");
+MODULE_DESCRIPTION("Rockchip DTPM driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:dtpm");
+MODULE_AUTHOR("Daniel Lezcano <daniel.lezcano@kernel.org");
diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c
index d630cccd2e6e..dd44e37a454a 100644
--- a/drivers/usb/core/hcd-pci.c
+++ b/drivers/usb/core/hcd-pci.c
@@ -446,7 +446,7 @@ static int suspend_common(struct device *dev, bool do_wakeup)
HCD_WAKEUP_PENDING(hcd->shared_hcd))
return -EBUSY;
retval = hcd->driver->pci_suspend(hcd, do_wakeup);
- suspend_report_result(hcd->driver->pci_suspend, retval);
+ suspend_report_result(dev, hcd->driver->pci_suspend, retval);
/* Check again in case wakeup raced with pci_suspend */
if ((retval == 0 && do_wakeup && HCD_WAKEUP_PENDING(hcd)) ||
@@ -556,7 +556,7 @@ static int hcd_pci_suspend_noirq(struct device *dev)
dev_dbg(dev, "--> PCI %s\n",
pci_power_name(pci_dev->current_state));
} else {
- suspend_report_result(pci_prepare_to_sleep, retval);
+ suspend_report_result(dev, pci_prepare_to_sleep, retval);
return retval;
}