summaryrefslogtreecommitdiff
path: root/drivers/powercap
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/powercap')
-rw-r--r--drivers/powercap/dtpm.c2
-rw-r--r--drivers/powercap/dtpm_cpu.c51
-rw-r--r--drivers/powercap/dtpm_devfreq.c36
-rw-r--r--drivers/powercap/idle_inject.c18
-rw-r--r--drivers/powercap/intel_rapl_common.c796
-rw-r--r--drivers/powercap/intel_rapl_msr.c29
-rw-r--r--drivers/powercap/intel_rapl_tpmi.c37
-rw-r--r--drivers/powercap/powercap_sys.c3
8 files changed, 835 insertions, 137 deletions
diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c
index ce920f17f45f..f390665743c4 100644
--- a/drivers/powercap/dtpm.c
+++ b/drivers/powercap/dtpm.c
@@ -522,7 +522,7 @@ static int dtpm_for_each_child(const struct dtpm_node *hierarchy,
/**
* dtpm_create_hierarchy - Create the dtpm hierarchy
- * @hierarchy: An array of struct dtpm_node describing the hierarchy
+ * @dtpm_match_table: Pointer to the array of device ID structures
*
* The function is called by the platform specific code with the
* description of the different node in the hierarchy. It creates the
diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
index 9193c3b8edeb..6b6f51b21550 100644
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -42,28 +42,29 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
{
struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm);
struct em_perf_domain *pd = em_cpu_get(dtpm_cpu->cpu);
- struct cpumask cpus;
+ struct em_perf_state *table;
unsigned long freq;
u64 power;
int i, nr_cpus;
- cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus));
- nr_cpus = cpumask_weight(&cpus);
+ nr_cpus = cpumask_weight_and(cpu_online_mask, to_cpumask(pd->cpus));
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
for (i = 0; i < pd->nr_perf_states; i++) {
- power = pd->table[i].power * nr_cpus;
+ power = table[i].power * nr_cpus;
if (power > power_limit)
break;
}
- freq = pd->table[i - 1].frequency;
+ freq = table[i - 1].frequency;
+ power_limit = table[i - 1].power * nr_cpus;
+ rcu_read_unlock();
freq_qos_update_request(&dtpm_cpu->qos_req, freq);
- power_limit = pd->table[i - 1].power * nr_cpus;
-
return power_limit;
}
@@ -87,9 +88,11 @@ static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power)
static u64 get_pd_power_uw(struct dtpm *dtpm)
{
struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm);
+ struct em_perf_state *table;
struct em_perf_domain *pd;
struct cpumask *pd_mask;
unsigned long freq;
+ u64 power = 0;
int i;
pd = em_cpu_get(dtpm_cpu->cpu);
@@ -98,33 +101,41 @@ static u64 get_pd_power_uw(struct dtpm *dtpm)
freq = cpufreq_quick_get(dtpm_cpu->cpu);
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
for (i = 0; i < pd->nr_perf_states; i++) {
- if (pd->table[i].frequency < freq)
+ if (table[i].frequency < freq)
continue;
- return scale_pd_power_uw(pd_mask, pd->table[i].power);
+ power = scale_pd_power_uw(pd_mask, table[i].power);
+ break;
}
+ rcu_read_unlock();
- return 0;
+ return power;
}
static int update_pd_power_uw(struct dtpm *dtpm)
{
struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm);
struct em_perf_domain *em = em_cpu_get(dtpm_cpu->cpu);
- struct cpumask cpus;
+ struct em_perf_state *table;
int nr_cpus;
- cpumask_and(&cpus, cpu_online_mask, to_cpumask(em->cpus));
- nr_cpus = cpumask_weight(&cpus);
+ nr_cpus = cpumask_weight_and(cpu_online_mask, to_cpumask(em->cpus));
- dtpm->power_min = em->table[0].power;
+ rcu_read_lock();
+ table = em_perf_state_from_pd(em);
+
+ dtpm->power_min = table[0].power;
dtpm->power_min *= nr_cpus;
- dtpm->power_max = em->table[em->nr_perf_states - 1].power;
+ dtpm->power_max = table[em->nr_perf_states - 1].power;
dtpm->power_max *= nr_cpus;
+ rcu_read_unlock();
+
return 0;
}
@@ -143,7 +154,7 @@ static void pd_release(struct dtpm *dtpm)
cpufreq_cpu_put(policy);
}
-
+
kfree(dtpm_cpu);
}
@@ -180,6 +191,7 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent)
{
struct dtpm_cpu *dtpm_cpu;
struct cpufreq_policy *policy;
+ struct em_perf_state *table;
struct em_perf_domain *pd;
char name[CPUFREQ_NAME_LEN];
int ret = -ENOMEM;
@@ -216,10 +228,13 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent)
if (ret)
goto out_kfree_dtpm_cpu;
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
ret = freq_qos_add_request(&policy->constraints,
&dtpm_cpu->qos_req, FREQ_QOS_MAX,
- pd->table[pd->nr_perf_states - 1].frequency);
- if (ret)
+ table[pd->nr_perf_states - 1].frequency);
+ rcu_read_unlock();
+ if (ret < 0)
goto out_dtpm_unregister;
cpufreq_cpu_put(policy);
diff --git a/drivers/powercap/dtpm_devfreq.c b/drivers/powercap/dtpm_devfreq.c
index 612c3b59dd5b..d1dff6ccab12 100644
--- a/drivers/powercap/dtpm_devfreq.c
+++ b/drivers/powercap/dtpm_devfreq.c
@@ -37,11 +37,16 @@ static int update_pd_power_uw(struct dtpm *dtpm)
struct devfreq *devfreq = dtpm_devfreq->devfreq;
struct device *dev = devfreq->dev.parent;
struct em_perf_domain *pd = em_pd_get(dev);
+ struct em_perf_state *table;
- dtpm->power_min = pd->table[0].power;
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
- dtpm->power_max = pd->table[pd->nr_perf_states - 1].power;
+ dtpm->power_min = table[0].power;
+ dtpm->power_max = table[pd->nr_perf_states - 1].power;
+
+ rcu_read_unlock();
return 0;
}
@@ -51,20 +56,23 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
struct devfreq *devfreq = dtpm_devfreq->devfreq;
struct device *dev = devfreq->dev.parent;
struct em_perf_domain *pd = em_pd_get(dev);
+ struct em_perf_state *table;
unsigned long freq;
int i;
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
for (i = 0; i < pd->nr_perf_states; i++) {
- if (pd->table[i].power > power_limit)
+ if (table[i].power > power_limit)
break;
}
- freq = pd->table[i - 1].frequency;
+ freq = table[i - 1].frequency;
+ power_limit = table[i - 1].power;
+ rcu_read_unlock();
dev_pm_qos_update_request(&dtpm_devfreq->qos_req, freq);
- power_limit = pd->table[i - 1].power;
-
return power_limit;
}
@@ -89,8 +97,9 @@ static u64 get_pd_power_uw(struct dtpm *dtpm)
struct device *dev = devfreq->dev.parent;
struct em_perf_domain *pd = em_pd_get(dev);
struct devfreq_dev_status status;
+ struct em_perf_state *table;
unsigned long freq;
- u64 power;
+ u64 power = 0;
int i;
mutex_lock(&devfreq->lock);
@@ -100,19 +109,22 @@ static u64 get_pd_power_uw(struct dtpm *dtpm)
freq = DIV_ROUND_UP(status.current_frequency, HZ_PER_KHZ);
_normalize_load(&status);
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
for (i = 0; i < pd->nr_perf_states; i++) {
- if (pd->table[i].frequency < freq)
+ if (table[i].frequency < freq)
continue;
- power = pd->table[i].power;
+ power = table[i].power;
power *= status.busy_time;
power >>= 10;
- return power;
+ break;
}
+ rcu_read_unlock();
- return 0;
+ return power;
}
static void pd_release(struct dtpm *dtpm)
@@ -166,7 +178,7 @@ static int __dtpm_devfreq_setup(struct devfreq *devfreq, struct dtpm *parent)
ret = dev_pm_qos_add_request(dev, &dtpm_devfreq->qos_req,
DEV_PM_QOS_MAX_FREQUENCY,
PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
- if (ret) {
+ if (ret < 0) {
pr_err("Failed to add QoS request: %d\n", ret);
goto out_dtpm_unregister;
}
diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c
index e18a2cc4e46a..04c212953ded 100644
--- a/drivers/powercap/idle_inject.c
+++ b/drivers/powercap/idle_inject.c
@@ -127,7 +127,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
struct idle_inject_device *ii_dev =
container_of(timer, struct idle_inject_device, timer);
- if (!ii_dev->update || (ii_dev->update && ii_dev->update()))
+ if (!ii_dev->update || ii_dev->update())
idle_inject_wakeup(ii_dev);
duration_us = READ_ONCE(ii_dev->run_duration_us);
@@ -179,7 +179,7 @@ void idle_inject_set_duration(struct idle_inject_device *ii_dev,
if (!run_duration_us)
pr_debug("CPU is forced to 100 percent idle\n");
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_set_duration, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_set_duration, "IDLE_INJECT");
/**
* idle_inject_get_duration - idle and run duration retrieval helper
@@ -194,7 +194,7 @@ void idle_inject_get_duration(struct idle_inject_device *ii_dev,
*run_duration_us = READ_ONCE(ii_dev->run_duration_us);
*idle_duration_us = READ_ONCE(ii_dev->idle_duration_us);
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_get_duration, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_get_duration, "IDLE_INJECT");
/**
* idle_inject_set_latency - set the maximum latency allowed
@@ -206,7 +206,7 @@ void idle_inject_set_latency(struct idle_inject_device *ii_dev,
{
WRITE_ONCE(ii_dev->latency_us, latency_us);
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_set_latency, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_set_latency, "IDLE_INJECT");
/**
* idle_inject_start - start idle injections
@@ -238,7 +238,7 @@ int idle_inject_start(struct idle_inject_device *ii_dev)
return 0;
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_start, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_start, "IDLE_INJECT");
/**
* idle_inject_stop - stops idle injections
@@ -285,7 +285,7 @@ void idle_inject_stop(struct idle_inject_device *ii_dev)
cpu_hotplug_enable();
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_stop, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_stop, "IDLE_INJECT");
/**
* idle_inject_setup - prepare the current task for idle injection
@@ -367,7 +367,7 @@ out_rollback:
return NULL;
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_register_full, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_register_full, "IDLE_INJECT");
/**
* idle_inject_register - initialize idle injection on a set of CPUs
@@ -384,7 +384,7 @@ struct idle_inject_device *idle_inject_register(struct cpumask *cpumask)
{
return idle_inject_register_full(cpumask, NULL);
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_register, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_register, "IDLE_INJECT");
/**
* idle_inject_unregister - unregister idle injection control device
@@ -405,7 +405,7 @@ void idle_inject_unregister(struct idle_inject_device *ii_dev)
kfree(ii_dev);
}
-EXPORT_SYMBOL_NS_GPL(idle_inject_unregister, IDLE_INJECT);
+EXPORT_SYMBOL_NS_GPL(idle_inject_unregister, "IDLE_INJECT");
static struct smp_hotplug_thread idle_inject_threads = {
.store = &idle_inject_thread.tsk,
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 2feed036c1cd..77d75e1f14a9 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -5,26 +5,29 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/bitmap.h>
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/intel_rapl.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/slab.h>
#include <linux/log2.h>
-#include <linux/bitmap.h>
-#include <linux/delay.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
#include <linux/powercap.h>
-#include <linux/suspend.h>
-#include <linux/intel_rapl.h>
#include <linux/processor.h>
-#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
-#include <asm/iosf_mbi.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
+#include <asm/iosf_mbi.h>
/* bitmasks for RAPL MSRs, used by primitive access functions */
#define ENERGY_STATUS_MASK 0xffffffff
@@ -737,7 +740,7 @@ static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim)
{
struct rapl_primitive_info *rpi = rp->priv->rpi;
- if (prim < 0 || prim > NR_RAPL_PRIMITIVES || !rpi)
+ if (prim < 0 || prim >= NR_RAPL_PRIMITIVES || !rpi)
return NULL;
return &rpi[prim];
@@ -759,6 +762,11 @@ static int rapl_config(struct rapl_package *rp)
default:
return -EINVAL;
}
+
+ /* defaults_msr can be NULL on unsupported platforms */
+ if (!rp->priv->defaults || !rp->priv->rpi)
+ return -ENODEV;
+
return 0;
}
@@ -1214,66 +1222,72 @@ static const struct rapl_defaults rapl_defaults_amd = {
};
static const struct x86_cpu_id rapl_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &rapl_defaults_hsw_server),
-
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &rapl_defaults_hsw_server),
-
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &rapl_defaults_spr_server),
- X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &rapl_defaults_cht),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &rapl_defaults_tng),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &rapl_defaults_ann),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_HASWELL, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &rapl_defaults_hsw_server),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &rapl_defaults_hsw_server),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &rapl_defaults_ann),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &rapl_defaults_hsw_server),
X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
+ X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd),
X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
{}
};
@@ -1498,8 +1512,588 @@ static int rapl_detect_domains(struct rapl_package *rp)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Support for RAPL PMU
+ *
+ * Register a PMU if any of the registered RAPL Packages have the requirement
+ * of exposing its energy counters via Perf PMU.
+ *
+ * PMU Name:
+ * power
+ *
+ * Events:
+ * Name Event id RAPL Domain
+ * energy-cores 0x01 RAPL_DOMAIN_PP0
+ * energy-pkg 0x02 RAPL_DOMAIN_PACKAGE
+ * energy-ram 0x03 RAPL_DOMAIN_DRAM
+ * energy-gpu 0x04 RAPL_DOMAIN_PP1
+ * energy-psys 0x05 RAPL_DOMAIN_PLATFORM
+ *
+ * Unit:
+ * Joules
+ *
+ * Scale:
+ * 2.3283064365386962890625e-10
+ * The same RAPL domain in different RAPL Packages may have different
+ * energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as
+ * the fixed unit for all energy counters, and convert each hardware
+ * counter increase into N PMU event counter increases.
+ *
+ * This is fully compatible with the current MSR RAPL PMU. This means that
+ * userspace programs like turbostat can use the same code to handle RAPL Perf
+ * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running
+ * underlying on the platform.
+ *
+ * Note that RAPL Packages can be probed/removed dynamically, and the events
+ * supported by each TPMI RAPL device can be different. Thus the RAPL PMU
+ * support is done on demand, which means
+ * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for
+ * unsupported counters are not exposed.
+ * 2. PMU is unregistered and registered when a new RAPL Package is probed and
+ * supports new counters that are not supported by current PMU.
+ * 3. PMU is unregistered when all registered RAPL Packages don't need PMU.
+ */
+
+/* State of the RAPL perf PMU; a single static instance is defined right below. */
+struct rapl_pmu {
+ struct pmu pmu; /* Perf PMU structure */
+ u64 timer_ms; /* Maximum expiration time to avoid counter overflow */
+ unsigned long domain_map; /* Events supported by current registered PMU */
+ bool registered; /* Whether the PMU has been registered or not */
+};
+
+/* Zero-initialized: not registered, no domains, timer period not yet computed. */
+static struct rapl_pmu rapl_pmu;
+
+/* PMU helpers */
+
+/*
+ * Pick the CPU that represents @rp for PMU purposes.
+ *
+ * Returns an online CPU whose physical package id equals rp->id, or
+ * nr_cpu_ids when @rp has no PMU, is not a TPMI RAPL package, or has no
+ * matching online CPU.
+ */
+static int get_pmu_cpu(struct rapl_package *rp)
+{
+	int candidate;
+
+	/* No PMU on this package, or not TPMI (the only interface supported for now) */
+	if (!rp->has_pmu || rp->priv->type != RAPL_IF_TPMI)
+		return nr_cpu_ids;
+
+	/* TPMI RAPL uses any CPU in the package for PMU */
+	for_each_online_cpu(candidate) {
+		if (topology_physical_package_id(candidate) != rp->id)
+			continue;
+		return candidate;
+	}
+
+	return nr_cpu_ids;
+}
+
+/*
+ * Tell whether @cpu may carry PMU events of @rp: true only when @rp has a
+ * PMU, uses the TPMI interface and @cpu sits in the package with id rp->id.
+ */
+static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
+{
+	/* Only TPMI RAPL is supported for now */
+	bool pmu_capable = rp->has_pmu && rp->priv->type == RAPL_IF_TPMI;
+
+	/* TPMI RAPL uses any CPU in the package for PMU */
+	return pmu_capable && topology_physical_package_id(cpu) == rp->id;
+}
+
+/* Map a perf event to the PMU data of the RAPL Package stored in event->pmu_private. */
+static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
+{
+	struct rapl_package *owner = event->pmu_private;
+	struct rapl_package_pmu_data *pmu_data = &owner->pmu_data;
+
+	return pmu_data;
+}
+
+/* PMU event callbacks */
+
+/*
+ * Read the raw ENERGY_COUNTER of the domain an event is bound to.
+ *
+ * Returns 0 both for events whose domain is absent from the package
+ * (event->hw.idx < 0) and for failed reads.
+ */
+static u64 event_read_counter(struct perf_event *event)
+{
+	struct rapl_package *owner = event->pmu_private;
+	int domain_idx = event->hw.idx;
+	u64 raw;
+
+	/* Unsupported event: nothing to read */
+	if (domain_idx < 0)
+		return 0;
+
+	/* A failed read is reported as a zero counter */
+	if (rapl_read_data_raw(&owner->domains[domain_idx], ENERGY_COUNTER, false, &raw))
+		return 0;
+
+	return raw;
+}
+
+/*
+ * Activate a stopped event: clear its state, queue it on the package's
+ * active list, snapshot the hardware counter as the new baseline and arm the
+ * overflow timer when this is the first active event.
+ *
+ * Both callers in this file invoke it with the package's pmu_data lock held.
+ */
+static void __rapl_pmu_event_start(struct perf_event *event)
+{
+	struct rapl_package_pmu_data *pmu_data = event_to_pmu_data(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	/* Starting an event that is not stopped is a caller bug */
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	hwc->state = 0;
+	list_add_tail(&event->active_entry, &pmu_data->active_list);
+
+	/* Deltas are accumulated relative to the counter value seen now */
+	local64_set(&hwc->prev_count, event_read_counter(event));
+
+	pmu_data->n_active++;
+	if (pmu_data->n_active == 1)
+		hrtimer_start(&pmu_data->hrtimer, pmu_data->timer_interval,
+			      HRTIMER_MODE_REL_PINNED);
+}
+
+/* perf ->start callback: start @event under the package PMU lock; @mode is unused. */
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+	struct rapl_package_pmu_data *pmu_data = event_to_pmu_data(event);
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&pmu_data->lock, irqflags);
+	__rapl_pmu_event_start(event);
+	raw_spin_unlock_irqrestore(&pmu_data->lock, irqflags);
+}
+
+/*
+ * Fold the hardware counter progress since the last update into event->count.
+ *
+ * Reads the current raw counter, atomically publishes it as the new
+ * hwc->prev_count, scales the raw delta by the per-domain factor in
+ * data->scale[] and adds the result to the event count.
+ *
+ * Returns the raw counter value that was just read.
+ */
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+
+ /*
+ * Follow the generic code to drain hwc->prev_count.
+ * The loop is not expected to run more than once.
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ do {
+ new_raw_count = event_read_counter(event);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
+
+
+ /*
+ * Now we have the new raw value and have updated hwc->prev_count
+ * already. We can now calculate the raw counter delta and add
+ * that to the generic event.
+ */
+ delta = new_raw_count - prev_raw_count;
+
+ /*
+ * Scale delta to the smallest unit (2^-32 Joules).
+ * Users must then scale back: count * 2^-32 to get Joules,
+ * i.e. ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ sdelta = delta * data->scale[event->hw.flags];
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
+
+/*
+ * perf ->stop callback.
+ *
+ * Under the package PMU lock: if the event is still running, drop it from the
+ * active list, mark it PERF_HES_STOPPED and cancel the overflow timer when it
+ * was the last active event. If @mode has PERF_EF_UPDATE and the count is not
+ * already up to date, drain the remaining delta into the event count.
+ */
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+
+ /* Mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(data->n_active <= 0);
+ /* Last active event gone: the overflow timer is no longer needed */
+ if (--data->n_active == 0)
+ hrtimer_cancel(&data->hrtimer);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* Check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+/*
+ * perf ->add callback: put @event into the stopped/up-to-date state and,
+ * when @mode has PERF_EF_START, start it right away. Always returns 0.
+ */
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+	struct rapl_package_pmu_data *pmu_data = event_to_pmu_data(event);
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&pmu_data->lock, irqflags);
+
+	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (mode & PERF_EF_START)
+		__rapl_pmu_event_start(event);
+
+	raw_spin_unlock_irqrestore(&pmu_data->lock, irqflags);
+
+	return 0;
+}
+
+/*
+ * perf ->del callback: stop @event and always request a final count update
+ * (PERF_EF_UPDATE); the @flags argument is ignored.
+ */
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+/*
+ * RAPL PMU event ids, same as shown in sysfs.
+ * Ids start at 1; 0 is not a valid event and PERF_RAPL_MAX is the exclusive
+ * upper bound.
+ */
+enum perf_rapl_events {
+ PERF_RAPL_PP0 = 1, /* all cores */
+ PERF_RAPL_PKG, /* entire package */
+ PERF_RAPL_RAM, /* DRAM */
+ PERF_RAPL_PP1, /* gpu */
+ PERF_RAPL_PSYS, /* psys */
+ PERF_RAPL_MAX
+};
+/* The event id occupies the low 8 bits of perf_event_attr.config */
+#define RAPL_EVENT_MASK GENMASK(7, 0)
+
+/*
+ * Event id -> RAPL domain id. Slot 0 has no initializer; the only lookup in
+ * this file (rapl_pmu_event_init) rejects id 0 before indexing.
+ */
+static const int event_to_domain[PERF_RAPL_MAX] = {
+ [PERF_RAPL_PP0] = RAPL_DOMAIN_PP0,
+ [PERF_RAPL_PKG] = RAPL_DOMAIN_PACKAGE,
+ [PERF_RAPL_RAM] = RAPL_DOMAIN_DRAM,
+ [PERF_RAPL_PP1] = RAPL_DOMAIN_PP1,
+ [PERF_RAPL_PSYS] = RAPL_DOMAIN_PLATFORM,
+};
+
+/*
+ * perf ->event_init callback: validate a new event and bind it to a RAPL
+ * Package and domain.
+ *
+ * Returns -ENOENT when the event is not for this PMU, -EINVAL for an event
+ * id outside 1..PERF_RAPL_MAX-1 or a negative event->cpu, -ENODEV when no
+ * PMU-capable RAPL Package covers event->cpu, and 0 on success.
+ *
+ * On success event->pmu_private holds the package, event->hw.flags the RAPL
+ * domain id and event->hw.idx the index into rp->domains[], or -1 when the
+ * package does not have that domain (such events read as 0).
+ */
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+	struct rapl_package *pos, *rp = NULL;
+	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+	unsigned long evt;
+	int domain, idx;
+
+	/* Only look at RAPL events */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* Check for supported events only */
+	if (!cfg || cfg >= PERF_RAPL_MAX)
+		return -EINVAL;
+
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	/* Find out which Package the event belongs to */
+	list_for_each_entry(pos, &rapl_packages, plist) {
+		if (is_rp_pmu_cpu(pos, event->cpu)) {
+			rp = pos;
+			break;
+		}
+	}
+	if (!rp)
+		return -ENODEV;
+
+	/*
+	 * cfg comes from userspace through perf_event_attr. Clamp it so that
+	 * a mispredicted bounds check above cannot be used to index
+	 * event_to_domain[] out of range speculatively. The index is narrowed
+	 * to unsigned long first because array_index_nospec() refuses indexes
+	 * wider than long (u64 on 32-bit builds).
+	 */
+	evt = array_index_nospec((unsigned long)cfg, PERF_RAPL_MAX);
+
+	/* Find out which RAPL Domain the event belongs to */
+	domain = event_to_domain[evt];
+
+	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+	event->pmu_private = rp;	/* Which package */
+	event->hw.flags = domain;	/* Which domain */
+
+	event->hw.idx = -1;
+	/* Find out the index in rp->domains[] to get domain pointer */
+	for (idx = 0; idx < rp->nr_domains; idx++) {
+		if (rp->domains[idx].id == domain) {
+			event->hw.idx = idx;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/* perf ->read callback: refresh event->count from the hardware counter. */
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+/*
+ * Periodic timer callback: update every active event of the package so the
+ * hardware counter is sampled before it can wrap. Stops rearming once no
+ * event is active.
+ */
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+	struct rapl_package_pmu_data *pmu_data;
+	struct perf_event *active;
+	unsigned long irqflags;
+
+	pmu_data = container_of(hrtimer, struct rapl_package_pmu_data, hrtimer);
+
+	/* Nothing left to sample, let the timer die */
+	if (!pmu_data->n_active)
+		return HRTIMER_NORESTART;
+
+	raw_spin_lock_irqsave(&pmu_data->lock, irqflags);
+	list_for_each_entry(active, &pmu_data->active_list, active_entry) {
+		rapl_event_update(active);
+	}
+	raw_spin_unlock_irqrestore(&pmu_data->lock, irqflags);
+
+	hrtimer_forward_now(hrtimer, pmu_data->timer_interval);
+
+	return HRTIMER_RESTART;
+}
+
+/* PMU sysfs attributes */
+
+/*
+ * There are no default events, but the "events" group must exist (with
+ * empty attrs) before it is updated with the detected events; the per-domain
+ * groups listed in pmu_attr_update[] use the same "events" name.
+ */
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+ .name = "events",
+ .attrs = attrs_empty,
+};
+
+/*
+ * sysfs "cpumask" show handler: report one CPU per PMU-capable RAPL Package,
+ * as chosen by get_pmu_cpu(), formatted as a CPU list.
+ *
+ * Returns the number of bytes written to @buf, or -ENOMEM when the temporary
+ * cpumask cannot be allocated.
+ */
+static ssize_t cpumask_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct rapl_package *pkg;
+	cpumask_var_t pmu_cpus;
+	int chosen;
+	int len;
+
+	if (!alloc_cpumask_var(&pmu_cpus, GFP_KERNEL))
+		return -ENOMEM;
+
+	/* Keep the online mask stable while picking CPUs */
+	cpus_read_lock();
+
+	cpumask_clear(pmu_cpus);
+
+	/* Choose a cpu for each RAPL Package */
+	list_for_each_entry(pkg, &rapl_packages, plist) {
+		chosen = get_pmu_cpu(pkg);
+		if (chosen < nr_cpu_ids)
+			cpumask_set_cpu(chosen, pmu_cpus);
+	}
+
+	cpus_read_unlock();
+
+	len = cpumap_print_to_pagebuf(true, buf, pmu_cpus);
+	free_cpumask_var(pmu_cpus);
+
+	return len;
+}
+
+/* Read-only "cpumask" attribute backed by cpumask_show() */
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+/* Unnamed group: the cpumask attribute is not placed in a named subgroup */
+static struct attribute_group pmu_cpumask_group = {
+ .attrs = pmu_cpumask_attrs,
+};
+
+/* The event id lives in config bits 0-7, matching RAPL_EVENT_MASK */
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *pmu_format_attr[] = {
+ &format_attr_event.attr,
+ NULL
+};
+
+static struct attribute_group pmu_format_group = {
+ .name = "format",
+ .attrs = pmu_format_attr,
+};
+
+/* Static attribute groups handed to the PMU via pmu.attr_groups */
+static const struct attribute_group *pmu_attr_groups[] = {
+ &pmu_events_group,
+ &pmu_cpumask_group,
+ &pmu_format_group,
+ NULL
+};
+
+/*
+ * Define a read-only (0444) perf event sysfs attribute named @_name whose
+ * content is the string @str; the object is called event_attr_<v>.
+ */
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .event_str = str, \
+}
+
+/* Event encodings; the ids match enum perf_rapl_events */
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04");
+RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
+
+/* Unit reported for every event */
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules");
+RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules");
+
+/* Fixed scale of 2^-32 Joules per count, identical for all events */
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10");
+
+/*
+ * Define, for one event, an "events" attribute group holding its event,
+ * .unit and .scale attributes. The group's attributes are visible only while
+ * the bit for @domain is set in rapl_pmu.domain_map.
+ */
+#define RAPL_EVENT_GROUP(_name, domain) \
+static struct attribute *pmu_attr_##_name[] = { \
+ &event_attr_rapl_##_name.attr.attr, \
+ &event_attr_rapl_unit_##_name.attr.attr, \
+ &event_attr_rapl_scale_##_name.attr.attr, \
+ NULL \
+}; \
+static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \
+{ \
+ return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \
+} \
+static struct attribute_group pmu_group_##_name = { \
+ .name = "events", \
+ .attrs = pmu_attr_##_name, \
+ .is_visible = is_visible_##_name, \
+}
+
+RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0);
+RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE);
+RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM);
+RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1);
+RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM);
+
+/* Per-event groups handed to the PMU via pmu.attr_update */
+static const struct attribute_group *pmu_attr_update[] = {
+ &pmu_group_cores,
+ &pmu_group_pkg,
+ &pmu_group_ram,
+ &pmu_group_gpu,
+ &pmu_group_psys,
+ NULL
+};
+
+/*
+ * Make sure the registered "power" PMU covers all domains of @rp.
+ *
+ * If the PMU is already registered and covers rp->domain_map, nothing is
+ * re-registered. Otherwise any existing PMU is unregistered, rp->domain_map is
+ * merged into rapl_pmu.domain_map and the PMU is registered again.
+ *
+ * Returns 0 and sets rp->has_pmu on success. On registration failure the
+ * error from perf_pmu_register() is returned, rapl_pmu.registered stays false
+ * and rapl_pmu.domain_map keeps the merged bits.
+ */
+static int rapl_pmu_update(struct rapl_package *rp)
+{
+ int ret = 0;
+
+ /* Return if PMU already covers all events supported by current RAPL Package */
+ if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map)))
+ goto end;
+
+ /* Unregister previous registered PMU */
+ if (rapl_pmu.registered)
+ perf_pmu_unregister(&rapl_pmu.pmu);
+
+ rapl_pmu.registered = false;
+ rapl_pmu.domain_map |= rp->domain_map;
+
+ /* Start from a clean struct pmu before filling in the callbacks */
+ memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
+ rapl_pmu.pmu.attr_groups = pmu_attr_groups;
+ rapl_pmu.pmu.attr_update = pmu_attr_update;
+ rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
+ rapl_pmu.pmu.event_init = rapl_pmu_event_init;
+ rapl_pmu.pmu.add = rapl_pmu_event_add;
+ rapl_pmu.pmu.del = rapl_pmu_event_del;
+ rapl_pmu.pmu.start = rapl_pmu_event_start;
+ rapl_pmu.pmu.stop = rapl_pmu_event_stop;
+ rapl_pmu.pmu.read = rapl_pmu_event_read;
+ rapl_pmu.pmu.module = THIS_MODULE;
+ rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT;
+ ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
+ if (ret) {
+ pr_info("Failed to register PMU\n");
+ return ret;
+ }
+
+ rapl_pmu.registered = true;
+end:
+ rp->has_pmu = true;
+ return ret;
+}
+
+/*
+ * rapl_package_add_pmu - expose the energy counters of @rp through the PMU
+ *
+ * Computes the per-domain scale factors, computes the overflow timer period
+ * once (while rapl_pmu.timer_ms is still 0), initializes the per-package PMU
+ * data and (re-)registers the PMU through rapl_pmu_update(). The CPU hotplug
+ * read lock is held from the guard() statement until return.
+ *
+ * Returns -EEXIST when @rp already has a PMU, otherwise the result of
+ * rapl_pmu_update().
+ */
+int rapl_package_add_pmu(struct rapl_package *rp)
+{
+ struct rapl_package_pmu_data *data = &rp->pmu_data;
+ int idx;
+
+ if (rp->has_pmu)
+ return -EEXIST;
+
+ guard(cpus_read_lock)();
+
+ for (idx = 0; idx < rp->nr_domains; idx++) {
+ struct rapl_domain *rd = &rp->domains[idx];
+ int domain = rd->id;
+ u64 val;
+
+ /* Skip domains this package does not advertise */
+ if (!test_bit(domain, &rp->domain_map))
+ continue;
+
+ /*
+ * The RAPL PMU granularity is 2^-32 Joules
+ * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
+ */
+ val = rd->energy_unit * (1ULL << 32);
+ do_div(val, ENERGY_UNIT_SCALE * 1000000);
+ data->scale[domain] = val;
+
+ if (!rapl_pmu.timer_ms) {
+ /*
+ * NOTE(review): get_rpi() returns NULL for an out-of-range
+ * primitive or a missing table, and rpi is dereferenced
+ * unchecked below; confirm this cannot happen for
+ * ENERGY_COUNTER here.
+ */
+ struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER);
+
+ /*
+ * Calculate the timer rate:
+ * Use reference of 200W for scaling the timeout to avoid counter
+ * overflows.
+ *
+ * max_count = (rpi->mask >> rpi->shift) + 1
+ * max_energy_pj = max_count * rd->energy_unit
+ * max_time_sec = (max_energy_pj / 1000000000) / 200w
+ *
+ * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
+ */
+ val = (rpi->mask >> rpi->shift) + 1;
+ val *= rd->energy_unit;
+ do_div(val, 1000000 * 200 * 2);
+ rapl_pmu.timer_ms = val;
+
+ pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms);
+ }
+
+ pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]);
+ }
+
+ /* Initialize per package PMU data */
+ raw_spin_lock_init(&data->lock);
+ INIT_LIST_HEAD(&data->active_list);
+ data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms);
+ hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ data->hrtimer.function = rapl_hrtimer_handle;
+
+ return rapl_pmu_update(rp);
+}
+EXPORT_SYMBOL_GPL(rapl_package_add_pmu);
+
+/*
+ * rapl_package_remove_pmu - stop exposing @rp through the RAPL PMU
+ *
+ * Clears rp->has_pmu so the package is no longer matched by new events, no
+ * longer shows up in the PMU cpumask and can be added again later. The PMU
+ * itself is unregistered, and the global PMU state reset, only when no
+ * package on the rapl_packages list still needs it.
+ *
+ * Does nothing when @rp has no PMU. The CPU hotplug read lock is held from
+ * the guard() statement until return.
+ */
+void rapl_package_remove_pmu(struct rapl_package *rp)
+{
+	struct rapl_package *pos;
+
+	if (!rp->has_pmu)
+		return;
+
+	guard(cpus_read_lock)();
+
+	/*
+	 * Drop this package's claim first. Leaving the flag set would make a
+	 * later rapl_package_add_pmu() fail with -EEXIST and would keep the
+	 * PMU registered forever once the other packages are removed.
+	 */
+	rp->has_pmu = false;
+
+	list_for_each_entry(pos, &rapl_packages, plist) {
+		/* PMU is still needed */
+		if (pos->has_pmu)
+			return;
+	}
+
+	perf_pmu_unregister(&rapl_pmu.pmu);
+	memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
+}
+EXPORT_SYMBOL_GPL(rapl_package_remove_pmu);
+#endif
+
/* called from CPU hotplug notifier, hotplug lock held */
-void rapl_remove_package(struct rapl_package *rp)
+void rapl_remove_package_cpuslocked(struct rapl_package *rp)
{
struct rapl_domain *rd, *rd_package = NULL;
@@ -1528,16 +2122,45 @@ void rapl_remove_package(struct rapl_package *rp)
list_del(&rp->plist);
kfree(rp);
}
+EXPORT_SYMBOL_GPL(rapl_remove_package_cpuslocked);
+
+void rapl_remove_package(struct rapl_package *rp)
+{
+ guard(cpus_read_lock)();
+ rapl_remove_package_cpuslocked(rp);
+}
EXPORT_SYMBOL_GPL(rapl_remove_package);
+/*
+ * RAPL Package energy counter scope:
+ * 1. AMD/HYGON platforms use per-PKG package energy counter
+ * 2. For Intel platforms
+ * 2.1 CLX-AP platform has per-DIE package energy counter
+ * 2.2 Other platforms that use MSR RAPL are single die systems so the
+ * package energy counter can be considered as per-PKG/per-DIE,
+ * here it is considered as per-DIE.
+ * 2.3 New platforms that use TPMI RAPL don't care about the
+ * scope because they are not MSR/CPU based.
+ */
+#define rapl_msrs_are_pkg_scope() \
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
/* caller to ensure CPU hotplug lock is held */
-struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
+ bool id_is_cpu)
{
struct rapl_package *rp;
int uid;
- if (id_is_cpu)
- uid = topology_logical_die_id(id);
+ if (id_is_cpu) {
+ uid = rapl_msrs_are_pkg_scope() ?
+ topology_physical_package_id(id) : topology_logical_die_id(id);
+ if (uid < 0) {
+ pr_err("topology_logical_(package/die)_id() returned a negative value");
+ return NULL;
+ }
+ }
else
uid = id;
@@ -1549,10 +2172,17 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv,
return NULL;
}
+EXPORT_SYMBOL_GPL(rapl_find_package_domain_cpuslocked);
+
+struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+{
+ guard(cpus_read_lock)();
+ return rapl_find_package_domain_cpuslocked(id, priv, id_is_cpu);
+}
EXPORT_SYMBOL_GPL(rapl_find_package_domain);
/* called from CPU hotplug notifier, hotplug lock held */
-struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
struct rapl_package *rp;
int ret;
@@ -1562,9 +2192,14 @@ struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id
return ERR_PTR(-ENOMEM);
if (id_is_cpu) {
- rp->id = topology_logical_die_id(id);
+ rp->id = rapl_msrs_are_pkg_scope() ?
+ topology_physical_package_id(id) : topology_logical_die_id(id);
+ if ((int)(rp->id) < 0) {
+ pr_err("topology_logical_(package/die)_id() returned a negative value");
+ return ERR_PTR(-EINVAL);
+ }
rp->lead_cpu = id;
- if (topology_max_die_per_package() > 1)
+ if (!rapl_msrs_are_pkg_scope() && topology_max_dies_per_package() > 1)
snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d-die-%d",
topology_physical_package_id(id), topology_die_id(id));
else
@@ -1598,6 +2233,13 @@ err_free_package:
kfree(rp);
return ERR_PTR(ret);
}
+EXPORT_SYMBOL_GPL(rapl_add_package_cpuslocked);
+
+struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+{
+ guard(cpus_read_lock)();
+ return rapl_add_package_cpuslocked(id, priv, id_is_cpu);
+}
EXPORT_SYMBOL_GPL(rapl_add_package);
static void power_limit_state_save(void)
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index 250bd41a588c..2b81aabdb0db 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -73,9 +73,9 @@ static int rapl_cpu_online(unsigned int cpu)
{
struct rapl_package *rp;
- rp = rapl_find_package_domain(cpu, rapl_msr_priv, true);
+ rp = rapl_find_package_domain_cpuslocked(cpu, rapl_msr_priv, true);
if (!rp) {
- rp = rapl_add_package(cpu, rapl_msr_priv, true);
+ rp = rapl_add_package_cpuslocked(cpu, rapl_msr_priv, true);
if (IS_ERR(rp))
return PTR_ERR(rp);
}
@@ -88,14 +88,14 @@ static int rapl_cpu_down_prep(unsigned int cpu)
struct rapl_package *rp;
int lead_cpu;
- rp = rapl_find_package_domain(cpu, rapl_msr_priv, true);
+ rp = rapl_find_package_domain_cpuslocked(cpu, rapl_msr_priv, true);
if (!rp)
return 0;
cpumask_clear_cpu(cpu, &rp->cpumask);
lead_cpu = cpumask_first(&rp->cpumask);
if (lead_cpu >= nr_cpu_ids)
- rapl_remove_package(rp);
+ rapl_remove_package_cpuslocked(rp);
else if (rp->lead_cpu == cpu)
rp->lead_cpu = lead_cpu;
return 0;
@@ -139,14 +139,16 @@ static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
/* List of verified CPUs. */
static const struct x86_cpu_id pl4_support_ids[] = {
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, NULL),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, NULL),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, NULL),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, NULL),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, NULL),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, NULL),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, NULL),
+ X86_MATCH_VFM(INTEL_METEORLAKE, NULL),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, NULL),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL),
{}
};
@@ -197,11 +199,10 @@ out:
return ret;
}
-static int rapl_msr_remove(struct platform_device *pdev)
+static void rapl_msr_remove(struct platform_device *pdev)
{
cpuhp_remove_state(rapl_msr_priv->pcap_rapl_online);
powercap_unregister_control_type(rapl_msr_priv->control_type);
- return 0;
}
static const struct platform_device_id rapl_msr_ids[] = {
diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c
index 891c90fefd8b..af2368f4db10 100644
--- a/drivers/powercap/intel_rapl_tpmi.c
+++ b/drivers/powercap/intel_rapl_tpmi.c
@@ -15,7 +15,8 @@
#include <linux/module.h>
#include <linux/slab.h>
-#define TPMI_RAPL_VERSION 1
+#define TPMI_RAPL_MAJOR_VERSION 0
+#define TPMI_RAPL_MINOR_VERSION 1
/* 1 header + 10 registers + 5 reserved. 8 bytes for each. */
#define TPMI_RAPL_DOMAIN_SIZE 128
@@ -40,6 +41,7 @@ enum tpmi_rapl_register {
TPMI_RAPL_REG_ENERGY_STATUS,
TPMI_RAPL_REG_PERF_STATUS,
TPMI_RAPL_REG_POWER_INFO,
+ TPMI_RAPL_REG_DOMAIN_INFO,
TPMI_RAPL_REG_INTERRUPT,
TPMI_RAPL_REG_MAX = 15,
};
@@ -130,6 +132,12 @@ static void trp_release(struct tpmi_rapl_package *trp)
mutex_unlock(&tpmi_rapl_lock);
}
+/*
+ * Bit 0 of TPMI_RAPL_REG_DOMAIN_INFO indicates if the current package is a domain
+ * root or not. Only domain root packages can enumerate System (Psys) Domain.
+ */
+#define TPMI_RAPL_DOMAIN_ROOT BIT(0)
+
static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset)
{
u8 tpmi_domain_version;
@@ -139,6 +147,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset)
enum rapl_domain_reg_id reg_id;
int tpmi_domain_size, tpmi_domain_flags;
u64 tpmi_domain_header = readq(trp->base + offset);
+ u64 tpmi_domain_info;
/* Domain Parent bits are ignored for now */
tpmi_domain_version = tpmi_domain_header & 0xff;
@@ -146,11 +155,21 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset)
tpmi_domain_size = tpmi_domain_header >> 16 & 0xff;
tpmi_domain_flags = tpmi_domain_header >> 32 & 0xffff;
- if (tpmi_domain_version != TPMI_RAPL_VERSION) {
- pr_warn(FW_BUG "Unsupported version:%d\n", tpmi_domain_version);
+ if (tpmi_domain_version == TPMI_VERSION_INVALID) {
+ pr_warn(FW_BUG "Invalid version\n");
return -ENODEV;
}
+ if (TPMI_MAJOR_VERSION(tpmi_domain_version) != TPMI_RAPL_MAJOR_VERSION) {
+ pr_warn(FW_BUG "Unsupported major version:%ld\n",
+ TPMI_MAJOR_VERSION(tpmi_domain_version));
+ return -ENODEV;
+ }
+
+ if (TPMI_MINOR_VERSION(tpmi_domain_version) > TPMI_RAPL_MINOR_VERSION)
+ pr_info("Ignore: Unsupported minor version:%ld\n",
+ TPMI_MINOR_VERSION(tpmi_domain_version));
+
/* Domain size: in unit of 128 Bytes */
if (tpmi_domain_size != 1) {
pr_warn(FW_BUG "Invalid Domain size %d\n", tpmi_domain_size);
@@ -169,6 +188,13 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset)
domain_type = RAPL_DOMAIN_PACKAGE;
break;
case TPMI_RAPL_DOMAIN_SYSTEM:
+ if (!(tpmi_domain_flags & BIT(TPMI_RAPL_REG_DOMAIN_INFO))) {
+ pr_warn(FW_BUG "System domain must support Domain Info register\n");
+ return -ENODEV;
+ }
+ tpmi_domain_info = readq(trp->base + offset + TPMI_RAPL_REG_DOMAIN_INFO * 8);
+ if (!(tpmi_domain_info & TPMI_RAPL_DOMAIN_ROOT))
+ return 0;
domain_type = RAPL_DOMAIN_PLATFORM;
break;
case TPMI_RAPL_DOMAIN_MEMORY:
@@ -287,6 +313,8 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev,
goto err;
}
+ rapl_package_add_pmu(trp->rp);
+
auxiliary_set_drvdata(auxdev, trp);
return 0;
@@ -299,6 +327,7 @@ static void intel_rapl_tpmi_remove(struct auxiliary_device *auxdev)
{
struct tpmi_rapl_package *trp = auxiliary_get_drvdata(auxdev);
+ rapl_package_remove_pmu(trp->rp);
rapl_remove_package(trp->rp);
trp_release(trp);
}
@@ -318,7 +347,7 @@ static struct auxiliary_driver intel_rapl_tpmi_driver = {
module_auxiliary_driver(intel_rapl_tpmi_driver)
-MODULE_IMPORT_NS(INTEL_TPMI);
+MODULE_IMPORT_NS("INTEL_TPMI");
MODULE_DESCRIPTION("Intel RAPL TPMI Driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c
index 52c32dcbf7d8..4112a0097338 100644
--- a/drivers/powercap/powercap_sys.c
+++ b/drivers/powercap/powercap_sys.c
@@ -627,8 +627,7 @@ struct powercap_control_type *powercap_register_control_type(
dev_set_name(&control_type->dev, "%s", name);
result = device_register(&control_type->dev);
if (result) {
- if (control_type->allocated)
- kfree(control_type);
+ put_device(&control_type->dev);
return ERR_PTR(result);
}
idr_init(&control_type->idr);