diff options
Diffstat (limited to 'drivers/powercap')
-rw-r--r-- | drivers/powercap/Kconfig | 2 | ||||
-rw-r--r-- | drivers/powercap/dtpm.c | 2 | ||||
-rw-r--r-- | drivers/powercap/dtpm_cpu.c | 51 | ||||
-rw-r--r-- | drivers/powercap/dtpm_devfreq.c | 36 | ||||
-rw-r--r-- | drivers/powercap/idle_inject.c | 21 | ||||
-rw-r--r-- | drivers/powercap/intel_rapl_common.c | 796 | ||||
-rw-r--r-- | drivers/powercap/intel_rapl_msr.c | 36 | ||||
-rw-r--r-- | drivers/powercap/intel_rapl_tpmi.c | 37 | ||||
-rw-r--r-- | drivers/powercap/powercap_sys.c | 3 |
9 files changed, 841 insertions, 143 deletions
diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index 69ef8d081c98..03c4c796d993 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -82,7 +82,7 @@ config DTPM config DTPM_CPU bool "Add CPU power capping based on the energy model" - depends on DTPM && ENERGY_MODEL + depends on DTPM && ENERGY_MODEL && SMP help This enables support for CPU power limitation based on energy model. diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index ce920f17f45f..f390665743c4 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -522,7 +522,7 @@ static int dtpm_for_each_child(const struct dtpm_node *hierarchy, /** * dtpm_create_hierarchy - Create the dtpm hierarchy - * @hierarchy: An array of struct dtpm_node describing the hierarchy + * @dtpm_match_table: Pointer to the array of device ID structures * * The function is called by the platform specific code with the * description of the different node in the hierarchy. It creates the diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 9193c3b8edeb..6b6f51b21550 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -42,28 +42,29 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) { struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *pd = em_cpu_get(dtpm_cpu->cpu); - struct cpumask cpus; + struct em_perf_state *table; unsigned long freq; u64 power; int i, nr_cpus; - cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); - nr_cpus = cpumask_weight(&cpus); + nr_cpus = cpumask_weight_and(cpu_online_mask, to_cpumask(pd->cpus)); + rcu_read_lock(); + table = em_perf_state_from_pd(pd); for (i = 0; i < pd->nr_perf_states; i++) { - power = pd->table[i].power * nr_cpus; + power = table[i].power * nr_cpus; if (power > power_limit) break; } - freq = pd->table[i - 1].frequency; + freq = table[i - 1].frequency; + power_limit = table[i - 1].power * nr_cpus; + rcu_read_unlock(); freq_qos_update_request(&dtpm_cpu->qos_req, freq); - power_limit = pd->table[i - 1].power * nr_cpus; - return power_limit; } @@ -87,9 +88,11 @@ static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power) static u64 get_pd_power_uw(struct dtpm *dtpm) { struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); + struct em_perf_state *table; struct em_perf_domain *pd; struct cpumask *pd_mask; unsigned long freq; + u64 power = 0; int i; pd = em_cpu_get(dtpm_cpu->cpu); @@ -98,33 +101,41 @@ static u64 get_pd_power_uw(struct dtpm *dtpm) freq = cpufreq_quick_get(dtpm_cpu->cpu); + rcu_read_lock(); + table = em_perf_state_from_pd(pd); for (i = 0; i < pd->nr_perf_states; i++) { - if (pd->table[i].frequency < freq) + if (table[i].frequency < freq) continue; - return scale_pd_power_uw(pd_mask, pd->table[i].power); + power = scale_pd_power_uw(pd_mask, table[i].power); + break; } + rcu_read_unlock(); - return 0; + return power; } static int update_pd_power_uw(struct dtpm *dtpm) { struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *em = em_cpu_get(dtpm_cpu->cpu); - struct cpumask cpus; + struct em_perf_state *table; int nr_cpus; - cpumask_and(&cpus, cpu_online_mask, to_cpumask(em->cpus)); - nr_cpus = cpumask_weight(&cpus); + nr_cpus = cpumask_weight_and(cpu_online_mask, to_cpumask(em->cpus)); - dtpm->power_min = em->table[0].power; + rcu_read_lock(); + table = em_perf_state_from_pd(em); + + dtpm->power_min = table[0].power; dtpm->power_min *= nr_cpus; - dtpm->power_max = em->table[em->nr_perf_states - 1].power; + dtpm->power_max = table[em->nr_perf_states - 1].power; dtpm->power_max *= nr_cpus; + rcu_read_unlock(); + return 0; } @@ -143,7 +154,7 @@ static void pd_release(struct dtpm *dtpm) cpufreq_cpu_put(policy); } - + kfree(dtpm_cpu); } @@ -180,6 +191,7 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent) { struct dtpm_cpu *dtpm_cpu; struct cpufreq_policy *policy; + struct em_perf_state *table; struct em_perf_domain *pd; char name[CPUFREQ_NAME_LEN]; int ret = -ENOMEM; @@ -216,10 +228,13 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent) if (ret) goto out_kfree_dtpm_cpu; + rcu_read_lock(); + table = em_perf_state_from_pd(pd); ret = freq_qos_add_request(&policy->constraints, &dtpm_cpu->qos_req, FREQ_QOS_MAX, - pd->table[pd->nr_perf_states - 1].frequency); - if (ret) + table[pd->nr_perf_states - 1].frequency); + rcu_read_unlock(); + if (ret < 0) goto out_dtpm_unregister; cpufreq_cpu_put(policy); diff --git a/drivers/powercap/dtpm_devfreq.c b/drivers/powercap/dtpm_devfreq.c index 612c3b59dd5b..d1dff6ccab12 100644 --- a/drivers/powercap/dtpm_devfreq.c +++ b/drivers/powercap/dtpm_devfreq.c @@ -37,11 +37,16 @@ static int update_pd_power_uw(struct dtpm *dtpm) struct devfreq *devfreq = dtpm_devfreq->devfreq; struct device *dev = devfreq->dev.parent; struct em_perf_domain *pd = em_pd_get(dev); + struct em_perf_state *table; - dtpm->power_min = pd->table[0].power; + rcu_read_lock(); + table = em_perf_state_from_pd(pd); - dtpm->power_max = pd->table[pd->nr_perf_states - 1].power; + dtpm->power_min = table[0].power; + dtpm->power_max = table[pd->nr_perf_states - 1].power; + + rcu_read_unlock(); return 0; } @@ -51,20 +56,23 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) struct devfreq *devfreq = dtpm_devfreq->devfreq; struct device *dev = devfreq->dev.parent; struct em_perf_domain *pd = em_pd_get(dev); + struct em_perf_state *table; unsigned long freq; int i; + rcu_read_lock(); + table = em_perf_state_from_pd(pd); for (i = 0; i < pd->nr_perf_states; i++) { - if (pd->table[i].power > power_limit) + if (table[i].power > power_limit) break; } - freq = pd->table[i - 1].frequency; + freq = table[i - 1].frequency; + power_limit = table[i - 1].power; + rcu_read_unlock(); dev_pm_qos_update_request(&dtpm_devfreq->qos_req, freq); - power_limit = pd->table[i - 1].power; - return power_limit; } @@ -89,8 +97,9 @@ static u64 get_pd_power_uw(struct dtpm *dtpm) struct device *dev = devfreq->dev.parent; struct em_perf_domain *pd = em_pd_get(dev); struct devfreq_dev_status status; + struct em_perf_state *table; unsigned long freq; - u64 power; + u64 power = 0; int i; mutex_lock(&devfreq->lock); @@ -100,19 +109,22 @@ static u64 get_pd_power_uw(struct dtpm *dtpm) freq = DIV_ROUND_UP(status.current_frequency, HZ_PER_KHZ); _normalize_load(&status); + rcu_read_lock(); + table = em_perf_state_from_pd(pd); for (i = 0; i < pd->nr_perf_states; i++) { - if (pd->table[i].frequency < freq) + if (table[i].frequency < freq) continue; - power = pd->table[i].power; + power = table[i].power; power *= status.busy_time; power >>= 10; - return power; + break; } + rcu_read_unlock(); - return 0; + return power; } static void pd_release(struct dtpm *dtpm) @@ -166,7 +178,7 @@ static int __dtpm_devfreq_setup(struct devfreq *devfreq, struct dtpm *parent) ret = dev_pm_qos_add_request(dev, &dtpm_devfreq->qos_req, DEV_PM_QOS_MAX_FREQUENCY, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); - if (ret) { + if (ret < 0) { pr_err("Failed to add QoS request: %d\n", ret); goto out_dtpm_unregister; } diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index e18a2cc4e46a..5ad7cc438068 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -127,7 +127,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) struct idle_inject_device *ii_dev = container_of(timer, struct idle_inject_device, timer); - if (!ii_dev->update || (ii_dev->update && ii_dev->update())) + if (!ii_dev->update || ii_dev->update()) idle_inject_wakeup(ii_dev); duration_us = READ_ONCE(ii_dev->run_duration_us); @@ -179,7 +179,7 @@ void idle_inject_set_duration(struct idle_inject_device *ii_dev, if (!run_duration_us) pr_debug("CPU is forced to 100 percent idle\n"); } -EXPORT_SYMBOL_NS_GPL(idle_inject_set_duration, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_set_duration, "IDLE_INJECT"); /** * idle_inject_get_duration - idle and run duration retrieval helper @@ -194,7 +194,7 @@ void idle_inject_get_duration(struct idle_inject_device *ii_dev, *run_duration_us = READ_ONCE(ii_dev->run_duration_us); *idle_duration_us = READ_ONCE(ii_dev->idle_duration_us); } -EXPORT_SYMBOL_NS_GPL(idle_inject_get_duration, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_get_duration, "IDLE_INJECT"); /** * idle_inject_set_latency - set the maximum latency allowed @@ -206,7 +206,7 @@ void idle_inject_set_latency(struct idle_inject_device *ii_dev, { WRITE_ONCE(ii_dev->latency_us, latency_us); } -EXPORT_SYMBOL_NS_GPL(idle_inject_set_latency, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_set_latency, "IDLE_INJECT"); /** * idle_inject_start - start idle injections @@ -238,7 +238,7 @@ int idle_inject_start(struct idle_inject_device *ii_dev) return 0; } -EXPORT_SYMBOL_NS_GPL(idle_inject_start, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_start, "IDLE_INJECT"); /** * idle_inject_stop - stops idle injections @@ -285,7 +285,7 @@ void idle_inject_stop(struct idle_inject_device *ii_dev) cpu_hotplug_enable(); } -EXPORT_SYMBOL_NS_GPL(idle_inject_stop, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_stop, "IDLE_INJECT"); /** * idle_inject_setup - prepare the current task for idle injection @@ -339,8 +339,7 @@ struct idle_inject_device *idle_inject_register_full(struct cpumask *cpumask, return NULL; cpumask_copy(to_cpumask(ii_dev->cpumask), cpumask); - hrtimer_init(&ii_dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ii_dev->timer.function = idle_inject_timer_fn; + hrtimer_setup(&ii_dev->timer, idle_inject_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ii_dev->latency_us = UINT_MAX; ii_dev->update = update; @@ -367,7 +366,7 @@ out_rollback: return NULL; } -EXPORT_SYMBOL_NS_GPL(idle_inject_register_full, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_register_full, "IDLE_INJECT"); /** * idle_inject_register - initialize idle injection on a set of CPUs @@ -384,7 +383,7 @@ struct idle_inject_device *idle_inject_register(struct cpumask *cpumask) { return idle_inject_register_full(cpumask, NULL); } -EXPORT_SYMBOL_NS_GPL(idle_inject_register, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_register, "IDLE_INJECT"); /** * idle_inject_unregister - unregister idle injection control device @@ -405,7 +404,7 @@ void idle_inject_unregister(struct idle_inject_device *ii_dev) kfree(ii_dev); } -EXPORT_SYMBOL_NS_GPL(idle_inject_unregister, IDLE_INJECT); +EXPORT_SYMBOL_NS_GPL(idle_inject_unregister, "IDLE_INJECT"); static struct smp_hotplug_thread idle_inject_threads = { .store = &idle_inject_thread.tsk, diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 2feed036c1cd..e3be40adc0d7 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -5,26 +5,30 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/bitmap.h> +#include <linux/cleanup.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/intel_rapl.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/list.h> -#include <linux/types.h> -#include <linux/device.h> -#include <linux/slab.h> #include <linux/log2.h> -#include <linux/bitmap.h> -#include <linux/delay.h> -#include <linux/sysfs.h> -#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/nospec.h> +#include <linux/perf_event.h> +#include <linux/platform_device.h> #include <linux/powercap.h> -#include <linux/suspend.h> -#include <linux/intel_rapl.h> #include <linux/processor.h> -#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include <linux/sysfs.h> +#include <linux/types.h> -#include <asm/iosf_mbi.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/iosf_mbi.h> +#include <asm/msr.h> /* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK 0xffffffff @@ -737,7 +741,7 @@ static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim) { struct rapl_primitive_info *rpi = rp->priv->rpi; - if (prim < 0 || prim > NR_RAPL_PRIMITIVES || !rpi) + if (prim < 0 || prim >= NR_RAPL_PRIMITIVES || !rpi) return NULL; return &rpi[prim]; @@ -759,6 +763,11 @@ static int rapl_config(struct rapl_package *rp) default: return -EINVAL; } + + /* defaults_msr can be NULL on unsupported platforms */ + if (!rp->priv->defaults || !rp->priv->rpi) + return -ENODEV; + return 0; } @@ -1214,66 +1223,72 @@ static const struct rapl_defaults rapl_defaults_amd = { }; static const struct x86_cpu_id rapl_ids[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &rapl_defaults_core), - - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &rapl_defaults_core), - - X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &rapl_defaults_hsw_server), - - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &rapl_defaults_hsw_server), - - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &rapl_defaults_hsw_server), - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &rapl_defaults_hsw_server), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &rapl_defaults_hsw_server), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), - X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &rapl_defaults_spr_server), - X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core), - - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &rapl_defaults_cht), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &rapl_defaults_tng), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &rapl_defaults_ann), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &rapl_defaults_core), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &rapl_defaults_core), - - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &rapl_defaults_hsw_server), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_IVYBRIDGE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_HASWELL, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_G, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_X, &rapl_defaults_hsw_server), + + X86_MATCH_VFM(INTEL_BROADWELL, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_G, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_X, &rapl_defaults_hsw_server), + + X86_MATCH_VFM(INTEL_SKYLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SKYLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), + X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2,&rapl_defaults_ann), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &rapl_defaults_hsw_server), X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd), X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd), + X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd), X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd), {} }; @@ -1498,8 +1513,587 @@ static int rapl_detect_domains(struct rapl_package *rp) return 0; } +#ifdef CONFIG_PERF_EVENTS + +/* + * Support for RAPL PMU + * + * Register a PMU if any of the registered RAPL Packages have the requirement + * of exposing its energy counters via Perf PMU. + * + * PMU Name: + * power + * + * Events: + * Name Event id RAPL Domain + * energy_cores 0x01 RAPL_DOMAIN_PP0 + * energy_pkg 0x02 RAPL_DOMAIN_PACKAGE + * energy_ram 0x03 RAPL_DOMAIN_DRAM + * energy_gpu 0x04 RAPL_DOMAIN_PP1 + * energy_psys 0x05 RAPL_DOMAIN_PLATFORM + * + * Unit: + * Joules + * + * Scale: + * 2.3283064365386962890625e-10 + * The same RAPL domain in different RAPL Packages may have different + * energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as + * the fixed unit for all energy counters, and covert each hardware + * counter increase to N times of PMU event counter increases. + * + * This is fully compatible with the current MSR RAPL PMU. This means that + * userspace programs like turbostat can use the same code to handle RAPL Perf + * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running + * underlying on the platform. + * + * Note that RAPL Packages can be probed/removed dynamically, and the events + * supported by each TPMI RAPL device can be different. Thus the RAPL PMU + * support is done on demand, which means + * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for + * unsupported counters are not exposed. + * 2. PMU is unregistered and registered when a new RAPL Package is probed and + * supports new counters that are not supported by current PMU. + * 3. PMU is unregistered when all registered RAPL Packages don't need PMU. + */ + +struct rapl_pmu { + struct pmu pmu; /* Perf PMU structure */ + u64 timer_ms; /* Maximum expiration time to avoid counter overflow */ + unsigned long domain_map; /* Events supported by current registered PMU */ + bool registered; /* Whether the PMU has been registered or not */ +}; + +static struct rapl_pmu rapl_pmu; + +/* PMU helpers */ + +static int get_pmu_cpu(struct rapl_package *rp) +{ + int cpu; + + if (!rp->has_pmu) + return nr_cpu_ids; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return nr_cpu_ids; + + /* TPMI RAPL uses any CPU in the package for PMU */ + for_each_online_cpu(cpu) + if (topology_physical_package_id(cpu) == rp->id) + return cpu; + + return nr_cpu_ids; +} + +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu) +{ + if (!rp->has_pmu) + return false; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return false; + + /* TPMI RAPL uses any CPU in the package for PMU */ + return topology_physical_package_id(cpu) == rp->id; +} + +static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + + return &rp->pmu_data; +} + +/* PMU event callbacks */ + +static u64 event_read_counter(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + u64 val; + int ret; + + /* Return 0 for unsupported events */ + if (event->hw.idx < 0) + return 0; + + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + + /* Return 0 for failed read */ + if (ret) + return 0; + + return val; +} + +static void __rapl_pmu_event_start(struct perf_event *event) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &data->active_list); + + local64_set(&event->hw.prev_count, event_read_counter(event)); + if (++data->n_active == 1) + hrtimer_start(&data->hrtimer, data->timer_interval, + HRTIMER_MODE_REL_PINNED); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + __rapl_pmu_event_start(event); + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + + /* + * Follow the generic code to drain hwc->prev_count. + * The loop is not expected to run for multiple times. + */ + prev_raw_count = local64_read(&hwc->prev_count); + do { + new_raw_count = event_read_counter(event); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); + + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + */ + delta = new_raw_count - prev_raw_count; + + /* + * Scale delta to smallest unit (2^-32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + sdelta = delta * data->scale[event->hw.flags]; + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + /* Mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(data->n_active <= 0); + if (--data->n_active == 0) + hrtimer_cancel(&data->hrtimer); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* Check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +/* RAPL PMU event ids, same as shown in sysfs */ +enum perf_rapl_events { + PERF_RAPL_PP0 = 1, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + PERF_RAPL_MAX +}; +#define RAPL_EVENT_MASK GENMASK(7, 0) + +static const int event_to_domain[PERF_RAPL_MAX] = { + [PERF_RAPL_PP0] = RAPL_DOMAIN_PP0, + [PERF_RAPL_PKG] = RAPL_DOMAIN_PACKAGE, + [PERF_RAPL_RAM] = RAPL_DOMAIN_DRAM, + [PERF_RAPL_PP1] = RAPL_DOMAIN_PP1, + [PERF_RAPL_PSYS] = RAPL_DOMAIN_PLATFORM, +}; + +static int rapl_pmu_event_init(struct perf_event *event) +{ + struct rapl_package *pos, *rp = NULL; + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int domain, idx; + + /* Only look at RAPL events */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Check for supported events only */ + if (!cfg || cfg >= PERF_RAPL_MAX) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + /* Find out which Package the event belongs to */ + list_for_each_entry(pos, &rapl_packages, plist) { + if (is_rp_pmu_cpu(pos, event->cpu)) { + rp = pos; + break; + } + } + if (!rp) + return -ENODEV; + + /* Find out which RAPL Domain the event belongs to */ + domain = event_to_domain[cfg]; + + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->pmu_private = rp; /* Which package */ + event->hw.flags = domain; /* Which domain */ + + event->hw.idx = -1; + /* Find out the index in rp->domains[] to get domain pointer */ + for (idx = 0; idx < rp->nr_domains; idx++) { + if (rp->domains[idx].id == domain) { + event->hw.idx = idx; + break; + } + } + + return 0; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_package_pmu_data *data = + container_of(hrtimer, struct rapl_package_pmu_data, hrtimer); + struct perf_event *event; + unsigned long flags; + + if (!data->n_active) + return HRTIMER_NORESTART; + + raw_spin_lock_irqsave(&data->lock, flags); + + list_for_each_entry(event, &data->active_list, active_entry) + rapl_event_update(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + hrtimer_forward_now(hrtimer, data->timer_interval); + + return HRTIMER_RESTART; +} + +/* PMU sysfs attributes */ + +/* + * There are no default events, but we need to create "events" group (with + * empty attrs) before updating it with detected events. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = attrs_empty, +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rapl_package *rp; + cpumask_var_t cpu_mask; + int cpu; + int ret; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + cpus_read_lock(); + + cpumask_clear(cpu_mask); + + /* Choose a cpu for each RAPL Package */ + list_for_each_entry(rp, &rapl_packages, plist) { + cpu = get_pmu_cpu(rp); + if (cpu < nr_cpu_ids) + cpumask_set_cpu(cpu, cpu_mask); + } + cpus_read_unlock(); + + ret = cpumap_print_to_pagebuf(true, buf, cpu_mask); + + free_cpumask_var(cpu_mask); + + return ret; +} + +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static struct attribute_group pmu_cpumask_group = { + .attrs = pmu_cpumask_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *pmu_format_attr[] = { + &format_attr_event.attr, + NULL +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = pmu_format_attr, +}; + +static const struct attribute_group *pmu_attr_groups[] = { + &pmu_events_group, + &pmu_cpumask_group, + &pmu_format_group, + NULL +}; + +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .event_str = str, \ +} + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04"); +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); + +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules"); +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules"); + +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10"); + +#define RAPL_EVENT_GROUP(_name, domain) \ +static struct attribute *pmu_attr_##_name[] = { \ + &event_attr_rapl_##_name.attr.attr, \ + &event_attr_rapl_unit_##_name.attr.attr, \ + &event_attr_rapl_scale_##_name.attr.attr, \ + NULL \ +}; \ +static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \ +{ \ + return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \ +} \ +static struct attribute_group pmu_group_##_name = { \ + .name = "events", \ + .attrs = pmu_attr_##_name, \ + .is_visible = is_visible_##_name, \ +} + +RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0); +RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE); +RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM); +RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1); +RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM); + +static const struct attribute_group *pmu_attr_update[] = { + &pmu_group_cores, + &pmu_group_pkg, + &pmu_group_ram, + &pmu_group_gpu, + &pmu_group_psys, + NULL +}; + +static int rapl_pmu_update(struct rapl_package *rp) +{ + int ret = 0; + + /* Return if PMU already covers all events supported by current RAPL Package */ + if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map))) + goto end; + + /* Unregister previous registered PMU */ + if (rapl_pmu.registered) + perf_pmu_unregister(&rapl_pmu.pmu); + + rapl_pmu.registered = false; + rapl_pmu.domain_map |= rp->domain_map; + + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu)); + rapl_pmu.pmu.attr_groups = pmu_attr_groups; + rapl_pmu.pmu.attr_update = pmu_attr_update; + rapl_pmu.pmu.task_ctx_nr = perf_invalid_context; + rapl_pmu.pmu.event_init = rapl_pmu_event_init; + rapl_pmu.pmu.add = rapl_pmu_event_add; + rapl_pmu.pmu.del = rapl_pmu_event_del; + rapl_pmu.pmu.start = rapl_pmu_event_start; + rapl_pmu.pmu.stop = rapl_pmu_event_stop; + rapl_pmu.pmu.read = rapl_pmu_event_read; + rapl_pmu.pmu.module = THIS_MODULE; + rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT; + ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1); + if (ret) { + pr_info("Failed to register PMU\n"); + return ret; + } + + rapl_pmu.registered = true; +end: + rp->has_pmu = true; + return ret; +} + +int rapl_package_add_pmu(struct rapl_package *rp) +{ + struct rapl_package_pmu_data *data = &rp->pmu_data; + int idx; + + if (rp->has_pmu) + return -EEXIST; + + guard(cpus_read_lock)(); + + for (idx = 0; idx < rp->nr_domains; idx++) { + struct rapl_domain *rd = &rp->domains[idx]; + int domain = rd->id; + u64 val; + + if (!test_bit(domain, &rp->domain_map)) + continue; + + /* + * The RAPL PMU granularity is 2^-32 Joules + * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase + */ + val = rd->energy_unit * (1ULL << 32); + do_div(val, ENERGY_UNIT_SCALE * 1000000); + data->scale[domain] = val; + + if (!rapl_pmu.timer_ms) { + struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER); + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. + * + * max_count = rpi->mask >> rpi->shift + 1 + * max_energy_pj = max_count * rd->energy_unit + * max_time_sec = (max_energy_pj / 1000000000) / 200w + * + * rapl_pmu.timer_ms = max_time_sec * 1000 / 2 + */ + val = (rpi->mask >> rpi->shift) + 1; + val *= rd->energy_unit; + do_div(val, 1000000 * 200 * 2); + rapl_pmu.timer_ms = val; + + pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms); + } + + pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]); + } + + /* Initialize per package PMU data */ + raw_spin_lock_init(&data->lock); + INIT_LIST_HEAD(&data->active_list); + data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms); + hrtimer_setup(&data->hrtimer, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + + return rapl_pmu_update(rp); +} +EXPORT_SYMBOL_GPL(rapl_package_add_pmu); + +void rapl_package_remove_pmu(struct rapl_package *rp) +{ + struct rapl_package *pos; + + if (!rp->has_pmu) + return; + + guard(cpus_read_lock)(); + + list_for_each_entry(pos, &rapl_packages, plist) { + /* PMU is still needed */ + if (pos->has_pmu && pos != rp) + return; + } + + perf_pmu_unregister(&rapl_pmu.pmu); + memset(&rapl_pmu, 0, sizeof(struct rapl_pmu)); +} +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu); +#endif + /* called from CPU hotplug notifier, hotplug lock held */ -void rapl_remove_package(struct rapl_package *rp) +void rapl_remove_package_cpuslocked(struct rapl_package *rp) { struct rapl_domain *rd, *rd_package = NULL; @@ -1528,16 +2122,45 @@ void rapl_remove_package(struct rapl_package *rp) list_del(&rp->plist); kfree(rp); } +EXPORT_SYMBOL_GPL(rapl_remove_package_cpuslocked); + +void rapl_remove_package(struct rapl_package *rp) +{ + guard(cpus_read_lock)(); + rapl_remove_package_cpuslocked(rp); +} EXPORT_SYMBOL_GPL(rapl_remove_package); +/* + * RAPL Package energy counter scope: + * 1. AMD/HYGON platforms use per-PKG package energy counter + * 2. For Intel platforms + * 2.1 CLX-AP platform has per-DIE package energy counter + * 2.2 Other platforms that uses MSR RAPL are single die systems so the + * package energy counter can be considered as per-PKG/per-DIE, + * here it is considered as per-DIE. + * 2.3 New platforms that use TPMI RAPL doesn't care about the + * scope because they are not MSR/CPU based. + */ +#define rapl_msrs_are_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + /* caller to ensure CPU hotplug lock is held */ -struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu) +struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv, + bool id_is_cpu) { struct rapl_package *rp; int uid; - if (id_is_cpu) - uid = topology_logical_die_id(id); + if (id_is_cpu) { + uid = rapl_msrs_are_pkg_scope() ? + topology_physical_package_id(id) : topology_logical_die_id(id); + if (uid < 0) { + pr_err("topology_logical_(package/die)_id() returned a negative value"); + return NULL; + } + } else uid = id; @@ -1549,10 +2172,17 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, return NULL; } +EXPORT_SYMBOL_GPL(rapl_find_package_domain_cpuslocked); + +struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu) +{ + guard(cpus_read_lock)(); + return rapl_find_package_domain_cpuslocked(id, priv, id_is_cpu); +} EXPORT_SYMBOL_GPL(rapl_find_package_domain); /* called from CPU hotplug notifier, hotplug lock held */ -struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu) +struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, bool id_is_cpu) { struct rapl_package *rp; int ret; @@ -1562,9 +2192,14 @@ struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id return ERR_PTR(-ENOMEM); if (id_is_cpu) { - rp->id = topology_logical_die_id(id); + rp->id = rapl_msrs_are_pkg_scope() ? + topology_physical_package_id(id) : topology_logical_die_id(id); + if ((int)(rp->id) < 0) { + pr_err("topology_logical_(package/die)_id() returned a negative value"); + return ERR_PTR(-EINVAL); + } rp->lead_cpu = id; - if (topology_max_die_per_package() > 1) + if (!rapl_msrs_are_pkg_scope() && topology_max_dies_per_package() > 1) snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d-die-%d", topology_physical_package_id(id), topology_die_id(id)); else @@ -1598,6 +2233,13 @@ err_free_package: kfree(rp); return ERR_PTR(ret); } +EXPORT_SYMBOL_GPL(rapl_add_package_cpuslocked); + +struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu) +{ + guard(cpus_read_lock)(); + return rapl_add_package_cpuslocked(id, priv, id_is_cpu); +} EXPORT_SYMBOL_GPL(rapl_add_package); static void power_limit_state_save(void) diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 250bd41a588c..8ad2115d65f6 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -24,6 +24,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> /* Local defines */ #define MSR_PLATFORM_POWER_LIMIT 0x0000065C @@ -73,9 +74,9 @@ static int rapl_cpu_online(unsigned int cpu) { struct rapl_package *rp; - rp = rapl_find_package_domain(cpu, rapl_msr_priv, true); + rp = rapl_find_package_domain_cpuslocked(cpu, rapl_msr_priv, true); if (!rp) { - rp = rapl_add_package(cpu, rapl_msr_priv, true); + rp = rapl_add_package_cpuslocked(cpu, rapl_msr_priv, true); if (IS_ERR(rp)) return PTR_ERR(rp); } @@ -88,14 +89,14 @@ static int rapl_cpu_down_prep(unsigned int cpu) struct rapl_package *rp; int lead_cpu; - rp = rapl_find_package_domain(cpu, rapl_msr_priv, true); + rp = rapl_find_package_domain_cpuslocked(cpu, rapl_msr_priv, true); if (!rp) return 0; cpumask_clear_cpu(cpu, &rp->cpumask); lead_cpu = cpumask_first(&rp->cpumask); if (lead_cpu >= nr_cpu_ids) - rapl_remove_package(rp); + rapl_remove_package_cpuslocked(rp); else if (rp->lead_cpu == cpu) rp->lead_cpu = lead_cpu; return 0; @@ -103,7 +104,7 @@ static int rapl_cpu_down_prep(unsigned int cpu) static int rapl_msr_read_raw(int cpu, struct reg_action *ra) { - if (rdmsrl_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { + if (rdmsrq_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); return -EIO; } @@ -116,14 +117,14 @@ static void rapl_msr_update_func(void *info) struct reg_action *ra = info; u64 val; - ra->err = rdmsrl_safe(ra->reg.msr, &val); + ra->err = rdmsrq_safe(ra->reg.msr, &val); if (ra->err) return; val &= ~ra->mask; val |= ra->value; - ra->err = wrmsrl_safe(ra->reg.msr, val); + ra->err = wrmsrq_safe(ra->reg.msr, val); } static int rapl_msr_write_raw(int cpu, struct reg_action *ra) @@ -139,14 +140,16 @@ static int rapl_msr_write_raw(int cpu, struct reg_action *ra) /* List of verified CPUs. */ static const struct x86_cpu_id pl4_support_ids[] = { - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, NULL), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_ALDERLAKE, NULL), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, NULL), + X86_MATCH_VFM(INTEL_METEORLAKE, NULL), + X86_MATCH_VFM(INTEL_METEORLAKE_L, NULL), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), + X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), {} }; @@ -197,11 +200,10 @@ out: return ret; } -static int rapl_msr_remove(struct platform_device *pdev) +static void rapl_msr_remove(struct platform_device *pdev) { cpuhp_remove_state(rapl_msr_priv->pcap_rapl_online); powercap_unregister_control_type(rapl_msr_priv->control_type); - return 0; } static const struct platform_device_id rapl_msr_ids[] = { diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 891c90fefd8b..af2368f4db10 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -15,7 +15,8 @@ #include <linux/module.h> #include <linux/slab.h> -#define TPMI_RAPL_VERSION 1 +#define TPMI_RAPL_MAJOR_VERSION 0 +#define TPMI_RAPL_MINOR_VERSION 1 /* 1 header + 10 registers + 5 reserved. 8 bytes for each. */ #define TPMI_RAPL_DOMAIN_SIZE 128 @@ -40,6 +41,7 @@ enum tpmi_rapl_register { TPMI_RAPL_REG_ENERGY_STATUS, TPMI_RAPL_REG_PERF_STATUS, TPMI_RAPL_REG_POWER_INFO, + TPMI_RAPL_REG_DOMAIN_INFO, TPMI_RAPL_REG_INTERRUPT, TPMI_RAPL_REG_MAX = 15, }; @@ -130,6 +132,12 @@ static void trp_release(struct tpmi_rapl_package *trp) mutex_unlock(&tpmi_rapl_lock); } +/* + * Bit 0 of TPMI_RAPL_REG_DOMAIN_INFO indicates if the current package is a domain + * root or not. Only domain root packages can enumerate System (Psys) Domain. + */ +#define TPMI_RAPL_DOMAIN_ROOT BIT(0) + static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) { u8 tpmi_domain_version; @@ -139,6 +147,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) enum rapl_domain_reg_id reg_id; int tpmi_domain_size, tpmi_domain_flags; u64 tpmi_domain_header = readq(trp->base + offset); + u64 tpmi_domain_info; /* Domain Parent bits are ignored for now */ tpmi_domain_version = tpmi_domain_header & 0xff; @@ -146,11 +155,21 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) tpmi_domain_size = tpmi_domain_header >> 16 & 0xff; tpmi_domain_flags = tpmi_domain_header >> 32 & 0xffff; - if (tpmi_domain_version != TPMI_RAPL_VERSION) { - pr_warn(FW_BUG "Unsupported version:%d\n", tpmi_domain_version); + if (tpmi_domain_version == TPMI_VERSION_INVALID) { + pr_warn(FW_BUG "Invalid version\n"); return -ENODEV; } + if (TPMI_MAJOR_VERSION(tpmi_domain_version) != TPMI_RAPL_MAJOR_VERSION) { + pr_warn(FW_BUG "Unsupported major version:%ld\n", + TPMI_MAJOR_VERSION(tpmi_domain_version)); + return -ENODEV; + } + + if (TPMI_MINOR_VERSION(tpmi_domain_version) > TPMI_RAPL_MINOR_VERSION) + pr_info("Ignore: Unsupported minor version:%ld\n", + TPMI_MINOR_VERSION(tpmi_domain_version)); + /* Domain size: in unit of 128 Bytes */ if (tpmi_domain_size != 1) { pr_warn(FW_BUG "Invalid Domain size %d\n", tpmi_domain_size); @@ -169,6 +188,13 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) domain_type = RAPL_DOMAIN_PACKAGE; break; case TPMI_RAPL_DOMAIN_SYSTEM: + if (!(tpmi_domain_flags & BIT(TPMI_RAPL_REG_DOMAIN_INFO))) { + pr_warn(FW_BUG "System domain must support Domain Info register\n"); + return -ENODEV; + } + tpmi_domain_info = readq(trp->base + offset + TPMI_RAPL_REG_DOMAIN_INFO * 8); + if (!(tpmi_domain_info & TPMI_RAPL_DOMAIN_ROOT)) + return 0; domain_type = RAPL_DOMAIN_PLATFORM; break; case TPMI_RAPL_DOMAIN_MEMORY: @@ -287,6 +313,8 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev, goto err; } + rapl_package_add_pmu(trp->rp); + auxiliary_set_drvdata(auxdev, trp); return 0; @@ -299,6 +327,7 @@ static void intel_rapl_tpmi_remove(struct auxiliary_device *auxdev) { struct tpmi_rapl_package *trp = auxiliary_get_drvdata(auxdev); + rapl_package_remove_pmu(trp->rp); rapl_remove_package(trp->rp); trp_release(trp); } @@ -318,7 +347,7 @@ static struct auxiliary_driver intel_rapl_tpmi_driver = { module_auxiliary_driver(intel_rapl_tpmi_driver) -MODULE_IMPORT_NS(INTEL_TPMI); +MODULE_IMPORT_NS("INTEL_TPMI"); MODULE_DESCRIPTION("Intel RAPL TPMI Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index 52c32dcbf7d8..4112a0097338 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -627,8 +627,7 @@ struct powercap_control_type *powercap_register_control_type( dev_set_name(&control_type->dev, "%s", name); result = device_register(&control_type->dev); if (result) { - if (control_type->allocated) - kfree(control_type); + put_device(&control_type->dev); return ERR_PTR(result); } idr_init(&control_type->idr); |