summaryrefslogtreecommitdiff
path: root/drivers/powercap/intel_rapl_common.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/powercap/intel_rapl_common.c')
-rw-r--r--drivers/powercap/intel_rapl_common.c796
1 files changed, 719 insertions, 77 deletions
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 2feed036c1cd..77d75e1f14a9 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -5,26 +5,29 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/bitmap.h>
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/intel_rapl.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/slab.h>
#include <linux/log2.h>
-#include <linux/bitmap.h>
-#include <linux/delay.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
#include <linux/powercap.h>
-#include <linux/suspend.h>
-#include <linux/intel_rapl.h>
#include <linux/processor.h>
-#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
-#include <asm/iosf_mbi.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
+#include <asm/iosf_mbi.h>
/* bitmasks for RAPL MSRs, used by primitive access functions */
#define ENERGY_STATUS_MASK 0xffffffff
@@ -737,7 +740,7 @@ static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim)
{
struct rapl_primitive_info *rpi = rp->priv->rpi;
- if (prim < 0 || prim > NR_RAPL_PRIMITIVES || !rpi)
+ if (prim < 0 || prim >= NR_RAPL_PRIMITIVES || !rpi)
return NULL;
return &rpi[prim];
@@ -759,6 +762,11 @@ static int rapl_config(struct rapl_package *rp)
default:
return -EINVAL;
}
+
+ /* defaults_msr can be NULL on unsupported platforms */
+ if (!rp->priv->defaults || !rp->priv->rpi)
+ return -ENODEV;
+
return 0;
}
@@ -1214,66 +1222,72 @@ static const struct rapl_defaults rapl_defaults_amd = {
};
static const struct x86_cpu_id rapl_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &rapl_defaults_hsw_server),
-
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &rapl_defaults_hsw_server),
-
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &rapl_defaults_spr_server),
- X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &rapl_defaults_cht),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &rapl_defaults_tng),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &rapl_defaults_ann),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &rapl_defaults_core),
-
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &rapl_defaults_hsw_server),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_HASWELL, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &rapl_defaults_hsw_server),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &rapl_defaults_hsw_server),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &rapl_defaults_ann),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &rapl_defaults_core),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &rapl_defaults_core),
+
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &rapl_defaults_hsw_server),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &rapl_defaults_hsw_server),
X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
+ X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd),
X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
{}
};
@@ -1498,8 +1512,588 @@ static int rapl_detect_domains(struct rapl_package *rp)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Support for RAPL PMU
+ *
+ * Register a PMU if any of the registered RAPL Packages have the requirement
+ * of exposing its energy counters via Perf PMU.
+ *
+ * PMU Name:
+ * power
+ *
+ * Events:
+ * Name Event id RAPL Domain
+ * energy_cores 0x01 RAPL_DOMAIN_PP0
+ * energy_pkg 0x02 RAPL_DOMAIN_PACKAGE
+ * energy_ram 0x03 RAPL_DOMAIN_DRAM
+ * energy_gpu 0x04 RAPL_DOMAIN_PP1
+ * energy_psys 0x05 RAPL_DOMAIN_PLATFORM
+ *
+ * Unit:
+ * Joules
+ *
+ * Scale:
+ * 2.3283064365386962890625e-10
+ * The same RAPL domain in different RAPL Packages may have different
+ * energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as
+ * the fixed unit for all energy counters, and covert each hardware
+ * counter increase to N times of PMU event counter increases.
+ *
+ * This is fully compatible with the current MSR RAPL PMU. This means that
+ * userspace programs like turbostat can use the same code to handle RAPL Perf
+ * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running
+ * underlying on the platform.
+ *
+ * Note that RAPL Packages can be probed/removed dynamically, and the events
+ * supported by each TPMI RAPL device can be different. Thus the RAPL PMU
+ * support is done on demand, which means
+ * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for
+ * unsupported counters are not exposed.
+ * 2. PMU is unregistered and registered when a new RAPL Package is probed and
+ * supports new counters that are not supported by current PMU.
+ * 3. PMU is unregistered when all registered RAPL Packages don't need PMU.
+ */
+
+struct rapl_pmu {
+ struct pmu pmu; /* Perf PMU structure */
+ u64 timer_ms; /* Maximum expiration time to avoid counter overflow */
+ unsigned long domain_map; /* Events supported by current registered PMU */
+ bool registered; /* Whether the PMU has been registered or not */
+};
+
+static struct rapl_pmu rapl_pmu;
+
+/* PMU helpers */
+
+static int get_pmu_cpu(struct rapl_package *rp)
+{
+ int cpu;
+
+ if (!rp->has_pmu)
+ return nr_cpu_ids;
+
+ /* Only TPMI RAPL is supported for now */
+ if (rp->priv->type != RAPL_IF_TPMI)
+ return nr_cpu_ids;
+
+ /* TPMI RAPL uses any CPU in the package for PMU */
+ for_each_online_cpu(cpu)
+ if (topology_physical_package_id(cpu) == rp->id)
+ return cpu;
+
+ return nr_cpu_ids;
+}
+
+static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
+{
+ if (!rp->has_pmu)
+ return false;
+
+ /* Only TPMI RAPL is supported for now */
+ if (rp->priv->type != RAPL_IF_TPMI)
+ return false;
+
+ /* TPMI RAPL uses any CPU in the package for PMU */
+ return topology_physical_package_id(cpu) == rp->id;
+}
+
+static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
+{
+ struct rapl_package *rp = event->pmu_private;
+
+ return &rp->pmu_data;
+}
+
+/* PMU event callbacks */
+
+static u64 event_read_counter(struct perf_event *event)
+{
+ struct rapl_package *rp = event->pmu_private;
+ u64 val;
+ int ret;
+
+ /* Return 0 for unsupported events */
+ if (event->hw.idx < 0)
+ return 0;
+
+ ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val);
+
+ /* Return 0 for failed read */
+ if (ret)
+ return 0;
+
+ return val;
+}
+
+static void __rapl_pmu_event_start(struct perf_event *event)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ list_add_tail(&event->active_entry, &data->active_list);
+
+ local64_set(&event->hw.prev_count, event_read_counter(event));
+ if (++data->n_active == 1)
+ hrtimer_start(&data->hrtimer, data->timer_interval,
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+ __rapl_pmu_event_start(event);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+
+ /*
+ * Follow the generic code to drain hwc->prev_count.
+ * The loop is not expected to run for multiple times.
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ do {
+ new_raw_count = event_read_counter(event);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
+
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ */
+ delta = new_raw_count - prev_raw_count;
+
+ /*
+ * Scale delta to smallest unit (2^-32)
+ * users must then scale back: count * 1/(1e9*2^32) to get Joules
+ * or use ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ sdelta = delta * data->scale[event->hw.flags];
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+
+ /* Mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(data->n_active <= 0);
+ if (--data->n_active == 0)
+ hrtimer_cancel(&data->hrtimer);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* Check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+ struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __rapl_pmu_event_start(event);
+
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+
+ return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+/* RAPL PMU event ids, same as shown in sysfs */
+enum perf_rapl_events {
+ PERF_RAPL_PP0 = 1, /* all cores */
+ PERF_RAPL_PKG, /* entire package */
+ PERF_RAPL_RAM, /* DRAM */
+ PERF_RAPL_PP1, /* gpu */
+ PERF_RAPL_PSYS, /* psys */
+ PERF_RAPL_MAX
+};
+#define RAPL_EVENT_MASK GENMASK(7, 0)
+
+static const int event_to_domain[PERF_RAPL_MAX] = {
+ [PERF_RAPL_PP0] = RAPL_DOMAIN_PP0,
+ [PERF_RAPL_PKG] = RAPL_DOMAIN_PACKAGE,
+ [PERF_RAPL_RAM] = RAPL_DOMAIN_DRAM,
+ [PERF_RAPL_PP1] = RAPL_DOMAIN_PP1,
+ [PERF_RAPL_PSYS] = RAPL_DOMAIN_PLATFORM,
+};
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+ struct rapl_package *pos, *rp = NULL;
+ u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+ int domain, idx;
+
+ /* Only look at RAPL events */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* Check for supported events only */
+ if (!cfg || cfg >= PERF_RAPL_MAX)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* Find out which Package the event belongs to */
+ list_for_each_entry(pos, &rapl_packages, plist) {
+ if (is_rp_pmu_cpu(pos, event->cpu)) {
+ rp = pos;
+ break;
+ }
+ }
+ if (!rp)
+ return -ENODEV;
+
+ /* Find out which RAPL Domain the event belongs to */
+ domain = event_to_domain[cfg];
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+ event->pmu_private = rp; /* Which package */
+ event->hw.flags = domain; /* Which domain */
+
+ event->hw.idx = -1;
+ /* Find out the index in rp->domains[] to get domain pointer */
+ for (idx = 0; idx < rp->nr_domains; idx++) {
+ if (rp->domains[idx].id == domain) {
+ event->hw.idx = idx;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct rapl_package_pmu_data *data =
+ container_of(hrtimer, struct rapl_package_pmu_data, hrtimer);
+ struct perf_event *event;
+ unsigned long flags;
+
+ if (!data->n_active)
+ return HRTIMER_NORESTART;
+
+ raw_spin_lock_irqsave(&data->lock, flags);
+
+ list_for_each_entry(event, &data->active_list, active_entry)
+ rapl_event_update(event);
+
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+
+ hrtimer_forward_now(hrtimer, data->timer_interval);
+
+ return HRTIMER_RESTART;
+}
+
+/* PMU sysfs attributes */
+
+/*
+ * There are no default events, but we need to create "events" group (with
+ * empty attrs) before updating it with detected events.
+ */
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+ .name = "events",
+ .attrs = attrs_empty,
+};
+
+static ssize_t cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rapl_package *rp;
+ cpumask_var_t cpu_mask;
+ int cpu;
+ int ret;
+
+ if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpus_read_lock();
+
+ cpumask_clear(cpu_mask);
+
+ /* Choose a cpu for each RAPL Package */
+ list_for_each_entry(rp, &rapl_packages, plist) {
+ cpu = get_pmu_cpu(rp);
+ if (cpu < nr_cpu_ids)
+ cpumask_set_cpu(cpu, cpu_mask);
+ }
+ cpus_read_unlock();
+
+ ret = cpumap_print_to_pagebuf(true, buf, cpu_mask);
+
+ free_cpumask_var(cpu_mask);
+
+ return ret;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+static struct attribute_group pmu_cpumask_group = {
+ .attrs = pmu_cpumask_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *pmu_format_attr[] = {
+ &format_attr_event.attr,
+ NULL
+};
+
+static struct attribute_group pmu_format_group = {
+ .name = "format",
+ .attrs = pmu_format_attr,
+};
+
+static const struct attribute_group *pmu_attr_groups[] = {
+ &pmu_events_group,
+ &pmu_cpumask_group,
+ &pmu_format_group,
+ NULL
+};
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .event_str = str, \
+}
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04");
+RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules");
+RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules");
+
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10");
+
+#define RAPL_EVENT_GROUP(_name, domain) \
+static struct attribute *pmu_attr_##_name[] = { \
+ &event_attr_rapl_##_name.attr.attr, \
+ &event_attr_rapl_unit_##_name.attr.attr, \
+ &event_attr_rapl_scale_##_name.attr.attr, \
+ NULL \
+}; \
+static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \
+{ \
+ return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \
+} \
+static struct attribute_group pmu_group_##_name = { \
+ .name = "events", \
+ .attrs = pmu_attr_##_name, \
+ .is_visible = is_visible_##_name, \
+}
+
+RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0);
+RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE);
+RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM);
+RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1);
+RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM);
+
+static const struct attribute_group *pmu_attr_update[] = {
+ &pmu_group_cores,
+ &pmu_group_pkg,
+ &pmu_group_ram,
+ &pmu_group_gpu,
+ &pmu_group_psys,
+ NULL
+};
+
+static int rapl_pmu_update(struct rapl_package *rp)
+{
+ int ret = 0;
+
+ /* Return if PMU already covers all events supported by current RAPL Package */
+ if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map)))
+ goto end;
+
+ /* Unregister previous registered PMU */
+ if (rapl_pmu.registered)
+ perf_pmu_unregister(&rapl_pmu.pmu);
+
+ rapl_pmu.registered = false;
+ rapl_pmu.domain_map |= rp->domain_map;
+
+ memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
+ rapl_pmu.pmu.attr_groups = pmu_attr_groups;
+ rapl_pmu.pmu.attr_update = pmu_attr_update;
+ rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
+ rapl_pmu.pmu.event_init = rapl_pmu_event_init;
+ rapl_pmu.pmu.add = rapl_pmu_event_add;
+ rapl_pmu.pmu.del = rapl_pmu_event_del;
+ rapl_pmu.pmu.start = rapl_pmu_event_start;
+ rapl_pmu.pmu.stop = rapl_pmu_event_stop;
+ rapl_pmu.pmu.read = rapl_pmu_event_read;
+ rapl_pmu.pmu.module = THIS_MODULE;
+ rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT;
+ ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
+ if (ret) {
+ pr_info("Failed to register PMU\n");
+ return ret;
+ }
+
+ rapl_pmu.registered = true;
+end:
+ rp->has_pmu = true;
+ return ret;
+}
+
+int rapl_package_add_pmu(struct rapl_package *rp)
+{
+ struct rapl_package_pmu_data *data = &rp->pmu_data;
+ int idx;
+
+ if (rp->has_pmu)
+ return -EEXIST;
+
+ guard(cpus_read_lock)();
+
+ for (idx = 0; idx < rp->nr_domains; idx++) {
+ struct rapl_domain *rd = &rp->domains[idx];
+ int domain = rd->id;
+ u64 val;
+
+ if (!test_bit(domain, &rp->domain_map))
+ continue;
+
+ /*
+ * The RAPL PMU granularity is 2^-32 Joules
+ * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
+ */
+ val = rd->energy_unit * (1ULL << 32);
+ do_div(val, ENERGY_UNIT_SCALE * 1000000);
+ data->scale[domain] = val;
+
+ if (!rapl_pmu.timer_ms) {
+ struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER);
+
+ /*
+ * Calculate the timer rate:
+ * Use reference of 200W for scaling the timeout to avoid counter
+ * overflows.
+ *
+ * max_count = rpi->mask >> rpi->shift + 1
+ * max_energy_pj = max_count * rd->energy_unit
+ * max_time_sec = (max_energy_pj / 1000000000) / 200w
+ *
+ * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
+ */
+ val = (rpi->mask >> rpi->shift) + 1;
+ val *= rd->energy_unit;
+ do_div(val, 1000000 * 200 * 2);
+ rapl_pmu.timer_ms = val;
+
+ pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms);
+ }
+
+ pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]);
+ }
+
+ /* Initialize per package PMU data */
+ raw_spin_lock_init(&data->lock);
+ INIT_LIST_HEAD(&data->active_list);
+ data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms);
+ hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ data->hrtimer.function = rapl_hrtimer_handle;
+
+ return rapl_pmu_update(rp);
+}
+EXPORT_SYMBOL_GPL(rapl_package_add_pmu);
+
+void rapl_package_remove_pmu(struct rapl_package *rp)
+{
+ struct rapl_package *pos;
+
+ if (!rp->has_pmu)
+ return;
+
+ guard(cpus_read_lock)();
+
+ list_for_each_entry(pos, &rapl_packages, plist) {
+ /* PMU is still needed */
+ if (pos->has_pmu && pos != rp)
+ return;
+ }
+
+ perf_pmu_unregister(&rapl_pmu.pmu);
+ memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
+}
+EXPORT_SYMBOL_GPL(rapl_package_remove_pmu);
+#endif
+
/* called from CPU hotplug notifier, hotplug lock held */
-void rapl_remove_package(struct rapl_package *rp)
+void rapl_remove_package_cpuslocked(struct rapl_package *rp)
{
struct rapl_domain *rd, *rd_package = NULL;
@@ -1528,16 +2122,45 @@ void rapl_remove_package(struct rapl_package *rp)
list_del(&rp->plist);
kfree(rp);
}
+EXPORT_SYMBOL_GPL(rapl_remove_package_cpuslocked);
+
+void rapl_remove_package(struct rapl_package *rp)
+{
+ guard(cpus_read_lock)();
+ rapl_remove_package_cpuslocked(rp);
+}
EXPORT_SYMBOL_GPL(rapl_remove_package);
+/*
+ * RAPL Package energy counter scope:
+ * 1. AMD/HYGON platforms use per-PKG package energy counter
+ * 2. For Intel platforms
+ * 2.1 CLX-AP platform has per-DIE package energy counter
+ * 2.2 Other platforms that uses MSR RAPL are single die systems so the
+ * package energy counter can be considered as per-PKG/per-DIE,
+ * here it is considered as per-DIE.
+ * 2.3 New platforms that use TPMI RAPL doesn't care about the
+ * scope because they are not MSR/CPU based.
+ */
+#define rapl_msrs_are_pkg_scope() \
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
/* caller to ensure CPU hotplug lock is held */
-struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
+ bool id_is_cpu)
{
struct rapl_package *rp;
int uid;
- if (id_is_cpu)
- uid = topology_logical_die_id(id);
+ if (id_is_cpu) {
+ uid = rapl_msrs_are_pkg_scope() ?
+ topology_physical_package_id(id) : topology_logical_die_id(id);
+ if (uid < 0) {
+ pr_err("topology_logical_(package/die)_id() returned a negative value");
+ return NULL;
+ }
+ }
else
uid = id;
@@ -1549,10 +2172,17 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv,
return NULL;
}
+EXPORT_SYMBOL_GPL(rapl_find_package_domain_cpuslocked);
+
+struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+{
+ guard(cpus_read_lock)();
+ return rapl_find_package_domain_cpuslocked(id, priv, id_is_cpu);
+}
EXPORT_SYMBOL_GPL(rapl_find_package_domain);
/* called from CPU hotplug notifier, hotplug lock held */
-struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
struct rapl_package *rp;
int ret;
@@ -1562,9 +2192,14 @@ struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id
return ERR_PTR(-ENOMEM);
if (id_is_cpu) {
- rp->id = topology_logical_die_id(id);
+ rp->id = rapl_msrs_are_pkg_scope() ?
+ topology_physical_package_id(id) : topology_logical_die_id(id);
+ if ((int)(rp->id) < 0) {
+ pr_err("topology_logical_(package/die)_id() returned a negative value");
+ return ERR_PTR(-EINVAL);
+ }
rp->lead_cpu = id;
- if (topology_max_die_per_package() > 1)
+ if (!rapl_msrs_are_pkg_scope() && topology_max_dies_per_package() > 1)
snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d-die-%d",
topology_physical_package_id(id), topology_die_id(id));
else
@@ -1598,6 +2233,13 @@ err_free_package:
kfree(rp);
return ERR_PTR(ret);
}
+EXPORT_SYMBOL_GPL(rapl_add_package_cpuslocked);
+
+struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu)
+{
+ guard(cpus_read_lock)();
+ return rapl_add_package_cpuslocked(id, priv, id_is_cpu);
+}
EXPORT_SYMBOL_GPL(rapl_add_package);
static void power_limit_state_save(void)