summaryrefslogtreecommitdiff
path: root/arch/s390/kernel/perf_cpum_sf.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/kernel/perf_cpum_sf.c')
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c911
1 files changed, 457 insertions, 454 deletions
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index bfabeb1889cc..459af23a47a5 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -5,8 +5,7 @@
* Copyright IBM Corp. 2013, 2018
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
-#define KMSG_COMPONENT "cpum_sf"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "cpum_sf: " fmt
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
@@ -14,7 +13,6 @@
#include <linux/percpu.h>
#include <linux/pid.h>
#include <linux/notifier.h>
-#include <linux/export.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
@@ -22,6 +20,23 @@
#include <asm/irq.h>
#include <asm/debug.h>
#include <asm/timex.h>
+#include <linux/io.h>
+
+/* Perf PMU definitions for the sampling facility */
+#define PERF_CPUM_SF_MAX_CTR 2
+#define PERF_EVENT_CPUM_SF 0xB0000UL /* Event: Basic-sampling */
+#define PERF_EVENT_CPUM_SF_DIAG 0xBD000UL /* Event: Combined-sampling */
+#define PERF_CPUM_SF_BASIC_MODE 0x0001 /* Basic-sampling flag */
+#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */
+#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */
+
+#define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config)
+#define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc)
+#define TEAR_REG(hwc) ((hwc)->last_tag)
+#define SAMPL_RATE(hwc) ((hwc)->event_base)
+#define SAMPL_FLAGS(hwc) ((hwc)->config_base)
+#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE)
+#define SAMPL_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE)
/* Minimum number of sample-data-block-tables:
* At least one table is required for the sampling buffer structure.
@@ -42,7 +57,7 @@
#define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8)
static inline int require_table_link(const void *sdbt)
{
- return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
+ return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
}
/* Minimum and maximum sampling buffer sizes:
@@ -99,15 +114,55 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf);
/* Debug feature */
static debug_info_t *sfdbg;
+/* Sampling control helper functions */
+static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi,
+ unsigned long freq)
+{
+ return (USEC_PER_SEC / freq) * qsi->cpu_speed;
+}
+
+static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
+ unsigned long rate)
+{
+ return USEC_PER_SEC * qsi->cpu_speed / rate;
+}
+
+/* Return pointer to trailer entry of an sample data block */
+static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v)
+{
+ void *ret;
+
+ ret = (void *)v;
+ ret += PAGE_SIZE;
+ ret -= sizeof(struct hws_trailer_entry);
+
+ return ret;
+}
+
+/*
+ * Return true if the entry in the sample data block table (sdbt)
+ * is a link to the next sdbt
+ */
+static inline int is_link_entry(unsigned long *s)
+{
+ return *s & 0x1UL ? 1 : 0;
+}
+
+/* Return pointer to the linked sdbt */
+static inline unsigned long *get_next_sdbt(unsigned long *s)
+{
+ return phys_to_virt(*s & ~0x1UL);
+}
+
/*
* sf_disable() - Switch off sampling facility
*/
-static int sf_disable(void)
+static void sf_disable(void)
{
struct hws_lsctl_request_block sreq;
memset(&sreq, 0, sizeof(sreq));
- return lsctl(&sreq);
+ lsctl(&sreq);
}
/*
@@ -123,57 +178,44 @@ static int sf_buffer_available(struct cpu_hw_sf *cpuhw)
*/
static void free_sampling_buffer(struct sf_buffer *sfb)
{
- unsigned long *sdbt, *curr;
-
- if (!sfb->sdbt)
- return;
+ unsigned long *sdbt, *curr, *head;
sdbt = sfb->sdbt;
- curr = sdbt;
-
+ if (!sdbt)
+ return;
+ sfb->sdbt = NULL;
/* Free the SDBT after all SDBs are processed... */
- while (1) {
- if (!*curr || !sdbt)
- break;
-
- /* Process table-link entries */
+ head = sdbt;
+ curr = sdbt;
+ do {
if (is_link_entry(curr)) {
+ /* Process table-link entries */
curr = get_next_sdbt(curr);
- if (sdbt)
- free_page((unsigned long) sdbt);
-
- /* If the origin is reached, sampling buffer is freed */
- if (curr == sfb->sdbt)
- break;
- else
- sdbt = curr;
+ free_page((unsigned long)sdbt);
+ sdbt = curr;
} else {
/* Process SDB pointer */
- if (*curr) {
- free_page(*curr);
- curr++;
- }
+ free_page((unsigned long)phys_to_virt(*curr));
+ curr++;
}
- }
-
- debug_sprintf_event(sfdbg, 5,
- "free_sampling_buffer: freed sdbt=%p\n", sfb->sdbt);
+ } while (curr != head);
memset(sfb, 0, sizeof(*sfb));
}
static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
- unsigned long sdb, *trailer;
+ struct hws_trailer_entry *te;
+ unsigned long sdb;
/* Allocate and initialize sample-data-block */
sdb = get_zeroed_page(gfp_flags);
if (!sdb)
return -ENOMEM;
- trailer = trailer_entry_ptr(sdb);
- *trailer = SDB_TE_ALERT_REQ_MASK;
+ te = trailer_entry_ptr(sdb);
+ te->header.a = 1;
/* Link SDB into the sample-data-block-table */
- *sdbt = sdb;
+ *sdbt = virt_to_phys((void *)sdb);
return 0;
}
@@ -193,7 +235,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
unsigned long num_sdb, gfp_t gfp_flags)
{
int i, rc;
- unsigned long *new, *tail;
+ unsigned long *new, *tail, *tail_prev = NULL;
if (!sfb->sdbt || !sfb->tail)
return -EINVAL;
@@ -212,10 +254,9 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
* the sampling buffer origin.
*/
if (sfb->sdbt != get_next_sdbt(tail)) {
- debug_sprintf_event(sfdbg, 3, "realloc_sampling_buffer: "
- "sampling buffer is not linked: origin=%p"
- "tail=%p\n",
- (void *) sfb->sdbt, (void *) tail);
+ debug_sprintf_event(sfdbg, 3, "%s buffer not linked origin %#lx tail %#lx\n",
+ __func__, (unsigned long)sfb->sdbt,
+ (unsigned long)tail);
return -EINVAL;
}
@@ -224,14 +265,15 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
for (i = 0; i < num_sdb; i++) {
/* Allocate a new SDB-table if it is full. */
if (require_table_link(tail)) {
- new = (unsigned long *) get_zeroed_page(gfp_flags);
+ new = (unsigned long *)get_zeroed_page(gfp_flags);
if (!new) {
rc = -ENOMEM;
break;
}
sfb->num_sdbt++;
/* Link current page to tail of chain */
- *tail = (unsigned long)(void *) new + 1;
+ *tail = virt_to_phys((void *)new) + 1;
+ tail_prev = tail;
tail = new;
}
@@ -241,19 +283,28 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
* issue, a new realloc call (if required) might succeed.
*/
rc = alloc_sample_data_block(tail, gfp_flags);
- if (rc)
+ if (rc) {
+ /* Undo last SDBT. An SDBT with no SDB at its first
+ * entry but with an SDBT entry instead can not be
+ * handled by the interrupt handler code.
+ * Avoid this situation.
+ */
+ if (tail_prev) {
+ sfb->num_sdbt--;
+ free_page((unsigned long)new);
+ tail = tail_prev;
+ }
break;
+ }
sfb->num_sdb++;
tail++;
+ tail_prev = new = NULL; /* Allocated at least one SBD */
}
/* Link sampling buffer to its origin */
- *tail = (unsigned long) sfb->sdbt + 1;
+ *tail = virt_to_phys(sfb->sdbt) + 1;
sfb->tail = tail;
- debug_sprintf_event(sfdbg, 4, "realloc_sampling_buffer: new buffer"
- " settings: sdbt=%lu sdb=%lu\n",
- sfb->num_sdbt, sfb->num_sdb);
return rc;
}
@@ -276,7 +327,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
return -EINVAL;
/* Allocate the sample-data-block-table origin */
- sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!sfb->sdbt)
return -ENOMEM;
sfb->num_sdb = 0;
@@ -286,18 +337,12 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
* realloc_sampling_buffer() invocation.
*/
sfb->tail = sfb->sdbt;
- *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1;
+ *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1;
/* Allocate requested number of sample-data-blocks */
rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL);
- if (rc) {
+ if (rc)
free_sampling_buffer(sfb);
- debug_sprintf_event(sfdbg, 4, "alloc_sampling_buffer: "
- "realloc_sampling_buffer failed with rc=%i\n", rc);
- } else
- debug_sprintf_event(sfdbg, 4,
- "alloc_sampling_buffer: tear=%p dear=%p\n",
- sfb->sdbt, (void *) *sfb->sdbt);
return rc;
}
@@ -309,8 +354,8 @@ static void sfb_set_limits(unsigned long min, unsigned long max)
CPUM_SF_MAX_SDB = max;
memset(&si, 0, sizeof(si));
- if (!qsi(&si))
- CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes);
+ qsi(&si);
+ CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes);
}
static unsigned long sfb_max_limit(struct hw_perf_event *hwc)
@@ -329,12 +374,6 @@ static unsigned long sfb_pending_allocs(struct sf_buffer *sfb,
return 0;
}
-static int sfb_has_pending_allocs(struct sf_buffer *sfb,
- struct hw_perf_event *hwc)
-{
- return sfb_pending_allocs(sfb, hwc) > 0;
-}
-
static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc)
{
/* Limit the number of SDBs to not exceed the maximum */
@@ -351,45 +390,45 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc)
static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
{
- if (cpuhw->sfb.sdbt)
+ if (sf_buffer_available(cpuhw))
free_sampling_buffer(&cpuhw->sfb);
}
static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
{
- unsigned long n_sdb, freq, factor;
- size_t sample_size;
+ unsigned long n_sdb, freq;
/* Calculate sampling buffers using 4K pages
*
- * 1. Determine the sample data size which depends on the used
- * sampling functions, for example, basic-sampling or
- * basic-sampling with diagnostic-sampling.
+ * 1. The sampling size is 32 bytes for basic sampling. This size
+ * is the same for all machine types. Diagnostic
+ * sampling uses auxlilary data buffer setup which provides the
+ * memory for SDBs using linux common code auxiliary trace
+ * setup.
*
- * 2. Use the sampling frequency as input. The sampling buffer is
- * designed for almost one second. This can be adjusted through
- * the "factor" variable.
- * In any case, alloc_sampling_buffer() sets the Alert Request
+ * 2. Function alloc_sampling_buffer() sets the Alert Request
* Control indicator to trigger a measurement-alert to harvest
- * sample-data-blocks (sdb).
+ * sample-data-blocks (SDB). This is done per SDB. This
+ * measurement alert interrupt fires quick enough to handle
+ * one SDB, on very high frequency and work loads there might
+ * be 2 to 3 SBDs available for sample processing.
+ * Currently there is no need for setup alert request on every
+ * n-th page. This is counterproductive as one IRQ triggers
+ * a very high number of samples to be processed at one IRQ.
*
- * 3. Compute the number of sample-data-blocks and ensure a minimum
- * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not
- * exceed a "calculated" maximum. The symbolic maximum is
- * designed for basic-sampling only and needs to be increased if
- * diagnostic-sampling is active.
- * See also the remarks for these symbolic constants.
+ * 3. Use the sampling frequency as input.
+ * Compute the number of SDBs and ensure a minimum
+ * of CPUM_SF_MIN_SDB. Depending on frequency add some more
+ * SDBs to handle a higher sampling rate.
+ * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples
+ * (one SDB) for every 10000 HZ frequency increment.
*
* 4. Compute the number of sample-data-block-tables (SDBT) and
* ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
* to 511 SDBs).
*/
- sample_size = sizeof(struct hws_basic_entry);
freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
- factor = 1;
- n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size));
- if (n_sdb < CPUM_SF_MIN_SDB)
- n_sdb = CPUM_SF_MIN_SDB;
+ n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000);
/* If there is already a sampling buffer allocated, it is very likely
* that the sampling facility is enabled too. If the event to be
@@ -403,12 +442,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
if (sf_buffer_available(cpuhw))
return 0;
- debug_sprintf_event(sfdbg, 3,
- "allocate_buffers: rate=%lu f=%lu sdb=%lu/%lu"
- " sample_size=%lu cpuhw=%p\n",
- SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc),
- sample_size, cpuhw);
-
return alloc_sampling_buffer(&cpuhw->sfb,
sfb_pending_allocs(&cpuhw->sfb, hwc));
}
@@ -465,8 +498,6 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw,
if (num)
sfb_account_allocs(num, hwc);
- debug_sprintf_event(sfdbg, 5, "sfb: overflow: overflow=%llu ratio=%lu"
- " num=%lu\n", OVERFLOW_REG(hwc), ratio, num);
OVERFLOW_REG(hwc) = 0;
}
@@ -484,13 +515,11 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw,
static void extend_sampling_buffer(struct sf_buffer *sfb,
struct hw_perf_event *hwc)
{
- unsigned long num, num_old;
- int rc;
+ unsigned long num;
num = sfb_pending_allocs(sfb, hwc);
if (!num)
return;
- num_old = sfb->num_sdb;
/* Disable the sampling facility to reset any states and also
* clear pending measurement alerts.
@@ -502,61 +531,32 @@ static void extend_sampling_buffer(struct sf_buffer *sfb,
* called by perf. Because this is a reallocation, it is fine if the
* new SDB-request cannot be satisfied immediately.
*/
- rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC);
- if (rc)
- debug_sprintf_event(sfdbg, 5, "sfb: extend: realloc "
- "failed with rc=%i\n", rc);
-
- if (sfb_has_pending_allocs(sfb, hwc))
- debug_sprintf_event(sfdbg, 5, "sfb: extend: "
- "req=%lu alloc=%lu remaining=%lu\n",
- num, sfb->num_sdb - num_old,
- sfb_pending_allocs(sfb, hwc));
+ realloc_sampling_buffer(sfb, num, GFP_ATOMIC);
}
-
/* Number of perf events counting hardware events */
-static atomic_t num_events;
+static refcount_t num_events;
/* Used to avoid races in calling reserve/release_cpumf_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);
#define PMC_INIT 0
#define PMC_RELEASE 1
-#define PMC_FAILURE 2
static void setup_pmc_cpu(void *flags)
{
- int err;
- struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf);
+ struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
- err = 0;
- switch (*((int *) flags)) {
+ sf_disable();
+ switch (*((int *)flags)) {
case PMC_INIT:
- memset(cpusf, 0, sizeof(*cpusf));
- err = qsi(&cpusf->qsi);
- if (err)
- break;
- cpusf->flags |= PMU_F_RESERVED;
- err = sf_disable();
- if (err)
- pr_err("Switching off the sampling facility failed "
- "with rc=%i\n", err);
- debug_sprintf_event(sfdbg, 5,
- "setup_pmc_cpu: initialized: cpuhw=%p\n", cpusf);
+ memset(cpuhw, 0, sizeof(*cpuhw));
+ qsi(&cpuhw->qsi);
+ cpuhw->flags |= PMU_F_RESERVED;
break;
case PMC_RELEASE:
- cpusf->flags &= ~PMU_F_RESERVED;
- err = sf_disable();
- if (err) {
- pr_err("Switching off the sampling facility failed "
- "with rc=%i\n", err);
- } else
- deallocate_buffers(cpusf);
- debug_sprintf_event(sfdbg, 5,
- "setup_pmc_cpu: released: cpuhw=%p\n", cpusf);
+ cpuhw->flags &= ~PMU_F_RESERVED;
+ deallocate_buffers(cpuhw);
break;
}
- if (err)
- *((int *) flags) |= PMC_FAILURE;
}
static void release_pmc_hardware(void)
@@ -567,27 +567,19 @@ static void release_pmc_hardware(void)
on_each_cpu(setup_pmc_cpu, &flags, 1);
}
-static int reserve_pmc_hardware(void)
+static void reserve_pmc_hardware(void)
{
int flags = PMC_INIT;
on_each_cpu(setup_pmc_cpu, &flags, 1);
- if (flags & PMC_FAILURE) {
- release_pmc_hardware();
- return -ENODEV;
- }
irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
- return 0;
}
static void hw_perf_event_destroy(struct perf_event *event)
{
/* Release PMC if this is the last perf event */
- if (!atomic_add_unless(&num_events, -1, 1)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_dec_return(&num_events) == 0)
- release_pmc_hardware();
+ if (refcount_dec_and_mutex_lock(&num_events, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -599,13 +591,6 @@ static void hw_init_period(struct hw_perf_event *hwc, u64 period)
local64_set(&hwc->period_left, hwc->sample_period);
}
-static void hw_reset_registers(struct hw_perf_event *hwc,
- unsigned long *sdbt_origin)
-{
- /* (Re)set to first sample-data-block-table */
- TEAR_REG(hwc) = (unsigned long) sdbt_origin;
-}
-
static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si,
unsigned long rate)
{
@@ -660,8 +645,9 @@ static void cpumsf_output_event_pid(struct perf_event *event,
/* Protect callchain buffers, tasks */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
+ if (perf_output_begin(&handle, data, event, header.size))
goto out;
/* Update the process ID (see also kernel/events/core.c) */
@@ -674,30 +660,92 @@ out:
rcu_read_unlock();
}
+static unsigned long getrate(bool freq, unsigned long sample,
+ struct hws_qsi_info_block *si)
+{
+ unsigned long rate;
+
+ if (freq) {
+ rate = freq_to_sample_rate(si, sample);
+ rate = hw_limit_rate(si, rate);
+ } else {
+ /* The min/max sampling rates specifies the valid range
+ * of sample periods. If the specified sample period is
+ * out of range, limit the period to the range boundary.
+ */
+ rate = hw_limit_rate(si, sample);
+
+ /* The perf core maintains a maximum sample rate that is
+ * configurable through the sysctl interface. Ensure the
+ * sampling rate does not exceed this value. This also helps
+ * to avoid throttling when pushing samples with
+ * perf_event_overflow().
+ */
+ if (sample_rate_to_freq(si, rate) >
+ sysctl_perf_event_sample_rate) {
+ rate = 0;
+ }
+ }
+ return rate;
+}
+
+/* The sampling information (si) contains information about the
+ * min/max sampling intervals and the CPU speed. So calculate the
+ * correct sampling interval and avoid the whole period adjust
+ * feedback loop.
+ *
+ * Since the CPU Measurement sampling facility can not handle frequency
+ * calculate the sampling interval when frequency is specified using
+ * this formula:
+ * interval := cpu_speed * 1000000 / sample_freq
+ *
+ * Returns errno on bad input and zero on success with parameter interval
+ * set to the correct sampling rate.
+ *
+ * Note: This function turns off freq bit to avoid calling function
+ * perf_adjust_period(). This causes frequency adjustment in the common
+ * code part which causes tremendous variations in the counter values.
+ */
+static int __hw_perf_event_init_rate(struct perf_event *event,
+ struct hws_qsi_info_block *si)
+{
+ struct perf_event_attr *attr = &event->attr;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long rate;
+
+ if (attr->freq) {
+ if (!attr->sample_freq)
+ return -EINVAL;
+ rate = getrate(attr->freq, attr->sample_freq, si);
+ attr->freq = 0; /* Don't call perf_adjust_period() */
+ SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE;
+ } else {
+ rate = getrate(attr->freq, attr->sample_period, si);
+ if (!rate)
+ return -EINVAL;
+ }
+ attr->sample_period = rate;
+ SAMPL_RATE(hwc) = rate;
+ hw_init_period(hwc, SAMPL_RATE(hwc));
+ return 0;
+}
+
static int __hw_perf_event_init(struct perf_event *event)
{
struct cpu_hw_sf *cpuhw;
struct hws_qsi_info_block si;
struct perf_event_attr *attr = &event->attr;
struct hw_perf_event *hwc = &event->hw;
- unsigned long rate;
- int cpu, err;
+ int cpu, err = 0;
/* Reserve CPU-measurement sampling facility */
- err = 0;
- if (!atomic_inc_not_zero(&num_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&num_events) == 0 && reserve_pmc_hardware())
- err = -EBUSY;
- else
- atomic_inc(&num_events);
- mutex_unlock(&pmc_reserve_mutex);
+ mutex_lock(&pmc_reserve_mutex);
+ if (!refcount_inc_not_zero(&num_events)) {
+ reserve_pmc_hardware();
+ refcount_set(&num_events, 1);
}
event->destroy = hw_perf_event_destroy;
- if (err)
- goto out;
-
/* Access per-CPU sampling information (query sampling info) */
/*
* The event->cpu value can be -1 to count on every CPU, for example,
@@ -709,9 +757,9 @@ static int __hw_perf_event_init(struct perf_event *event)
*/
memset(&si, 0, sizeof(si));
cpuhw = NULL;
- if (event->cpu == -1)
+ if (event->cpu == -1) {
qsi(&si);
- else {
+ } else {
/* Event is pinned to a particular CPU, retrieve the per-CPU
* sampling structure for accessing the CPU-specific QSI.
*/
@@ -728,6 +776,12 @@ static int __hw_perf_event_init(struct perf_event *event)
goto out;
}
+ if (si.ribm & CPU_MF_SF_RIBM_NOTAV) {
+ pr_warn("CPU Measurement Facility sampling is temporarily not available\n");
+ err = -EBUSY;
+ goto out;
+ }
+
/* Always enable basic sampling */
SAMPL_FLAGS(hwc) = PERF_CPUM_SF_BASIC_MODE;
@@ -742,55 +796,13 @@ static int __hw_perf_event_init(struct perf_event *event)
SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE;
}
- /* Check and set other sampling flags */
- if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS)
- SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS;
-
- /* The sampling information (si) contains information about the
- * min/max sampling intervals and the CPU speed. So calculate the
- * correct sampling interval and avoid the whole period adjust
- * feedback loop.
- */
- rate = 0;
- if (attr->freq) {
- if (!attr->sample_freq) {
- err = -EINVAL;
- goto out;
- }
- rate = freq_to_sample_rate(&si, attr->sample_freq);
- rate = hw_limit_rate(&si, rate);
- attr->freq = 0;
- attr->sample_period = rate;
- } else {
- /* The min/max sampling rates specifies the valid range
- * of sample periods. If the specified sample period is
- * out of range, limit the period to the range boundary.
- */
- rate = hw_limit_rate(&si, hwc->sample_period);
-
- /* The perf core maintains a maximum sample rate that is
- * configurable through the sysctl interface. Ensure the
- * sampling rate does not exceed this value. This also helps
- * to avoid throttling when pushing samples with
- * perf_event_overflow().
- */
- if (sample_rate_to_freq(&si, rate) >
- sysctl_perf_event_sample_rate) {
- err = -EINVAL;
- debug_sprintf_event(sfdbg, 1, "Sampling rate exceeds maximum perf sample rate\n");
- goto out;
- }
- }
- SAMPL_RATE(hwc) = rate;
- hw_init_period(hwc, SAMPL_RATE(hwc));
-
- /* Initialize sample data overflow accounting */
- hwc->extra_reg.reg = REG_OVERFLOW;
- OVERFLOW_REG(hwc) = 0;
+ err = __hw_perf_event_init_rate(event, &si);
+ if (err)
+ goto out;
/* Use AUX buffer. No need to allocate it by ourself */
if (attr->config == PERF_EVENT_CPUM_SF_DIAG)
- return 0;
+ goto out;
/* Allocate the per-CPU sampling buffer using the CPU information
* from the event. If the event is not pinned to a particular
@@ -820,15 +832,25 @@ static int __hw_perf_event_init(struct perf_event *event)
if (is_default_overflow_handler(event))
event->overflow_handler = cpumsf_output_event_pid;
out:
+ mutex_unlock(&pmc_reserve_mutex);
return err;
}
+static bool is_callchain_event(struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER |
+ PERF_SAMPLE_STACK_USER);
+}
+
static int cpumsf_pmu_event_init(struct perf_event *event)
{
int err;
/* No support for taken branch sampling */
- if (has_branch_stack(event))
+ /* No support for callchain, stacks and registers */
+ if (has_branch_stack(event) || is_callchain_event(event))
return -EOPNOTSUPP;
switch (event->attr.type) {
@@ -852,10 +874,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event)
return -ENOENT;
}
- /* Check online status of the CPU to which the event is pinned */
- if (event->cpu >= 0 && !cpu_online(event->cpu))
- return -ENODEV;
-
/* Force reset of idle/hv excludes regardless of what the
* user requested.
*/
@@ -865,9 +883,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event)
event->attr.exclude_idle = 0;
err = __hw_perf_event_init(event);
- if (unlikely(err))
- if (event->destroy)
- event->destroy(event);
return err;
}
@@ -877,10 +892,14 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
struct hw_perf_event *hwc;
int err;
- if (cpuhw->flags & PMU_F_ENABLED)
- return;
-
- if (cpuhw->flags & PMU_F_ERR_MASK)
+ /*
+ * Event must be
+ * - added/started on this CPU (PMU_F_IN_USE set)
+ * - and CPU must be available (PMU_F_RESERVED set)
+ * - and not already enabled (PMU_F_ENABLED not set)
+ * - and not in error condition (PMU_F_ERR_MASK not set)
+ */
+ if (cpuhw->flags != (PMU_F_IN_USE | PMU_F_RESERVED))
return;
/* Check whether to extent the sampling buffer.
@@ -894,38 +913,27 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
* facility, but it can be fully re-enabled using sampling controls that
* have been saved in cpumsf_pmu_disable().
*/
- if (cpuhw->event) {
- hwc = &cpuhw->event->hw;
- if (!(SAMPL_DIAG_MODE(hwc))) {
- /*
- * Account number of overflow-designated
- * buffer extents
- */
- sfb_account_overflows(cpuhw, hwc);
- if (sfb_has_pending_allocs(&cpuhw->sfb, hwc))
- extend_sampling_buffer(&cpuhw->sfb, hwc);
- }
+ hwc = &cpuhw->event->hw;
+ if (!(SAMPL_DIAG_MODE(hwc))) {
+ /*
+ * Account number of overflow-designated buffer extents
+ */
+ sfb_account_overflows(cpuhw, hwc);
+ extend_sampling_buffer(&cpuhw->sfb, hwc);
}
+ /* Rate may be adjusted with ioctl() */
+ cpuhw->lsctl.interval = SAMPL_RATE(hwc);
/* (Re)enable the PMU and sampling facility */
- cpuhw->flags |= PMU_F_ENABLED;
- barrier();
-
err = lsctl(&cpuhw->lsctl);
if (err) {
- cpuhw->flags &= ~PMU_F_ENABLED;
- pr_err("Loading sampling controls failed: op=%i err=%i\n",
- 1, err);
+ pr_err("Loading sampling controls failed: op 1 err %i\n", err);
return;
}
/* Load current program parameter */
- lpp(&S390_lowcore.lpp);
-
- debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i "
- "tear=%p dear=%p\n", cpuhw->lsctl.es, cpuhw->lsctl.cs,
- cpuhw->lsctl.ed, cpuhw->lsctl.cd,
- (void *) cpuhw->lsctl.tear, (void *) cpuhw->lsctl.dear);
+ lpp(&get_lowcore()->lpp);
+ cpuhw->flags |= PMU_F_ENABLED;
}
static void cpumsf_pmu_disable(struct pmu *pmu)
@@ -948,30 +956,27 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
err = lsctl(&inactive);
if (err) {
- pr_err("Loading sampling controls failed: op=%i err=%i\n",
- 2, err);
+ pr_err("Loading sampling controls failed: op 2 err %i\n", err);
return;
}
- /* Save state of TEAR and DEAR register contents */
- if (!qsi(&si)) {
- /* TEAR/DEAR values are valid only if the sampling facility is
- * enabled. Note that cpumsf_pmu_disable() might be called even
- * for a disabled sampling facility because cpumsf_pmu_enable()
- * controls the enable/disable state.
- */
- if (si.es) {
- cpuhw->lsctl.tear = si.tear;
- cpuhw->lsctl.dear = si.dear;
- }
- } else
- debug_sprintf_event(sfdbg, 3, "cpumsf_pmu_disable: "
- "qsi() failed with err=%i\n", err);
+ /*
+ * Save state of TEAR and DEAR register contents.
+ * TEAR/DEAR values are valid only if the sampling facility is
+ * enabled. Note that cpumsf_pmu_disable() might be called even
+ * for a disabled sampling facility because cpumsf_pmu_enable()
+ * controls the enable/disable state.
+ */
+ qsi(&si);
+ if (si.es) {
+ cpuhw->lsctl.tear = si.tear;
+ cpuhw->lsctl.dear = si.dear;
+ }
cpuhw->flags &= ~PMU_F_ENABLED;
}
-/* perf_exclude_event() - Filter event
+/* perf_event_exclude() - Filter event
* @event: The perf event
* @regs: pt_regs structure
* @sde_regs: Sample-data-entry (sde) regs structure
@@ -980,7 +985,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
*
* Return non-zero if the event shall be excluded.
*/
-static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
+static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
struct perf_sf_sde_regs *sde_regs)
{
if (event->attr.exclude_user && user_mode(regs))
@@ -1063,12 +1068,9 @@ static int perf_push_sample(struct perf_event *event,
data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
overflow = 0;
- if (perf_exclude_event(event, &regs, sde_regs))
+ if (perf_event_exclude(event, &regs, sde_regs))
goto out;
- if (perf_event_overflow(event, &data, &regs)) {
- overflow = 1;
- event->pmu->stop(event, 0);
- }
+ overflow = perf_event_overflow(event, &data, &regs);
perf_event_update_userpage(event);
out:
return overflow;
@@ -1079,14 +1081,6 @@ static void perf_event_count_update(struct perf_event *event, u64 count)
local64_add(count, &event->count);
}
-static void debug_sample_entry(struct hws_basic_entry *sample,
- struct hws_trailer_entry *te)
-{
- debug_sprintf_event(sfdbg, 4, "hw_collect_samples: Found unknown "
- "sampling data entry: te->f=%i basic.def=%04x (%p)\n",
- te->f, sample->def, sample);
-}
-
/* hw_collect_samples() - Walk through a sample-data-block and collect samples
* @event: The perf event
* @sdbt: Sample-data-block table
@@ -1098,7 +1092,7 @@ static void debug_sample_entry(struct hws_basic_entry *sample,
* combined-sampling data entry consists of a basic- and a diagnostic-sampling
* data entry. The sampling function is determined by the flags in the perf
* event hardware structure. The function always works with a combined-sampling
- * data entry but ignores the the diagnostic portion if it is not available.
+ * data entry but ignores the diagnostic portion if it is not available.
*
* Note that the implementation focuses on basic-sampling data entries and, if
* such an entry is not valid, the entire combined-sampling data entry is
@@ -1113,11 +1107,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
struct hws_trailer_entry *te;
struct hws_basic_entry *sample;
- te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
- sample = (struct hws_basic_entry *) *sdbt;
- while ((unsigned long *) sample < (unsigned long *) te) {
+ te = trailer_entry_ptr((unsigned long)sdbt);
+ sample = (struct hws_basic_entry *)sdbt;
+ while ((unsigned long *)sample < (unsigned long *)te) {
/* Check for an empty sample */
- if (!sample->def)
+ if (!sample->def || sample->LS)
break;
/* Update perf event period */
@@ -1140,7 +1134,6 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
/* Count discarded samples */
*overflow += 1;
} else {
- debug_sample_entry(sample, te);
/* Sample slot is not yet written or other record.
*
* This condition can occur if the buffer was reused
@@ -1151,7 +1144,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* that are not full. Stop processing if the first
* invalid format was detected.
*/
- if (!te->f)
+ if (!te->header.f)
break;
}
@@ -1169,71 +1162,62 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* The sampling buffer position are retrieved and saved in the TEAR_REG
* register of the specified perf event.
*
- * Only full sample-data-blocks are processed. Specify the flash_all flag
- * to also walk through partially filled sample-data-blocks. It is ignored
- * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag
- * enforces the processing of full sample-data-blocks only (trailer entries
- * with the block-full-indicator bit set).
+ * Only full sample-data-blocks are processed. Specify the flush_all flag
+ * to also walk through partially filled sample-data-blocks.
*/
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
+ unsigned long long event_overflow, sampl_overflow, num_sdb;
struct hw_perf_event *hwc = &event->hw;
+ union hws_trailer_header prev, new;
struct hws_trailer_entry *te;
- unsigned long *sdbt;
- unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
+ unsigned long *sdbt, sdb;
int done;
/*
* AUX buffer is used when in diagnostic sampling mode.
* No perf events/samples are created.
*/
- if (SAMPL_DIAG_MODE(&event->hw))
+ if (SAMPL_DIAG_MODE(hwc))
return;
- if (flush_all && SDB_FULL_BLOCKS(hwc))
- flush_all = 0;
-
- sdbt = (unsigned long *) TEAR_REG(hwc);
+ sdbt = (unsigned long *)TEAR_REG(hwc);
done = event_overflow = sampl_overflow = num_sdb = 0;
while (!done) {
/* Get the trailer entry of the sample-data-block */
- te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
+ sdb = (unsigned long)phys_to_virt(*sdbt);
+ te = trailer_entry_ptr(sdb);
/* Leave loop if no more work to do (block full indicator) */
- if (!te->f) {
+ if (!te->header.f) {
done = 1;
if (!flush_all)
break;
}
/* Check the sample overflow count */
- if (te->overflow)
+ if (te->header.overflow)
/* Account sample overflows and, if a particular limit
* is reached, extend the sampling buffer.
* For details, see sfb_account_overflows().
*/
- sampl_overflow += te->overflow;
-
- /* Timestamps are valid for full sample-data-blocks only */
- debug_sprintf_event(sfdbg, 6, "hw_perf_event_update: sdbt=%p "
- "overflow=%llu timestamp=0x%llx\n",
- sdbt, te->overflow,
- (te->f) ? trailer_timestamp(te) : 0ULL);
+ sampl_overflow += te->header.overflow;
/* Collect all samples from a single sample-data-block and
* flag if an (perf) event overflow happened. If so, the PMU
* is stopped and remaining samples will be discarded.
*/
- hw_collect_samples(event, sdbt, &event_overflow);
+ hw_collect_samples(event, (unsigned long *)sdb, &event_overflow);
num_sdb++;
/* Reset trailer (using compare-double-and-swap) */
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te_flags |= SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- te->flags, te->overflow,
- te_flags, 0ULL));
+ new.val = prev.val;
+ new.f = 0;
+ new.a = 1;
+ new.overflow = 0;
+ } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
/* Advance to next sample-data-block */
sdbt++;
@@ -1241,35 +1225,52 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
sdbt = get_next_sdbt(sdbt);
/* Update event hardware registers */
- TEAR_REG(hwc) = (unsigned long) sdbt;
+ TEAR_REG(hwc) = (unsigned long)sdbt;
/* Stop processing sample-data if all samples of the current
* sample-data-block were flushed even if it was not full.
*/
if (flush_all && done)
break;
-
- /* If an event overflow happened, discard samples by
- * processing any remaining sample-data-blocks.
- */
- if (event_overflow)
- flush_all = 1;
}
/* Account sample overflows in the event hardware structure */
if (sampl_overflow)
OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) +
sampl_overflow, 1 + num_sdb);
- if (sampl_overflow || event_overflow)
- debug_sprintf_event(sfdbg, 4, "hw_perf_event_update: "
- "overflow stats: sample=%llu event=%llu\n",
- sampl_overflow, event_overflow);
+
+ /* Perf_event_overflow() and perf_event_account_interrupt() limit
+ * the interrupt rate to an upper limit. Roughly 1000 samples per
+ * task tick.
+ * Hitting this limit results in a large number
+ * of throttled REF_REPORT_THROTTLE entries and the samples
+ * are dropped.
+ * Slightly increase the interval to avoid hitting this limit.
+ */
+ if (event_overflow)
+ SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10);
+}
+
+static inline unsigned long aux_sdb_index(struct aux_buffer *aux,
+ unsigned long i)
+{
+ return i % aux->sfb.num_sdb;
}
-#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb)
-#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0)
-#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark)
-#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark)
+static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end)
+{
+ return end >= start ? end - start + 1 : 0;
+}
+
+static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux)
+{
+ return aux_sdb_num(aux->head, aux->alert_mark);
+}
+
+static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux)
+{
+ return aux_sdb_num(aux->head, aux->empty_mark);
+}
/*
* Get trailer entry by index of SDB.
@@ -1279,9 +1280,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux,
{
unsigned long sdb;
- index = AUX_SDB_INDEX(aux, index);
+ index = aux_sdb_index(aux, index);
sdb = aux->sdb_index[index];
- return (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ return trailer_entry_ptr(sdb);
}
/*
@@ -1303,10 +1304,10 @@ static void aux_output_end(struct perf_output_handle *handle)
if (!aux)
return;
- range_scan = AUX_SDB_NUM_ALERT(aux);
+ range_scan = aux_sdb_num_alert(aux);
for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- if (!(te->flags & SDB_TE_BUFFER_FULL_MASK))
+ if (!te->header.f)
break;
}
/* i is num of SDBs which are full */
@@ -1314,9 +1315,7 @@ static void aux_output_end(struct perf_output_handle *handle)
/* Remove alert indicators in the buffer */
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags &= ~SDB_TE_ALERT_REQ_MASK;
-
- debug_sprintf_event(sfdbg, 6, "aux_output_end: collect %lx SDBs\n", i);
+ te->header.a = 0;
}
/*
@@ -1332,12 +1331,10 @@ static int aux_output_begin(struct perf_output_handle *handle,
struct aux_buffer *aux,
struct cpu_hw_sf *cpuhw)
{
- unsigned long range;
- unsigned long i, range_scan, idx;
- unsigned long head, base, offset;
+ unsigned long range, i, range_scan, idx, head, base, offset;
struct hws_trailer_entry *te;
- if (WARN_ON_ONCE(handle->head & ~PAGE_MASK))
+ if (handle->head & ~PAGE_MASK)
return -EINVAL;
aux->head = handle->head >> PAGE_SHIFT;
@@ -1349,14 +1346,14 @@ static int aux_output_begin(struct perf_output_handle *handle,
* SDBs between aux->head and aux->empty_mark are already ready
* for new data. range_scan is num of SDBs not within them.
*/
- if (range > AUX_SDB_NUM_EMPTY(aux)) {
- range_scan = range - AUX_SDB_NUM_EMPTY(aux);
+ if (range > aux_sdb_num_empty(aux)) {
+ range_scan = range - aux_sdb_num_empty(aux);
idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK;
- te->overflow = 0;
+ te->header.f = 0;
+ te->header.a = 0;
+ te->header.overflow = 0;
}
/* Save the position of empty SDBs */
aux->empty_mark = aux->head + range - 1;
@@ -1365,24 +1362,14 @@ static int aux_output_begin(struct perf_output_handle *handle,
/* Set alert indicator */
aux->alert_mark = aux->head + range/2 - 1;
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags = te->flags | SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 1;
/* Reset hardware buffer head */
- head = AUX_SDB_INDEX(aux, aux->head);
+ head = aux_sdb_index(aux, aux->head);
base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE];
offset = head % CPUM_SF_SDB_PER_TABLE;
- cpuhw->lsctl.tear = base + offset * sizeof(unsigned long);
- cpuhw->lsctl.dear = aux->sdb_index[head];
-
- debug_sprintf_event(sfdbg, 6, "aux_output_begin: "
- "head->alert_mark->empty_mark (num_alert, range)"
- "[%lx -> %lx -> %lx] (%lx, %lx) "
- "tear index %lx, tear %lx dear %lx\n",
- aux->head, aux->alert_mark, aux->empty_mark,
- AUX_SDB_NUM_ALERT(aux), range,
- head / CPUM_SF_SDB_PER_TABLE,
- cpuhw->lsctl.tear,
- cpuhw->lsctl.dear);
+ cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long);
+ cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]);
return 0;
}
@@ -1396,15 +1383,15 @@ static int aux_output_begin(struct perf_output_handle *handle,
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
+ union hws_trailer_header prev, new;
struct hws_trailer_entry *te;
te = aux_sdb_trailer(aux, alert_index);
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- *overflow = orig_overflow;
- if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+ new.val = prev.val;
+ *overflow = prev.overflow;
+ if (prev.f) {
/*
* SDB is already set by hardware.
* Abort and try to set somewhere
@@ -1412,10 +1399,9 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
*/
return false;
}
- new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 1;
+ new.overflow = 0;
+ } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
return true;
}
@@ -1444,11 +1430,12 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
+ union hws_trailer_header prev, new;
unsigned long i, range_scan, idx;
+ unsigned long long orig_overflow;
struct hws_trailer_entry *te;
- if (range <= AUX_SDB_NUM_EMPTY(aux))
+ if (range <= aux_sdb_num_empty(aux))
/*
* No need to scan. All SDBs in range are marked as empty.
* Just set alert indicator. Should check race with hardware
@@ -1469,21 +1456,21 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
* Start scanning from one SDB behind empty_mark. If the new alert
* indicator fall into this range, set it.
*/
- range_scan = range - AUX_SDB_NUM_EMPTY(aux);
+ range_scan = range - aux_sdb_num_empty(aux);
idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+ new.val = prev.val;
+ orig_overflow = prev.overflow;
+ new.f = 0;
+ new.overflow = 0;
if (idx == aux->alert_mark)
- new_flags |= SDB_TE_ALERT_REQ_MASK;
+ new.a = 1;
else
- new_flags &= ~SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 0;
+ } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
*overflow += orig_overflow;
}
@@ -1506,14 +1493,16 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
unsigned long num_sdb;
aux = perf_get_aux(handle);
- if (WARN_ON_ONCE(!aux))
+ if (!aux)
return;
/* Inform user space new data arrived */
- size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
+ size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
+ debug_sprintf_event(sfdbg, 6, "%s #alert %ld\n", __func__,
+ size >> PAGE_SHIFT);
perf_aux_output_end(handle, size);
- num_sdb = aux->sfb.num_sdb;
+ num_sdb = aux->sfb.num_sdb;
while (!done) {
/* Get an output handle */
aux = perf_aux_output_begin(handle, cpuhw->event);
@@ -1521,10 +1510,9 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
pr_err("The AUX buffer with %lu pages for the "
"diagnostic-sampling mode is full\n",
num_sdb);
- debug_sprintf_event(sfdbg, 1, "AUX buffer used up\n");
break;
}
- if (WARN_ON_ONCE(!aux))
+ if (!aux)
return;
/* Update head and alert_mark to new position */
@@ -1543,24 +1531,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
size = range << PAGE_SHIFT;
perf_aux_output_end(&cpuhw->handle, size);
pr_err("Sample data caused the AUX buffer with %lu "
- "pages to overflow\n", num_sdb);
- debug_sprintf_event(sfdbg, 1, "head %lx range %lx "
- "overflow %llx\n",
- aux->head, range, overflow);
+ "pages to overflow\n", aux->sfb.num_sdb);
} else {
- size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
+ size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
perf_aux_output_end(&cpuhw->handle, size);
- debug_sprintf_event(sfdbg, 6, "head %lx alert %lx "
- "already full, try another\n",
- aux->head, aux->alert_mark);
}
}
-
- if (done)
- debug_sprintf_event(sfdbg, 6, "aux_reset_buffer: "
- "[%lx -> %lx -> %lx] (%lx, %lx)\n",
- aux->head, aux->alert_mark, aux->empty_mark,
- AUX_SDB_NUM_ALERT(aux), range);
}
/*
@@ -1582,25 +1558,22 @@ static void aux_buffer_free(void *data)
kfree(aux->sdbt_index);
kfree(aux->sdb_index);
kfree(aux);
-
- debug_sprintf_event(sfdbg, 4, "aux_buffer_free: free "
- "%lu SDBTs\n", num_sdbt);
}
static void aux_sdb_init(unsigned long sdb)
{
struct hws_trailer_entry *te;
- te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ te = trailer_entry_ptr(sdb);
/* Save clock base */
te->clock_base = 1;
- memcpy(&te->progusage2, &tod_clock_base[1], 8);
+ te->progusage2 = tod_clock_base.tod;
}
/*
* aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling
- * @cpu: On which to allocate, -1 means current
+ * @event: Event the buffer is setup for, event->cpu == -1 means current
* @pages: Array of pointers to buffer pages passed from perf core
* @nr_pages: Total pages
* @snapshot: Flag for snapshot mode
@@ -1612,8 +1585,8 @@ static void aux_sdb_init(unsigned long sdb)
*
* Return the private AUX buffer structure if success or NULL if fails.
*/
-static void *aux_buffer_setup(int cpu, void **pages, int nr_pages,
- bool snapshot)
+static void *aux_buffer_setup(struct perf_event *event, void **pages,
+ int nr_pages, bool snapshot)
{
struct sf_buffer *sfb;
struct aux_buffer *aux;
@@ -1636,13 +1609,13 @@ static void *aux_buffer_setup(int cpu, void **pages, int nr_pages,
}
/* Allocate aux_buffer struct for the event */
- aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL);
+ aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL);
if (!aux)
goto no_aux;
sfb = &aux->sfb;
/* Allocate sdbt_index for fast reference */
- n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE;
+ n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE);
aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL);
if (!aux->sdbt_index)
goto no_sdbt_index;
@@ -1654,7 +1627,7 @@ static void *aux_buffer_setup(int cpu, void **pages, int nr_pages,
/* Allocate the first SDBT */
sfb->num_sdbt = 0;
- sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!sfb->sdbt)
goto no_sdbt;
aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt;
@@ -1666,23 +1639,23 @@ static void *aux_buffer_setup(int cpu, void **pages, int nr_pages,
*/
for (i = 0; i < nr_pages; i++, tail++) {
if (require_table_link(tail)) {
- new = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ new = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!new)
goto no_sdbt;
aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new;
/* Link current page to tail of chain */
- *tail = (unsigned long)(void *) new + 1;
+ *tail = virt_to_phys(new) + 1;
tail = new;
}
/* Tail is the entry in a SDBT */
- *tail = (unsigned long)pages[i];
+ *tail = virt_to_phys(pages[i]);
aux->sdb_index[i] = (unsigned long)pages[i];
aux_sdb_init((unsigned long)pages[i]);
}
sfb->num_sdb = nr_pages;
/* Link the last entry in the SDBT to the first SDBT */
- *tail = (unsigned long) sfb->sdbt + 1;
+ *tail = virt_to_phys(sfb->sdbt) + 1;
sfb->tail = tail;
/*
@@ -1692,10 +1665,6 @@ static void *aux_buffer_setup(int cpu, void **pages, int nr_pages,
*/
aux->empty_mark = sfb->num_sdb - 1;
- debug_sprintf_event(sfdbg, 4, "aux_buffer_setup: setup %lu SDBTs"
- " and %lu SDBs\n",
- sfb->num_sdbt, sfb->num_sdb);
-
return aux;
no_sdbt:
@@ -1716,6 +1685,39 @@ static void cpumsf_pmu_read(struct perf_event *event)
/* Nothing to do ... updates are interrupt-driven */
}
+/* Check if the new sampling period/frequency is appropriate.
+ *
+ * Return non-zero on error and zero on passed checks.
+ */
+static int cpumsf_pmu_check_period(struct perf_event *event, u64 value)
+{
+ struct hws_qsi_info_block si;
+ unsigned long rate;
+ bool do_freq;
+
+ memset(&si, 0, sizeof(si));
+ if (event->cpu == -1) {
+ qsi(&si);
+ } else {
+ /* Event is pinned to a particular CPU, retrieve the per-CPU
+ * sampling structure for accessing the CPU-specific QSI.
+ */
+ struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu);
+
+ si = cpuhw->qsi;
+ }
+
+ do_freq = !!SAMPL_FREQ_MODE(&event->hw);
+ rate = getrate(do_freq, value, &si);
+ if (!rate)
+ return -EINVAL;
+
+ event->attr.sample_period = rate;
+ SAMPL_RATE(&event->hw) = rate;
+ hw_init_period(&event->hw, SAMPL_RATE(&event->hw));
+ return 0;
+}
+
/* Activate sampling control.
* Next call of pmu_enable() starts sampling.
*/
@@ -1723,12 +1725,8 @@ static void cpumsf_pmu_start(struct perf_event *event, int flags)
{
struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
- if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ if (!(event->hw.state & PERF_HES_STOPPED))
return;
-
- if (flags & PERF_EF_RELOAD)
- WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
-
perf_pmu_disable(event->pmu);
event->hw.state = 0;
cpuhw->lsctl.cs = 1;
@@ -1753,7 +1751,9 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags)
event->hw.state |= PERF_HES_STOPPED;
if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
- hw_perf_event_update(event, 1);
+ /* CPU hotplug off removes SDBs. No samples to extract. */
+ if (cpuhw->flags & PMU_F_RESERVED)
+ hw_perf_event_update(event, 1);
event->hw.state |= PERF_HES_UPTODATE;
}
perf_pmu_enable(event->pmu);
@@ -1763,15 +1763,14 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
struct aux_buffer *aux;
- int err;
+ int err = 0;
if (cpuhw->flags & PMU_F_IN_USE)
return -EAGAIN;
- if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt)
+ if (!SAMPL_DIAG_MODE(&event->hw) && !sf_buffer_available(cpuhw))
return -EINVAL;
- err = 0;
perf_pmu_disable(event->pmu);
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
@@ -1785,9 +1784,9 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags)
cpuhw->lsctl.h = 1;
cpuhw->lsctl.interval = SAMPL_RATE(&event->hw);
if (!SAMPL_DIAG_MODE(&event->hw)) {
- cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt;
- cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt;
- hw_reset_registers(&event->hw, cpuhw->sfb.sdbt);
+ cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt);
+ cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt;
+ TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt;
}
/* Ensure sampling functions are in the disabled state. If disabled,
@@ -1879,10 +1878,12 @@ static struct attribute_group cpumsf_pmu_events_group = {
.name = "events",
.attrs = cpumsf_pmu_events_attr,
};
+
static struct attribute_group cpumsf_pmu_format_group = {
.name = "format",
.attrs = cpumsf_pmu_format_attr,
};
+
static const struct attribute_group *cpumsf_pmu_attr_groups[] = {
&cpumsf_pmu_events_group,
&cpumsf_pmu_format_group,
@@ -1905,6 +1906,8 @@ static struct pmu cpumf_sampling = {
.setup_aux = aux_buffer_setup,
.free_aux = aux_buffer_free,
+
+ .check_period = cpumsf_pmu_check_period,
};
static void cpumf_measurement_alert(struct ext_code ext_code,
@@ -1927,18 +1930,18 @@ static void cpumf_measurement_alert(struct ext_code ext_code,
/* Program alert request */
if (alert & CPU_MF_INT_SF_PRA) {
- if (cpuhw->flags & PMU_F_IN_USE)
+ if (cpuhw->flags & PMU_F_IN_USE) {
if (SAMPL_DIAG_MODE(&cpuhw->event->hw))
hw_collect_aux(cpuhw);
else
hw_perf_event_update(cpuhw->event, 0);
- else
- WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE));
+ }
}
/* Report measurement alerts only for non-PRA codes */
if (alert != CPU_MF_INT_SF_PRA)
- debug_sprintf_event(sfdbg, 6, "measurement alert: 0x%x\n", alert);
+ debug_sprintf_event(sfdbg, 6, "%s alert %#x\n", __func__,
+ alert);
/* Sampling authorization change request */
if (alert & CPU_MF_INT_SF_SACA)
@@ -1953,18 +1956,19 @@ static void cpumf_measurement_alert(struct ext_code ext_code,
/* Invalid sampling buffer entry */
if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) {
- pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n",
+ pr_err("A sampling buffer entry is incorrect (alert=%#x)\n",
alert);
cpuhw->flags |= PMU_F_ERR_IBE;
sf_disable();
}
}
+
static int cpusf_pmu_setup(unsigned int cpu, int flags)
{
/* Ignore the notification if no events are scheduled on the PMU.
* This might be racy...
*/
- if (!atomic_read(&num_events))
+ if (!refcount_read(&num_events))
return 0;
local_irq_disable();
@@ -2015,7 +2019,7 @@ static int param_set_sfb_size(const char *val, const struct kernel_param *kp)
sfb_set_limits(min, max);
pr_info("The sampling buffer limits have changed to: "
- "min=%lu max=%lu (diag=x%lu)\n",
+ "min %lu max %lu (diag %lu)\n",
CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB, CPUM_SF_SDB_DIAG_FACTOR);
return 0;
}
@@ -2026,14 +2030,16 @@ static const struct kernel_param_ops param_ops_sfb_size = {
.get = param_get_sfb_size,
};
-#define RS_INIT_FAILURE_QSI 0x0001
-#define RS_INIT_FAILURE_BSDES 0x0002
-#define RS_INIT_FAILURE_ALRT 0x0003
-#define RS_INIT_FAILURE_PERF 0x0004
+enum {
+ RS_INIT_FAILURE_BSDES = 2, /* Bad basic sampling size */
+ RS_INIT_FAILURE_ALRT = 3, /* IRQ registration failure */
+ RS_INIT_FAILURE_PERF = 4 /* PMU registration failure */
+};
+
static void __init pr_cpumsf_err(unsigned int reason)
{
pr_err("Sampling facility support for perf is not available: "
- "reason=%04x\n", reason);
+ "reason %#x\n", reason);
}
static int __init init_cpum_sampling_pmu(void)
@@ -2045,11 +2051,7 @@ static int __init init_cpum_sampling_pmu(void)
return -ENODEV;
memset(&si, 0, sizeof(si));
- if (qsi(&si)) {
- pr_cpumsf_err(RS_INIT_FAILURE_QSI);
- return -ENODEV;
- }
-
+ qsi(&si);
if (!si.as && !si.ad)
return -ENODEV;
@@ -2067,7 +2069,7 @@ static int __init init_cpum_sampling_pmu(void)
CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG);
}
- sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80);
+ sfdbg = debug_register("cpum_sf", 2, 1, 80);
if (!sfdbg) {
pr_err("Registering for s390dbf failed\n");
return -ENOMEM;
@@ -2096,5 +2098,6 @@ static int __init init_cpum_sampling_pmu(void)
out:
return err;
}
+
arch_initcall(init_cpum_sampling_pmu);
-core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0640);
+core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644);