summaryrefslogtreecommitdiff
path: root/tools/perf/util/arm-spe.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r--tools/perf/util/arm-spe.c167
1 files changed, 160 insertions, 7 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 12761c39788f..2a9775649cc2 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -37,6 +37,8 @@
#include "../../arch/arm64/include/asm/cputype.h"
#define MAX_TIMESTAMP (~0ULL)
+#define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST))
+
struct arm_spe {
struct auxtrace auxtrace;
struct auxtrace_queues queues;
@@ -101,6 +103,7 @@ struct arm_spe_queue {
struct thread *thread;
u64 period_instructions;
u32 flags;
+ struct branch_stack *last_branch;
};
struct data_source_handle {
@@ -231,6 +234,17 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
params.get_trace = arm_spe_get_trace;
params.data = speq;
+ if (spe->synth_opts.last_branch) {
+ size_t sz = sizeof(struct branch_stack);
+
+ /* Allocate up to two entries for PBT + TGT */
+ sz += sizeof(struct branch_entry) *
+ min(spe->synth_opts.last_branch_sz, 2U);
+ speq->last_branch = zalloc(sz);
+ if (!speq->last_branch)
+ goto out_free;
+ }
+
/* create new decoder */
speq->decoder = arm_spe_decoder_new(&params);
if (!speq->decoder)
@@ -240,6 +254,7 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
out_free:
zfree(&speq->event_buf);
+ zfree(&speq->last_branch);
free(speq);
return NULL;
@@ -346,6 +361,88 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
event->sample.header.size = sizeof(struct perf_event_header);
}
+static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
+{
+ struct arm_spe *spe = speq->spe;
+ struct arm_spe_record *record = &speq->decoder->record;
+ struct branch_stack *bstack = speq->last_branch;
+ struct branch_flags *bs_flags;
+ unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
+ bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
+ bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
+ size_t sz = sizeof(struct branch_stack) +
+ sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
+ int i = 0;
+
+ /* Clean up branch stack */
+ memset(bstack, 0x0, sz);
+
+ if (!have_tgt && !have_pbt)
+ return;
+
+ if (have_tgt) {
+ bstack->entries[i].from = record->from_ip;
+ bstack->entries[i].to = record->to_ip;
+
+ bs_flags = &bstack->entries[i].flags;
+ bs_flags->value = 0;
+
+ if (record->op & ARM_SPE_OP_BR_CR_BL) {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND_CALL;
+ else
+ bs_flags->type |= PERF_BR_CALL;
+ /*
+ * Indirect branch instruction without link (e.g. BR),
+ * take this case as function return.
+ */
+ } else if (record->op & ARM_SPE_OP_BR_CR_RET ||
+ record->op & ARM_SPE_OP_BR_INDIRECT) {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND_RET;
+ else
+ bs_flags->type |= PERF_BR_RET;
+ } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND;
+ else
+ bs_flags->type |= PERF_BR_UNCOND;
+ } else {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND;
+ else
+ bs_flags->type |= PERF_BR_UNKNOWN;
+ }
+
+ if (record->type & ARM_SPE_BRANCH_MISS) {
+ bs_flags->mispred = 1;
+ bs_flags->predicted = 0;
+ } else {
+ bs_flags->mispred = 0;
+ bs_flags->predicted = 1;
+ }
+
+ if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
+ bs_flags->not_taken = 1;
+
+ if (record->type & ARM_SPE_IN_TXN)
+ bs_flags->in_tx = 1;
+
+ bs_flags->cycles = min(record->latency, 0xFFFFU);
+ i++;
+ }
+
+ if (have_pbt) {
+ bs_flags = &bstack->entries[i].flags;
+ bs_flags->type |= PERF_BR_UNKNOWN;
+ bstack->entries[i].to = record->prev_br_tgt;
+ i++;
+ }
+
+ bstack->nr = i;
+ bstack->hw_idx = -1ULL;
+}
+
static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
{
event->header.size = perf_event__sample_event_size(sample, type, 0);
@@ -379,8 +476,10 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
+ int ret;
+ perf_sample__init(&sample, /*all=*/true);
arm_spe_prep_sample(spe, speq, event, &sample);
sample.id = spe_events_id;
@@ -390,7 +489,9 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
sample.data_src = data_src;
sample.weight = record->latency;
- return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ perf_sample__exit(&sample);
+ return ret;
}
static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
@@ -399,8 +500,10 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
+ int ret;
+ perf_sample__init(&sample, /*all=*/true);
arm_spe_prep_sample(spe, speq, event, &sample);
sample.id = spe_events_id;
@@ -408,8 +511,11 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
sample.addr = record->to_ip;
sample.weight = record->latency;
sample.flags = speq->flags;
+ sample.branch_stack = speq->last_branch;
- return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ perf_sample__exit(&sample);
+ return ret;
}
static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
@@ -418,7 +524,8 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
+ int ret;
/*
* Handles perf instruction sampling period.
@@ -428,6 +535,7 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
return 0;
speq->period_instructions = 0;
+ perf_sample__init(&sample, /*all=*/true);
arm_spe_prep_sample(spe, speq, event, &sample);
sample.id = spe_events_id;
@@ -438,8 +546,11 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
sample.period = spe->instructions_sample_period;
sample.weight = record->latency;
sample.flags = speq->flags;
+ sample.branch_stack = speq->last_branch;
- return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ perf_sample__exit(&sample);
+ return ret;
}
static const struct midr_range common_ds_encoding_cpus[] = {
@@ -470,6 +581,26 @@ static void arm_spe__sample_flags(struct arm_spe_queue *speq)
if (record->type & ARM_SPE_BRANCH_MISS)
speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
+
+ if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
+ speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
+
+ if (record->type & ARM_SPE_IN_TXN)
+ speq->flags |= PERF_IP_FLAG_IN_TX;
+
+ if (record->op & ARM_SPE_OP_BR_COND)
+ speq->flags |= PERF_IP_FLAG_CONDITIONAL;
+
+ if (record->op & ARM_SPE_OP_BR_CR_BL)
+ speq->flags |= PERF_IP_FLAG_CALL;
+ else if (record->op & ARM_SPE_OP_BR_CR_RET)
+ speq->flags |= PERF_IP_FLAG_RETURN;
+ /*
+ * Indirect branch instruction without link (e.g. BR),
+ * take it as a function return.
+ */
+ else if (record->op & ARM_SPE_OP_BR_INDIRECT)
+ speq->flags |= PERF_IP_FLAG_RETURN;
}
}
@@ -669,6 +800,10 @@ static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
{
union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA };
+ /* Only synthesize data source for LDST operations */
+ if (!is_ldst_op(record->op))
+ return 0;
+
if (record->op & ARM_SPE_OP_LD)
data_src.mem_op = PERF_MEM_OP_LOAD;
else if (record->op & ARM_SPE_OP_ST)
@@ -749,6 +884,10 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
}
}
+ if (spe->synth_opts.last_branch &&
+ (spe->sample_branch || spe->sample_instructions))
+ arm_spe__prep_branch_stack(speq);
+
if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
err = arm_spe__synth_branch_sample(speq, spe->branch_id);
if (err)
@@ -767,7 +906,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
* When data_src is zero it means the record is not a memory operation,
* skip to synthesize memory sample for this case.
*/
- if (spe->sample_memory && data_src) {
+ if (spe->sample_memory && is_ldst_op(record->op)) {
err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
if (err)
return err;
@@ -1240,6 +1379,7 @@ static void arm_spe_free_queue(void *priv)
thread__zput(speq->thread);
arm_spe_decoder_free(speq->decoder);
zfree(&speq->event_buf);
+ zfree(&speq->last_branch);
free(speq);
}
@@ -1459,6 +1599,19 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
id += 1;
}
+ if (spe->synth_opts.last_branch) {
+ if (spe->synth_opts.last_branch_sz > 2)
+ pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
+
+ attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
+ /*
+ * We don't use the hardware index, but the sample generation
+ * code uses the new format branch_stack with this field,
+ * so the event attributes must indicate that it's present.
+ */
+ attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
+ }
+
if (spe->synth_opts.branches) {
spe->sample_branch = true;