diff options
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r-- | tools/perf/util/arm-spe.c | 167 |
1 files changed, 160 insertions, 7 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 12761c39788f..2a9775649cc2 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -37,6 +37,8 @@ #include "../../arch/arm64/include/asm/cputype.h" #define MAX_TIMESTAMP (~0ULL) +#define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST)) + struct arm_spe { struct auxtrace auxtrace; struct auxtrace_queues queues; @@ -101,6 +103,7 @@ struct arm_spe_queue { struct thread *thread; u64 period_instructions; u32 flags; + struct branch_stack *last_branch; }; struct data_source_handle { @@ -231,6 +234,17 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, params.get_trace = arm_spe_get_trace; params.data = speq; + if (spe->synth_opts.last_branch) { + size_t sz = sizeof(struct branch_stack); + + /* Allocate up to two entries for PBT + TGT */ + sz += sizeof(struct branch_entry) * + min(spe->synth_opts.last_branch_sz, 2U); + speq->last_branch = zalloc(sz); + if (!speq->last_branch) + goto out_free; + } + /* create new decoder */ speq->decoder = arm_spe_decoder_new(¶ms); if (!speq->decoder) @@ -240,6 +254,7 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, out_free: zfree(&speq->event_buf); + zfree(&speq->last_branch); free(speq); return NULL; @@ -346,6 +361,88 @@ static void arm_spe_prep_sample(struct arm_spe *spe, event->sample.header.size = sizeof(struct perf_event_header); } +static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq) +{ + struct arm_spe *spe = speq->spe; + struct arm_spe_record *record = &speq->decoder->record; + struct branch_stack *bstack = speq->last_branch; + struct branch_flags *bs_flags; + unsigned int last_branch_sz = spe->synth_opts.last_branch_sz; + bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH); + bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt; + size_t sz = sizeof(struct branch_stack) + + sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */; + int i = 0; + + /* Clean up branch stack */ + memset(bstack, 0x0, sz); + + if (!have_tgt && !have_pbt) + return; + + if (have_tgt) { + bstack->entries[i].from = record->from_ip; + bstack->entries[i].to = record->to_ip; + + bs_flags = &bstack->entries[i].flags; + bs_flags->value = 0; + + if (record->op & ARM_SPE_OP_BR_CR_BL) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND_CALL; + else + bs_flags->type |= PERF_BR_CALL; + /* + * Indirect branch instruction without link (e.g. BR), + * take this case as function return. + */ + } else if (record->op & ARM_SPE_OP_BR_CR_RET || + record->op & ARM_SPE_OP_BR_INDIRECT) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND_RET; + else + bs_flags->type |= PERF_BR_RET; + } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND; + else + bs_flags->type |= PERF_BR_UNCOND; + } else { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND; + else + bs_flags->type |= PERF_BR_UNKNOWN; + } + + if (record->type & ARM_SPE_BRANCH_MISS) { + bs_flags->mispred = 1; + bs_flags->predicted = 0; + } else { + bs_flags->mispred = 0; + bs_flags->predicted = 1; + } + + if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) + bs_flags->not_taken = 1; + + if (record->type & ARM_SPE_IN_TXN) + bs_flags->in_tx = 1; + + bs_flags->cycles = min(record->latency, 0xFFFFU); + i++; + } + + if (have_pbt) { + bs_flags = &bstack->entries[i].flags; + bs_flags->type |= PERF_BR_UNKNOWN; + bstack->entries[i].to = record->prev_br_tgt; + i++; + } + + bstack->nr = i; + bstack->hw_idx = -1ULL; +} + static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type) { event->header.size = perf_event__sample_event_size(sample, type, 0); @@ -379,8 +476,10 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; @@ -390,7 +489,9 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, sample.data_src = data_src; sample.weight = record->latency; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, @@ -399,8 +500,10 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; @@ -408,8 +511,11 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, sample.addr = record->to_ip; sample.weight = record->latency; sample.flags = speq->flags; + sample.branch_stack = speq->last_branch; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, @@ -418,7 +524,8 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; /* * Handles perf instruction sampling period. @@ -428,6 +535,7 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, return 0; speq->period_instructions = 0; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; @@ -438,8 +546,11 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, sample.period = spe->instructions_sample_period; sample.weight = record->latency; sample.flags = speq->flags; + sample.branch_stack = speq->last_branch; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static const struct midr_range common_ds_encoding_cpus[] = { @@ -470,6 +581,26 @@ static void arm_spe__sample_flags(struct arm_spe_queue *speq) if (record->type & ARM_SPE_BRANCH_MISS) speq->flags |= PERF_IP_FLAG_BRANCH_MISS; + + if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) + speq->flags |= PERF_IP_FLAG_NOT_TAKEN; + + if (record->type & ARM_SPE_IN_TXN) + speq->flags |= PERF_IP_FLAG_IN_TX; + + if (record->op & ARM_SPE_OP_BR_COND) + speq->flags |= PERF_IP_FLAG_CONDITIONAL; + + if (record->op & ARM_SPE_OP_BR_CR_BL) + speq->flags |= PERF_IP_FLAG_CALL; + else if (record->op & ARM_SPE_OP_BR_CR_RET) + speq->flags |= PERF_IP_FLAG_RETURN; + /* + * Indirect branch instruction without link (e.g. BR), + * take it as a function return. + */ + else if (record->op & ARM_SPE_OP_BR_INDIRECT) + speq->flags |= PERF_IP_FLAG_RETURN; } } @@ -669,6 +800,10 @@ static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq, { union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA }; + /* Only synthesize data source for LDST operations */ + if (!is_ldst_op(record->op)) + return 0; + if (record->op & ARM_SPE_OP_LD) data_src.mem_op = PERF_MEM_OP_LOAD; else if (record->op & ARM_SPE_OP_ST) @@ -749,6 +884,10 @@ static int arm_spe_sample(struct arm_spe_queue *speq) } } + if (spe->synth_opts.last_branch && + (spe->sample_branch || spe->sample_instructions)) + arm_spe__prep_branch_stack(speq); + if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) { err = arm_spe__synth_branch_sample(speq, spe->branch_id); if (err) @@ -767,7 +906,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq) * When data_src is zero it means the record is not a memory operation, * skip to synthesize memory sample for this case. */ - if (spe->sample_memory && data_src) { + if (spe->sample_memory && is_ldst_op(record->op)) { err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src); if (err) return err; @@ -1240,6 +1379,7 @@ static void arm_spe_free_queue(void *priv) thread__zput(speq->thread); arm_spe_decoder_free(speq->decoder); zfree(&speq->event_buf); + zfree(&speq->last_branch); free(speq); } @@ -1459,6 +1599,19 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; } + if (spe->synth_opts.last_branch) { + if (spe->synth_opts.last_branch_sz > 2) + pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n"); + + attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; + /* + * We don't use the hardware index, but the sample generation + * code uses the new format branch_stack with this field, + * so the event attributes must indicate that it's present. + */ + attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; + } + if (spe->synth_opts.branches) { spe->sample_branch = true; |