diff options
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r-- | tools/perf/util/arm-spe.c | 599 |
1 files changed, 512 insertions, 87 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index afbd5869f6bf..2a9775649cc2 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -37,6 +37,8 @@ #include "../../arch/arm64/include/asm/cputype.h" #define MAX_TIMESTAMP (~0ULL) +#define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST)) + struct arm_spe { struct auxtrace auxtrace; struct auxtrace_queues queues; @@ -46,7 +48,6 @@ struct arm_spe { struct perf_session *session; struct machine *machine; u32 pmu_type; - u64 midr; struct perf_tsc_conversion tc; @@ -69,7 +70,7 @@ struct arm_spe { u64 llc_access_id; u64 tlb_miss_id; u64 tlb_access_id; - u64 branch_miss_id; + u64 branch_id; u64 remote_access_id; u64 memory_id; u64 instructions_id; @@ -78,6 +79,11 @@ struct arm_spe { unsigned long num_events; u8 use_ctx_pkt_for_pid; + + u64 **metadata; + u64 metadata_ver; + u64 metadata_nr_cpu; + bool is_homogeneous; }; struct arm_spe_queue { @@ -96,8 +102,22 @@ struct arm_spe_queue { u64 timestamp; struct thread *thread; u64 period_instructions; + u32 flags; + struct branch_stack *last_branch; +}; + +struct data_source_handle { + const struct midr_range *midr_ranges; + void (*ds_synth)(const struct arm_spe_record *record, + union perf_mem_data_src *data_src); }; +#define DS(range, func) \ + { \ + .midr_ranges = range, \ + .ds_synth = arm_spe__synth_##func, \ + } + static void arm_spe_dump(struct arm_spe *spe __maybe_unused, unsigned char *buf, size_t len) { @@ -118,7 +138,7 @@ static void arm_spe_dump(struct arm_spe *spe __maybe_unused, else pkt_len = 1; printf("."); - color_fprintf(stdout, color, " %08x: ", pos); + color_fprintf(stdout, color, " %08zx: ", pos); for (i = 0; i < pkt_len; i++) color_fprintf(stdout, color, " %02x", buf[i]); for (; i < 16; i++) @@ -214,6 +234,17 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, params.get_trace = arm_spe_get_trace; params.data = speq; + if (spe->synth_opts.last_branch) { + size_t sz = sizeof(struct branch_stack); + + /* Allocate up to two entries for PBT + TGT */ + sz += sizeof(struct branch_entry) * + min(spe->synth_opts.last_branch_sz, 2U); + speq->last_branch = zalloc(sz); + if (!speq->last_branch) + goto out_free; + } + /* create new decoder */ speq->decoder = arm_spe_decoder_new(¶ms); if (!speq->decoder) @@ -223,6 +254,7 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, out_free: zfree(&speq->event_buf); + zfree(&speq->last_branch); free(speq); return NULL; @@ -273,6 +305,20 @@ static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid) return 0; } +static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu) +{ + u64 i; + + if (!spe->metadata) + return NULL; + + for (i = 0; i < spe->metadata_nr_cpu; i++) + if (spe->metadata[i][ARM_SPE_CPU] == cpu) + return spe->metadata[i]; + + return NULL; +} + static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record) { struct simd_flags simd_flags = {}; @@ -315,6 +361,88 @@ static void arm_spe_prep_sample(struct arm_spe *spe, event->sample.header.size = sizeof(struct perf_event_header); } +static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq) +{ + struct arm_spe *spe = speq->spe; + struct arm_spe_record *record = &speq->decoder->record; + struct branch_stack *bstack = speq->last_branch; + struct branch_flags *bs_flags; + unsigned int last_branch_sz = spe->synth_opts.last_branch_sz; + bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH); + bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt; + size_t sz = sizeof(struct branch_stack) + + sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */; + int i = 0; + + /* Clean up branch stack */ + memset(bstack, 0x0, sz); + + if (!have_tgt && !have_pbt) + return; + + if (have_tgt) { + bstack->entries[i].from = record->from_ip; + bstack->entries[i].to = record->to_ip; + + bs_flags = &bstack->entries[i].flags; + bs_flags->value = 0; + + if (record->op & ARM_SPE_OP_BR_CR_BL) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND_CALL; + else + bs_flags->type |= PERF_BR_CALL; + /* + * Indirect branch instruction without link (e.g. BR), + * take this case as function return. + */ + } else if (record->op & ARM_SPE_OP_BR_CR_RET || + record->op & ARM_SPE_OP_BR_INDIRECT) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND_RET; + else + bs_flags->type |= PERF_BR_RET; + } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND; + else + bs_flags->type |= PERF_BR_UNCOND; + } else { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND; + else + bs_flags->type |= PERF_BR_UNKNOWN; + } + + if (record->type & ARM_SPE_BRANCH_MISS) { + bs_flags->mispred = 1; + bs_flags->predicted = 0; + } else { + bs_flags->mispred = 0; + bs_flags->predicted = 1; + } + + if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) + bs_flags->not_taken = 1; + + if (record->type & ARM_SPE_IN_TXN) + bs_flags->in_tx = 1; + + bs_flags->cycles = min(record->latency, 0xFFFFU); + i++; + } + + if (have_pbt) { + bs_flags = &bstack->entries[i].flags; + bs_flags->type |= PERF_BR_UNKNOWN; + bstack->entries[i].to = record->prev_br_tgt; + i++; + } + + bstack->nr = i; + bstack->hw_idx = -1ULL; +} + static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type) { event->header.size = perf_event__sample_event_size(sample, type, 0); @@ -348,8 +476,10 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; @@ -359,7 +489,9 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, sample.data_src = data_src; sample.weight = record->latency; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, @@ -368,16 +500,22 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; sample.stream_id = spe_events_id; sample.addr = record->to_ip; sample.weight = record->latency; + sample.flags = speq->flags; + sample.branch_stack = speq->last_branch; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, @@ -386,7 +524,8 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; /* * Handles perf instruction sampling period. @@ -396,28 +535,77 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, return 0; speq->period_instructions = 0; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; sample.stream_id = spe_events_id; - sample.addr = record->virt_addr; + sample.addr = record->to_ip; sample.phys_addr = record->phys_addr; sample.data_src = data_src; sample.period = spe->instructions_sample_period; sample.weight = record->latency; + sample.flags = speq->flags; + sample.branch_stack = speq->last_branch; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } -static const struct midr_range neoverse_spe[] = { +static const struct midr_range common_ds_encoding_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X925), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), {}, }; -static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, - union perf_mem_data_src *data_src) +static const struct midr_range ampereone_ds_encoding_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; + +static void arm_spe__sample_flags(struct arm_spe_queue *speq) +{ + const struct arm_spe_record *record = &speq->decoder->record; + + speq->flags = 0; + if (record->op & ARM_SPE_OP_BRANCH_ERET) { + speq->flags = PERF_IP_FLAG_BRANCH; + + if (record->type & ARM_SPE_BRANCH_MISS) + speq->flags |= PERF_IP_FLAG_BRANCH_MISS; + + if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) + speq->flags |= PERF_IP_FLAG_NOT_TAKEN; + + if (record->type & ARM_SPE_IN_TXN) + speq->flags |= PERF_IP_FLAG_IN_TX; + + if (record->op & ARM_SPE_OP_BR_COND) + speq->flags |= PERF_IP_FLAG_CONDITIONAL; + + if (record->op & ARM_SPE_OP_BR_CR_BL) + speq->flags |= PERF_IP_FLAG_CALL; + else if (record->op & ARM_SPE_OP_BR_CR_RET) + speq->flags |= PERF_IP_FLAG_RETURN; + /* + * Indirect branch instruction without link (e.g. BR), + * take it as a function return. + */ + else if (record->op & ARM_SPE_OP_BR_INDIRECT) + speq->flags |= PERF_IP_FLAG_RETURN; + } +} + +static void arm_spe__synth_data_source_common(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) { /* * Even though four levels of cache hierarchy are possible, no known @@ -439,17 +627,17 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec } switch (record->source) { - case ARM_SPE_NV_L1D: + case ARM_SPE_COMMON_DS_L1D: data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; data_src->mem_snoop = PERF_MEM_SNOOP_NONE; break; - case ARM_SPE_NV_L2: + case ARM_SPE_COMMON_DS_L2: data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; data_src->mem_snoop = PERF_MEM_SNOOP_NONE; break; - case ARM_SPE_NV_PEER_CORE: + case ARM_SPE_COMMON_DS_PEER_CORE: data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; @@ -458,8 +646,8 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec * We don't know if this is L1, L2 but we do know it was a cache-2-cache * transfer, so set SNOOPX_PEER */ - case ARM_SPE_NV_LOCAL_CLUSTER: - case ARM_SPE_NV_PEER_CLUSTER: + case ARM_SPE_COMMON_DS_LOCAL_CLUSTER: + case ARM_SPE_COMMON_DS_PEER_CLUSTER: data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; @@ -467,7 +655,7 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec /* * System cache is assumed to be L3 */ - case ARM_SPE_NV_SYS_CACHE: + case ARM_SPE_COMMON_DS_SYS_CACHE: data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; data_src->mem_snoop = PERF_MEM_SNOOP_HIT; @@ -476,13 +664,13 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec * We don't know what level it hit in, except it came from the other * socket */ - case ARM_SPE_NV_REMOTE: + case ARM_SPE_COMMON_DS_REMOTE: data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1; data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; break; - case ARM_SPE_NV_DRAM: + case ARM_SPE_COMMON_DS_DRAM: data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; data_src->mem_snoop = PERF_MEM_SNOOP_NONE; @@ -492,8 +680,51 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec } } -static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record, - union perf_mem_data_src *data_src) +/* + * Source is IMPDEF. Here we convert the source code used on AmpereOne cores + * to the common (Neoverse, Cortex) to avoid duplicating the decoding code. + */ +static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ + struct arm_spe_record common_record; + + switch (record->source) { + case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE: + common_record.source = ARM_SPE_COMMON_DS_PEER_CORE; + break; + case ARM_SPE_AMPEREONE_SLC: + common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE; + break; + case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE: + common_record.source = ARM_SPE_COMMON_DS_REMOTE; + break; + case ARM_SPE_AMPEREONE_DDR: + common_record.source = ARM_SPE_COMMON_DS_DRAM; + break; + case ARM_SPE_AMPEREONE_L1D: + common_record.source = ARM_SPE_COMMON_DS_L1D; + break; + case ARM_SPE_AMPEREONE_L2D: + common_record.source = ARM_SPE_COMMON_DS_L2; + break; + default: + pr_warning_once("AmpereOne: Unknown data source (0x%x)\n", + record->source); + return; + } + + common_record.op = record->op; + arm_spe__synth_data_source_common(&common_record, data_src); +} + +static const struct data_source_handle data_source_handles[] = { + DS(common_ds_encoding_cpus, data_source_common), + DS(ampereone_ds_encoding_cpus, data_source_ampereone), +}; + +static void arm_spe__synth_memory_level(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) { if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { data_src->mem_lvl = PERF_MEM_LVL_L3; @@ -515,10 +746,63 @@ static void arm_spe__synth_data_source_generic(const struct arm_spe_record *reco data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; } -static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr) +static bool arm_spe__synth_ds(struct arm_spe_queue *speq, + const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ + struct arm_spe *spe = speq->spe; + u64 *metadata = NULL; + u64 midr; + unsigned int i; + + /* Metadata version 1 assumes all CPUs are the same (old behavior) */ + if (spe->metadata_ver == 1) { + const char *cpuid; + + pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n"); + cpuid = perf_env__cpuid(spe->session->evlist->env); + midr = strtol(cpuid, NULL, 16); + } else { + /* CPU ID is -1 for per-thread mode */ + if (speq->cpu < 0) { + /* + * On the heterogeneous system, due to CPU ID is -1, + * cannot confirm the data source packet is supported. + */ + if (!spe->is_homogeneous) + return false; + + /* In homogeneous system, simply use CPU0's metadata */ + if (spe->metadata) + metadata = spe->metadata[0]; + } else { + metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu); + } + + if (!metadata) + return false; + + midr = metadata[ARM_SPE_CPU_MIDR]; + } + + for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) { + if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) { + data_source_handles[i].ds_synth(record, data_src); + return true; + } + } + + return false; +} + +static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq, + const struct arm_spe_record *record) { union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA }; - bool is_neoverse = is_midr_in_range_list(midr, neoverse_spe); + + /* Only synthesize data source for LDST operations */ + if (!is_ldst_op(record->op)) + return 0; if (record->op & ARM_SPE_OP_LD) data_src.mem_op = PERF_MEM_OP_LOAD; @@ -527,10 +811,8 @@ static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 m else return 0; - if (is_neoverse) - arm_spe__synth_data_source_neoverse(record, &data_src); - else - arm_spe__synth_data_source_generic(record, &data_src); + if (!arm_spe__synth_ds(speq, record, &data_src)) + arm_spe__synth_memory_level(record, &data_src); if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { data_src.mem_dtlb = PERF_MEM_TLB_WK; @@ -551,7 +833,8 @@ static int arm_spe_sample(struct arm_spe_queue *speq) u64 data_src; int err; - data_src = arm_spe__synth_data_source(record, spe->midr); + arm_spe__sample_flags(speq); + data_src = arm_spe__synth_data_source(speq, record); if (spe->sample_flc) { if (record->type & ARM_SPE_L1D_MISS) { @@ -601,8 +884,12 @@ static int arm_spe_sample(struct arm_spe_queue *speq) } } - if (spe->sample_branch && (record->type & ARM_SPE_BRANCH_MISS)) { - err = arm_spe__synth_branch_sample(speq, spe->branch_miss_id); + if (spe->synth_opts.last_branch && + (spe->sample_branch || spe->sample_instructions)) + arm_spe__prep_branch_stack(speq); + + if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) { + err = arm_spe__synth_branch_sample(speq, spe->branch_id); if (err) return err; } @@ -619,7 +906,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq) * When data_src is zero it means the record is not a memory operation, * skip to synthesize memory sample for this case. */ - if (spe->sample_memory && data_src) { + if (spe->sample_memory && is_ldst_op(record->op)) { err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src); if (err) return err; @@ -899,7 +1186,7 @@ static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event, static int arm_spe_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { int err = 0; u64 timestamp; @@ -947,7 +1234,7 @@ static int arm_spe_process_event(struct perf_session *session, static int arm_spe_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); @@ -985,7 +1272,7 @@ static int arm_spe_process_auxtrace_event(struct perf_session *session, } static int arm_spe_flush(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); @@ -1016,6 +1303,73 @@ static int arm_spe_flush(struct perf_session *session __maybe_unused, return 0; } +static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size) +{ + u64 *metadata; + + metadata = zalloc(per_cpu_size); + if (!metadata) + return NULL; + + memcpy(metadata, buf, per_cpu_size); + return metadata; +} + +static void arm_spe__free_metadata(u64 **metadata, int nr_cpu) +{ + int i; + + for (i = 0; i < nr_cpu; i++) + zfree(&metadata[i]); + free(metadata); +} + +static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info, + u64 *ver, int *nr_cpu) +{ + u64 *ptr = (u64 *)info->priv; + u64 metadata_size; + u64 **metadata = NULL; + int hdr_sz, per_cpu_sz, i; + + metadata_size = info->header.size - + sizeof(struct perf_record_auxtrace_info); + + /* Metadata version 1 */ + if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) { + *ver = 1; + *nr_cpu = 0; + /* No per CPU metadata */ + return NULL; + } + + *ver = ptr[ARM_SPE_HEADER_VERSION]; + hdr_sz = ptr[ARM_SPE_HEADER_SIZE]; + *nr_cpu = ptr[ARM_SPE_CPUS_NUM]; + + metadata = calloc(*nr_cpu, sizeof(*metadata)); + if (!metadata) + return NULL; + + /* Locate the start address of per CPU metadata */ + ptr += hdr_sz; + per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu); + + for (i = 0; i < *nr_cpu; i++) { + metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz); + if (!metadata[i]) + goto err_per_cpu_metadata; + + ptr += per_cpu_sz / sizeof(u64); + } + + return metadata; + +err_per_cpu_metadata: + arm_spe__free_metadata(metadata, *nr_cpu); + return NULL; +} + static void arm_spe_free_queue(void *priv) { struct arm_spe_queue *speq = priv; @@ -1025,6 +1379,7 @@ static void arm_spe_free_queue(void *priv) thread__zput(speq->thread); arm_spe_decoder_free(speq->decoder); zfree(&speq->event_buf); + zfree(&speq->last_branch); free(speq); } @@ -1050,6 +1405,7 @@ static void arm_spe_free(struct perf_session *session) auxtrace_heap__free(&spe->heap); arm_spe_free_events(session); session->auxtrace = NULL; + arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu); free(spe); } @@ -1061,45 +1417,60 @@ static bool arm_spe_evsel_is_auxtrace(struct perf_session *session, return evsel->core.attr.type == spe->pmu_type; } -static const char * const arm_spe_info_fmts[] = { - [ARM_SPE_PMU_TYPE] = " PMU Type %"PRId64"\n", +static const char * const metadata_hdr_v1_fmts[] = { + [ARM_SPE_PMU_TYPE] = " PMU Type :%"PRId64"\n", + [ARM_SPE_PER_CPU_MMAPS] = " Per CPU mmaps :%"PRId64"\n", }; -static void arm_spe_print_info(__u64 *arr) -{ - if (!dump_trace) - return; - - fprintf(stdout, arm_spe_info_fmts[ARM_SPE_PMU_TYPE], arr[ARM_SPE_PMU_TYPE]); -} +static const char * const metadata_hdr_fmts[] = { + [ARM_SPE_HEADER_VERSION] = " Header version :%"PRId64"\n", + [ARM_SPE_HEADER_SIZE] = " Header size :%"PRId64"\n", + [ARM_SPE_PMU_TYPE_V2] = " PMU type v2 :%"PRId64"\n", + [ARM_SPE_CPUS_NUM] = " CPU number :%"PRId64"\n", +}; -struct arm_spe_synth { - struct perf_tool dummy_tool; - struct perf_session *session; +static const char * const metadata_per_cpu_fmts[] = { + [ARM_SPE_MAGIC] = " Magic :0x%"PRIx64"\n", + [ARM_SPE_CPU] = " CPU # :%"PRId64"\n", + [ARM_SPE_CPU_NR_PARAMS] = " Num of params :%"PRId64"\n", + [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n", + [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n", + [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n", }; -static int arm_spe_event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) +static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr) { - struct arm_spe_synth *arm_spe_synth = - container_of(tool, struct arm_spe_synth, dummy_tool); + unsigned int i, cpu, hdr_size, cpu_num, cpu_size; + const char * const *hdr_fmts; - return perf_session__deliver_synth_event(arm_spe_synth->session, - event, NULL); -} + if (!dump_trace) + return; -static int arm_spe_synth_event(struct perf_session *session, - struct perf_event_attr *attr, u64 id) -{ - struct arm_spe_synth arm_spe_synth; + if (spe->metadata_ver == 1) { + cpu_num = 0; + hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX; + hdr_fmts = metadata_hdr_v1_fmts; + } else { + cpu_num = arr[ARM_SPE_CPUS_NUM]; + hdr_size = arr[ARM_SPE_HEADER_SIZE]; + hdr_fmts = metadata_hdr_fmts; + } - memset(&arm_spe_synth, 0, sizeof(struct arm_spe_synth)); - arm_spe_synth.session = session; + for (i = 0; i < hdr_size; i++) + fprintf(stdout, hdr_fmts[i], arr[i]); - return perf_event__synthesize_attr(&arm_spe_synth.dummy_tool, attr, 1, - &id, arm_spe_event_synth); + arr += hdr_size; + for (cpu = 0; cpu < cpu_num; cpu++) { + /* + * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS + * are fixed. The sequential parameter size is decided by the + * field 'ARM_SPE_CPU_NR_PARAMS'. + */ + cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS]; + for (i = 0; i < cpu_size; i++) + fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]); + arr += cpu_size; + } } static void arm_spe_set_event_name(struct evlist *evlist, u64 id, @@ -1172,7 +1543,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_flc = true; /* Level 1 data cache miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->l1d_miss_id = id; @@ -1180,7 +1551,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* Level 1 data cache access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->l1d_access_id = id; @@ -1192,7 +1563,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_llc = true; /* Last level cache miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->llc_miss_id = id; @@ -1200,7 +1571,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* Last level cache access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->llc_access_id = id; @@ -1212,7 +1583,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_tlb = true; /* TLB miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->tlb_miss_id = id; @@ -1220,7 +1591,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* TLB access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->tlb_access_id = id; @@ -1228,15 +1599,28 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; } + if (spe->synth_opts.last_branch) { + if (spe->synth_opts.last_branch_sz > 2) + pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n"); + + attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; + /* + * We don't use the hardware index, but the sample generation + * code uses the new format branch_stack with this field, + * so the event attributes must indicate that it's present. + */ + attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; + } + if (spe->synth_opts.branches) { spe->sample_branch = true; - /* Branch miss */ - err = arm_spe_synth_event(session, &attr, id); + /* Branch */ + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; - spe->branch_miss_id = id; - arm_spe_set_event_name(evlist, id, "branch-miss"); + spe->branch_id = id; + arm_spe_set_event_name(evlist, id, "branch"); id += 1; } @@ -1244,7 +1628,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_remote_access = true; /* Remote access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->remote_access_id = id; @@ -1255,7 +1639,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) if (spe->synth_opts.mem) { spe->sample_memory = true; - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->memory_id = id; @@ -1276,7 +1660,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) attr.config = PERF_COUNT_HW_INSTRUCTIONS; attr.sample_period = spe->synth_opts.period; spe->instructions_sample_period = attr.sample_period; - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->instructions_id = id; @@ -1287,24 +1671,57 @@ synth_instructions_out: return 0; } +static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu) +{ + u64 midr; + int i; + + if (!nr_cpu) + return false; + + for (i = 0; i < nr_cpu; i++) { + if (!metadata[i]) + return false; + + if (i == 0) { + midr = metadata[i][ARM_SPE_CPU_MIDR]; + continue; + } + + if (midr != metadata[i][ARM_SPE_CPU_MIDR]) + return false; + } + + return true; +} + int arm_spe_process_auxtrace_info(union perf_event *event, struct perf_session *session) { struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; - size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX; + size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE; struct perf_record_time_conv *tc = &session->time_conv; - const char *cpuid = perf_env__cpuid(session->evlist->env); - u64 midr = strtol(cpuid, NULL, 16); struct arm_spe *spe; - int err; + u64 **metadata = NULL; + u64 metadata_ver; + int nr_cpu, err; if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) + min_sz) return -EINVAL; + metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver, + &nr_cpu); + if (!metadata && metadata_ver != 1) { + pr_err("Failed to parse Arm SPE metadata.\n"); + return -EINVAL; + } + spe = zalloc(sizeof(struct arm_spe)); - if (!spe) - return -ENOMEM; + if (!spe) { + err = -ENOMEM; + goto err_free_metadata; + } err = auxtrace_queues__init(&spe->queues); if (err) @@ -1313,8 +1730,14 @@ int arm_spe_process_auxtrace_info(union perf_event *event, spe->session = session; spe->machine = &session->machines.host; /* No kvm support */ spe->auxtrace_type = auxtrace_info->type; - spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; - spe->midr = midr; + if (metadata_ver == 1) + spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; + else + spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2]; + spe->metadata = metadata; + spe->metadata_ver = metadata_ver; + spe->metadata_nr_cpu = nr_cpu; + spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu); spe->timeless_decoding = arm_spe__is_timeless_decoding(spe); @@ -1347,7 +1770,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event, spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace; session->auxtrace = &spe->auxtrace; - arm_spe_print_info(&auxtrace_info->priv[0]); + arm_spe_print_info(spe, &auxtrace_info->priv[0]); if (dump_trace) return 0; @@ -1375,5 +1798,7 @@ err_free_queues: session->auxtrace = NULL; err_free: free(spe); +err_free_metadata: + arm_spe__free_metadata(metadata, nr_cpu); return err; } |