diff options
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r-- | tools/perf/util/arm-spe.c | 263 |
1 files changed, 184 insertions, 79 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 8942fa598a84..71be979f5077 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -39,6 +39,18 @@ #define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST)) +#define ARM_SPE_CACHE_EVENT(lvl) \ + (ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS) + +#define arm_spe_is_cache_level(type, lvl) \ + ((type) & ARM_SPE_CACHE_EVENT(lvl)) + +#define arm_spe_is_cache_hit(type, lvl) \ + (((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS) + +#define arm_spe_is_cache_miss(type, lvl) \ + ((type) & ARM_SPE_##lvl##_MISS) + struct arm_spe { struct auxtrace auxtrace; struct auxtrace_queues queues; @@ -62,7 +74,6 @@ struct arm_spe { u8 sample_remote_access; u8 sample_memory; u8 sample_instructions; - u64 instructions_sample_period; u64 l1d_miss_id; u64 l1d_access_id; @@ -101,7 +112,7 @@ struct arm_spe_queue { u64 time; u64 timestamp; struct thread *thread; - u64 period_instructions; + u64 sample_count; u32 flags; struct branch_stack *last_branch; }; @@ -228,7 +239,6 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, speq->pid = -1; speq->tid = -1; speq->cpu = -1; - speq->period_instructions = 0; /* params set */ params.get_trace = arm_spe_get_trace; @@ -305,15 +315,28 @@ static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid) return 0; } -static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu) +static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu) { u64 i; if (!spe->metadata) return NULL; + /* CPU ID is -1 for per-thread mode */ + if (cpu < 0) { + /* + * On the heterogeneous system, due to CPU ID is -1, + * cannot confirm the data source packet is supported. + */ + if (!spe->is_homogeneous) + return NULL; + + /* In homogeneous system, simply use CPU0's metadata */ + return spe->metadata[0]; + } + for (i = 0; i < spe->metadata_nr_cpu; i++) - if (spe->metadata[i][ARM_SPE_CPU] == cpu) + if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu) return spe->metadata[i]; return NULL; @@ -352,7 +375,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe, sample->cpumode = arm_spe_cpumode(spe, sample->ip); sample->pid = speq->pid; sample->tid = speq->tid; - sample->period = 1; + sample->period = spe->synth_opts.period; sample->cpu = speq->cpu; sample->simd_flags = arm_spe__synth_simd_flags(record); @@ -471,7 +494,8 @@ arm_spe_deliver_synth_event(struct arm_spe *spe, } static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, - u64 spe_events_id, u64 data_src) + u64 spe_events_id, + union perf_mem_data_src data_src) { struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; @@ -486,7 +510,7 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, sample.stream_id = spe_events_id; sample.addr = record->virt_addr; sample.phys_addr = record->phys_addr; - sample.data_src = data_src; + sample.data_src = data_src.val; sample.weight = record->latency; ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); @@ -519,7 +543,8 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, } static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, - u64 spe_events_id, u64 data_src) + u64 spe_events_id, + union perf_mem_data_src data_src) { struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; @@ -527,14 +552,6 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, struct perf_sample sample; int ret; - /* - * Handles perf instruction sampling period. - */ - speq->period_instructions++; - if (speq->period_instructions < spe->instructions_sample_period) - return 0; - speq->period_instructions = 0; - perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); @@ -542,8 +559,7 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, sample.stream_id = spe_events_id; sample.addr = record->to_ip; sample.phys_addr = record->phys_addr; - sample.data_src = data_src; - sample.period = spe->instructions_sample_period; + sample.data_src = data_src.val; sample.weight = record->latency; sample.flags = speq->flags; sample.branch_stack = speq->last_branch; @@ -670,8 +686,8 @@ static void arm_spe__synth_data_source_common(const struct arm_spe_record *recor * socket */ case ARM_SPE_COMMON_DS_REMOTE: - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1; - data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; + data_src->mem_lvl = PERF_MEM_LVL_NA; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; break; @@ -819,30 +835,121 @@ static const struct data_source_handle data_source_handles[] = { DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip), }; -static void arm_spe__synth_memory_level(const struct arm_spe_record *record, - union perf_mem_data_src *data_src) +static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) { - if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { - data_src->mem_lvl = PERF_MEM_LVL_L3; + /* + * To find a cache hit, search in ascending order from the lower level + * caches to the higher level caches. This reflects the best scenario + * for a cache hit. + */ + if (arm_spe_is_cache_hit(record->type, L1D)) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + } else if (record->type & ARM_SPE_RECENTLY_FETCHED) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB; + } else if (arm_spe_is_cache_hit(record->type, L2D)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + } else if (arm_spe_is_cache_hit(record->type, LLC)) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + /* + * To find a cache miss, search in descending order from the higher + * level cache to the lower level cache. This represents the worst + * scenario for a cache miss. + */ + } else if (arm_spe_is_cache_miss(record->type, LLC)) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + } else if (arm_spe_is_cache_miss(record->type, L2D)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + } else if (arm_spe_is_cache_miss(record->type, L1D)) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + } +} - if (record->type & ARM_SPE_LLC_MISS) - data_src->mem_lvl |= PERF_MEM_LVL_MISS; - else - data_src->mem_lvl |= PERF_MEM_LVL_HIT; - } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { +static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ + /* Record the greatest level info for a store operation. */ + if (arm_spe_is_cache_level(record->type, LLC)) { + data_src->mem_lvl = PERF_MEM_LVL_L3; + data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ? + PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + } else if (arm_spe_is_cache_level(record->type, L2D)) { + data_src->mem_lvl = PERF_MEM_LVL_L2; + data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ? + PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + } else if (arm_spe_is_cache_level(record->type, L1D)) { data_src->mem_lvl = PERF_MEM_LVL_L1; + data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ? + PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + } +} - if (record->type & ARM_SPE_L1D_MISS) - data_src->mem_lvl |= PERF_MEM_LVL_MISS; - else - data_src->mem_lvl |= PERF_MEM_LVL_HIT; +static void arm_spe__synth_memory_level(struct arm_spe_queue *speq, + const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ + struct arm_spe *spe = speq->spe; + + /* + * The data source packet contains more info for cache levels for + * peer snooping. So respect the memory level if has been set by + * data source parsing. + */ + if (!data_src->mem_lvl) { + if (data_src->mem_op == PERF_MEM_OP_LOAD) + arm_spe__synth_ld_memory_level(record, data_src); + if (data_src->mem_op == PERF_MEM_OP_STORE) + arm_spe__synth_st_memory_level(record, data_src); + } + + if (!data_src->mem_lvl) { + data_src->mem_lvl = PERF_MEM_LVL_NA; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; + } + + /* + * If 'mem_snoop' has been set by data source packet, skip to set + * it at here. + */ + if (!data_src->mem_snoop) { + if (record->type & ARM_SPE_DATA_SNOOPED) { + if (record->type & ARM_SPE_HITM) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + else + data_src->mem_snoop = PERF_MEM_SNOOP_HIT; + } else { + u64 *metadata = + arm_spe__get_metadata_by_cpu(spe, speq->cpu); + + /* + * Set NA ("Not available") mode if no meta data or the + * SNOOPED event is not supported. + */ + if (!metadata || + !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED)) + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + else + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; + } } - if (record->type & ARM_SPE_REMOTE_ACCESS) - data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; + if (!data_src->mem_remote) { + if (record->type & ARM_SPE_REMOTE_ACCESS) + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + } } -static bool arm_spe__synth_ds(struct arm_spe_queue *speq, +static void arm_spe__synth_ds(struct arm_spe_queue *speq, const struct arm_spe_record *record, union perf_mem_data_src *data_src) { @@ -859,56 +966,41 @@ static bool arm_spe__synth_ds(struct arm_spe_queue *speq, cpuid = perf_env__cpuid(perf_session__env(spe->session)); midr = strtol(cpuid, NULL, 16); } else { - /* CPU ID is -1 for per-thread mode */ - if (speq->cpu < 0) { - /* - * On the heterogeneous system, due to CPU ID is -1, - * cannot confirm the data source packet is supported. - */ - if (!spe->is_homogeneous) - return false; - - /* In homogeneous system, simply use CPU0's metadata */ - if (spe->metadata) - metadata = spe->metadata[0]; - } else { - metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu); - } - + metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu); if (!metadata) - return false; + return; midr = metadata[ARM_SPE_CPU_MIDR]; } for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) { if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) { - data_source_handles[i].ds_synth(record, data_src); - return true; + return data_source_handles[i].ds_synth(record, data_src); } } - return false; + return; } -static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq, - const struct arm_spe_record *record) +static union perf_mem_data_src +arm_spe__synth_data_source(struct arm_spe_queue *speq, + const struct arm_spe_record *record) { - union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA }; + union perf_mem_data_src data_src = {}; /* Only synthesize data source for LDST operations */ if (!is_ldst_op(record->op)) - return 0; + return data_src; if (record->op & ARM_SPE_OP_LD) data_src.mem_op = PERF_MEM_OP_LOAD; else if (record->op & ARM_SPE_OP_ST) data_src.mem_op = PERF_MEM_OP_STORE; else - return 0; + return data_src; - if (!arm_spe__synth_ds(speq, record, &data_src)) - arm_spe__synth_memory_level(record, &data_src); + arm_spe__synth_ds(speq, record, &data_src); + arm_spe__synth_memory_level(speq, record, &data_src); if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { data_src.mem_dtlb = PERF_MEM_TLB_WK; @@ -919,16 +1011,24 @@ static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq, data_src.mem_dtlb |= PERF_MEM_TLB_HIT; } - return data_src.val; + return data_src; } static int arm_spe_sample(struct arm_spe_queue *speq) { const struct arm_spe_record *record = &speq->decoder->record; struct arm_spe *spe = speq->spe; - u64 data_src; + union perf_mem_data_src data_src; int err; + /* + * Discard all samples until period is reached + */ + speq->sample_count++; + if (speq->sample_count < spe->synth_opts.period) + return 0; + speq->sample_count = 0; + arm_spe__sample_flags(speq); data_src = arm_spe__synth_data_source(speq, record); @@ -1532,6 +1632,7 @@ static const char * const metadata_per_cpu_fmts[] = { [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n", [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n", [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n", + [ARM_SPE_CAP_EVENT_FILTER] = " Event Filter :0x%"PRIx64"\n", }; static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr) @@ -1628,6 +1729,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) attr.exclude_guest = evsel->core.attr.exclude_guest; attr.sample_id_all = evsel->core.attr.sample_id_all; attr.read_format = evsel->core.attr.read_format; + attr.sample_period = spe->synth_opts.period; /* create new id val to be a fixed offset from evsel id */ id = evsel->core.id[0] + 1000000000; @@ -1744,25 +1846,15 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) } if (spe->synth_opts.instructions) { - if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) { - pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n"); - goto synth_instructions_out; - } - if (spe->synth_opts.period > 1) - pr_warning("Arm SPE has a hardware-based sample period.\n" - "Additional instruction events will be discarded by --itrace\n"); - spe->sample_instructions = true; attr.config = PERF_COUNT_HW_INSTRUCTIONS; - attr.sample_period = spe->synth_opts.period; - spe->instructions_sample_period = attr.sample_period; + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->instructions_id = id; arm_spe_set_event_name(evlist, id, "instructions"); } -synth_instructions_out: return 0; } @@ -1871,10 +1963,23 @@ int arm_spe_process_auxtrace_info(union perf_event *event, if (dump_trace) return 0; - if (session->itrace_synth_opts && session->itrace_synth_opts->set) + if (session->itrace_synth_opts && session->itrace_synth_opts->set) { spe->synth_opts = *session->itrace_synth_opts; - else + } else { itrace_synth_opts__set_default(&spe->synth_opts, false); + /* Default nanoseconds period not supported */ + spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS; + spe->synth_opts.period = 1; + } + + if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) { + ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n"); + err = -EINVAL; + goto err_free_queues; + } + if (spe->synth_opts.period > 1) + ui__warning("Arm SPE has a hardware-based sampling period.\n\n" + "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n"); err = arm_spe_synth_events(spe, session); if (err) |