author		Linus Torvalds <torvalds@linux-foundation.org>	2024-03-14 16:31:23 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-03-14 16:31:23 -0700
commit		1bbeaf83dd7b5e3628b98bec66ff8fe2646e14aa (patch)
tree		a391eed8ae206613b48e02e56e6ad5c4432d8767 /tools/perf/util/threads.c
parent		63bd30f249dcf0a7ce16967935cecee8feec24bb (diff)
parent		0f66dfe7b91d2743cc71dfff37af503215b204ef (diff)
Merge tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools
Pull perf tools updates from Namhyung Kim:
"perf stat:
- Support new 'cluster' aggregation mode for shared resources
depending on the hardware configuration:
$ sudo perf stat -a --per-cluster -e cycles,instructions sleep 1
Performance counter stats for 'system wide':
S0-D0-CLS0 2 85,051,822 cycles
S0-D0-CLS0 2 73,909,908 instructions # 0.87 insn per cycle
S0-D0-CLS2 2 93,365,918 cycles
S0-D0-CLS2 2 83,006,158 instructions # 0.89 insn per cycle
S0-D0-CLS4 2 104,157,523 cycles
S0-D0-CLS4 2 53,234,396 instructions # 0.51 insn per cycle
S0-D0-CLS6 2 65,891,079 cycles
S0-D0-CLS6 2 41,478,273 instructions # 0.63 insn per cycle
1.002407989 seconds time elapsed
- Various fixes and cleanups for event metrics including NaN handling
perf script:
- Use libcapstone if available to disassemble the instructions. This
enables 'perf script -F disasm' and 'perf script --insn-trace=disasm'
(for Intel-PT):
$ perf script -F event,ip,disasm
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa9839d25 movq %rax, %r14
cycles:P: ffffffffa9cdcaf0 endbr64
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa99c4de5 movq 0x30(%rcx), %r8
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa9907983 movl 0x68(%rbx), %eax
cycles:P: ffffffffa988d428 wrmsr
- Expose sample ID / stream ID to python scripts
perf test:
- Add more perf test cases from Red Hat internal test suites. This
time it adds the base infra and a few perf probe tests. More to
come. :)
- Add 'perf test -p' for parallel execution and fix some issues found
by the parallel test
- Support symbol test to print symbols in given (active) module:
$ perf test -F -v Symbols --dso /lib/modules/$(uname -r)/kernel/fs/ext4/ext4.ko
--- start ---
Testing /lib/modules/6.5.13-1rodete2-amd64/kernel/fs/ext4/ext4.ko
Overlapping symbols:
7a990-7a9a0 l __pfx_ext4_exit_fs
7a990-7a9a0 g __pfx_cleanup_module
Overlapping symbols:
7a9a0-7aa1c l ext4_exit_fs
7a9a0-7aa1c g cleanup_module
...
JSON metric updates:
- A new round of Intel metric updates
- Support Power11 PVR (compatible to Power10)
- Fix cache latency events on Zen 4 to set SliceId properly
Internal:
- Fix reference counting for 'map' data structure, tireless work from
Ian!
- More memory optimization for struct thread and annotate histogram.
Now, 'perf report' (TUI) and 'perf annotate' should be much
lighter-weight in terms of memory footprint
- Support cross-arch perf register access. Clean up the build
configuration so that it can detect arch-register support at
runtime. This allows parsing register data in samples that were
recorded on a different arch
Others:
- Sync task state in 'perf sched' with the kernel using trace event
fields. The task states have been changed, so tools cannot assume a
fixed encoding
- Clean up 'perf mem' to generalize the arch-specific events
- Add support for local and global variables to data type profiling.
This would increase the success rate of type resolution with DWARF
- Add short option -H for --hierarchy in 'perf report' and 'perf top'"
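As a rough illustration of the last two items, they might be invoked as
below. Only '-H' being a short form of --hierarchy is stated in the message
above; the recording step and the 'type'/'typeoff' sort keys are assumptions
based on the existing data-type profiling workflow:
    $ perf mem record -a -- sleep 1
    $ perf report -s type,typeoff      # data-type profiling view (needs DWARF)
    $ perf report -H                   # short form of --hierarchy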
* tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (154 commits)
perf annotate: Add comments in the data structures
perf annotate: Remove sym_hist.addr[] array
perf annotate: Calculate instruction overhead using hashmap
perf annotate: Add a hashmap for symbol histogram
perf threads: Reduce table size from 256 to 8
perf threads: Switch from rbtree to hashmap
perf threads: Move threads to its own files
perf machine: Move machine's threads into its own abstraction
perf machine: Move fprintf to for_each loop and a callback
perf trace: Ignore thread hashing in summary
perf report: Sort child tasks by tid
perf vendor events amd: Fix Zen 4 cache latency events
perf version: Display availability of OpenCSD support
perf vendor events intel: Add umasks/occ_sel to PCU events.
perf map: Fix map reference count issues
libperf evlist: Avoid out-of-bounds access
perf lock contention: Account contending locks too
perf metrics: Fix segv for metrics with no events
perf metrics: Fix metric matching
perf pmu: Fix a potential memory leak in perf_pmu__lookup()
...
Diffstat (limited to 'tools/perf/util/threads.c')
-rw-r--r--   tools/perf/util/threads.c   190
1 file changed, 190 insertions, 0 deletions
diff --git a/tools/perf/util/threads.c b/tools/perf/util/threads.c
new file mode 100644
index 000000000000..ff2b169e0085
--- /dev/null
+++ b/tools/perf/util/threads.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "threads.h"
+#include "machine.h"
+#include "thread.h"
+
+static struct threads_table_entry *threads__table(struct threads *threads, pid_t tid)
+{
+	/* Cast it to handle tid == -1 */
+	return &threads->table[(unsigned int)tid % THREADS__TABLE_SIZE];
+}
+
+static size_t key_hash(long key, void *ctx __maybe_unused)
+{
+	/* The table lookup removes low bit entropy, but this is just ignored here. */
+	return key;
+}
+
+static bool key_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+	return key1 == key2;
+}
+
+void threads__init(struct threads *threads)
+{
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+
+		hashmap__init(&table->shard, key_hash, key_equal, NULL);
+		init_rwsem(&table->lock);
+		table->last_match = NULL;
+	}
+}
+
+void threads__exit(struct threads *threads)
+{
+	threads__remove_all_threads(threads);
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+
+		hashmap__clear(&table->shard);
+		exit_rwsem(&table->lock);
+	}
+}
+
+size_t threads__nr(struct threads *threads)
+{
+	size_t nr = 0;
+
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+
+		down_read(&table->lock);
+		nr += hashmap__size(&table->shard);
+		up_read(&table->lock);
+	}
+	return nr;
+}
+
+/*
+ * Front-end cache - TID lookups come in blocks,
+ * so most of the time we dont have to look up
+ * the full rbtree:
+ */
+static struct thread *__threads_table_entry__get_last_match(struct threads_table_entry *table,
+							     pid_t tid)
+{
+	struct thread *th, *res = NULL;
+
+	th = table->last_match;
+	if (th != NULL) {
+		if (thread__tid(th) == tid)
+			res = thread__get(th);
+	}
+	return res;
+}
+
+static void __threads_table_entry__set_last_match(struct threads_table_entry *table,
+						  struct thread *th)
+{
+	thread__put(table->last_match);
+	table->last_match = thread__get(th);
+}
+
+static void threads_table_entry__set_last_match(struct threads_table_entry *table,
+						struct thread *th)
+{
+	down_write(&table->lock);
+	__threads_table_entry__set_last_match(table, th);
+	up_write(&table->lock);
+}
+
+struct thread *threads__find(struct threads *threads, pid_t tid)
+{
+	struct threads_table_entry *table = threads__table(threads, tid);
+	struct thread *res;
+
+	down_read(&table->lock);
+	res = __threads_table_entry__get_last_match(table, tid);
+	if (!res) {
+		if (hashmap__find(&table->shard, tid, &res))
+			res = thread__get(res);
+	}
+	up_read(&table->lock);
+	if (res)
+		threads_table_entry__set_last_match(table, res);
+	return res;
+}
+
+struct thread *threads__findnew(struct threads *threads, pid_t pid, pid_t tid, bool *created)
+{
+	struct threads_table_entry *table = threads__table(threads, tid);
+	struct thread *res = NULL;
+
+	*created = false;
+	down_write(&table->lock);
+	res = thread__new(pid, tid);
+	if (res) {
+		if (hashmap__add(&table->shard, tid, res)) {
+			/* Add failed. Assume a race so find other entry. */
+			thread__put(res);
+			res = NULL;
+			if (hashmap__find(&table->shard, tid, &res))
+				res = thread__get(res);
+		} else {
+			res = thread__get(res);
+			*created = true;
+		}
+		if (res)
+			__threads_table_entry__set_last_match(table, res);
+	}
+	up_write(&table->lock);
+	return res;
+}
+
+void threads__remove_all_threads(struct threads *threads)
+{
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+		struct hashmap_entry *cur, *tmp;
+		size_t bkt;
+
+		down_write(&table->lock);
+		__threads_table_entry__set_last_match(table, NULL);
+		hashmap__for_each_entry_safe((&table->shard), cur, tmp, bkt) {
+			struct thread *old_value;
+
+			hashmap__delete(&table->shard, cur->key, /*old_key=*/NULL, &old_value);
+			thread__put(old_value);
+		}
+		up_write(&table->lock);
+	}
+}
+
+void threads__remove(struct threads *threads, struct thread *thread)
+{
+	struct threads_table_entry *table = threads__table(threads, thread__tid(thread));
+	struct thread *old_value;
+
+	down_write(&table->lock);
+	if (table->last_match && RC_CHK_EQUAL(table->last_match, thread))
+		__threads_table_entry__set_last_match(table, NULL);
+
+	hashmap__delete(&table->shard, thread__tid(thread), /*old_key=*/NULL, &old_value);
+	thread__put(old_value);
+	up_write(&table->lock);
+}
+
+int threads__for_each_thread(struct threads *threads,
+			     int (*fn)(struct thread *thread, void *data),
+			     void *data)
+{
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+		struct hashmap_entry *cur;
+		size_t bkt;
+
+		down_read(&table->lock);
+		hashmap__for_each_entry((&table->shard), cur, bkt) {
+			int rc = fn((struct thread *)cur->pvalue, data);
+
+			if (rc != 0) {
+				up_read(&table->lock);
+				return rc;
+			}
+		}
+		up_read(&table->lock);
+	}
+	return 0;
+
+}
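The new file implements the thread table as a small sharded structure: the tid
picks one of THREADS__TABLE_SIZE buckets, each bucket pairs a hashmap with a
reader-writer lock, and a per-bucket last_match pointer short-circuits repeated
lookups of the same tid. The sketch below is a minimal user-space illustration
of that layout under stated assumptions: plain pthreads, a toy chained hash
map, and hypothetical names (shard_table, table__find, table__insert). It is
not the perf code and it omits the thread__get()/thread__put() reference
counting the real file relies on.

/* Illustrative sketch only: a sharded key -> value table with a per-shard
 * last-match cache, mirroring the locking scheme of threads.c above.
 * All names here are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_SHARDS	8	/* threads.c likewise uses 8 buckets */
#define NR_CHAINS	64

struct node {
	int key;		/* e.g. a tid */
	void *value;		/* in perf this would be a refcounted struct thread */
	struct node *next;
};

struct shard {
	pthread_rwlock_t lock;
	struct node *chains[NR_CHAINS];
	struct node *last_match;	/* front-end cache for bursts of lookups */
};

struct shard_table {
	struct shard shards[NR_SHARDS];
};

static struct shard *table__shard(struct shard_table *t, int key)
{
	/* Cast to unsigned so key == -1 still lands in a valid shard. */
	return &t->shards[(unsigned int)key % NR_SHARDS];
}

void table__init(struct shard_table *t)
{
	for (int i = 0; i < NR_SHARDS; i++) {
		pthread_rwlock_init(&t->shards[i].lock, NULL);
		for (int j = 0; j < NR_CHAINS; j++)
			t->shards[i].chains[j] = NULL;
		t->shards[i].last_match = NULL;
	}
}

void *table__find(struct shard_table *t, int key)
{
	struct shard *s = table__shard(t, key);
	void *res = NULL;

	pthread_rwlock_rdlock(&s->lock);
	/* Fast path: lookups tend to arrive in runs for the same key. */
	if (s->last_match && s->last_match->key == key) {
		res = s->last_match->value;
	} else {
		for (struct node *n = s->chains[(unsigned int)key % NR_CHAINS]; n; n = n->next) {
			if (n->key == key) {
				res = n->value;
				break;
			}
		}
	}
	pthread_rwlock_unlock(&s->lock);
	return res;
}

int table__insert(struct shard_table *t, int key, void *value)
{
	struct shard *s = table__shard(t, key);
	unsigned int chain = (unsigned int)key % NR_CHAINS;
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return -1;
	n->key = key;
	n->value = value;

	pthread_rwlock_wrlock(&s->lock);
	n->next = s->chains[chain];
	s->chains[chain] = n;
	s->last_match = n;	/* seed the cache, as threads__findnew() does */
	pthread_rwlock_unlock(&s->lock);
	return 0;
}

int main(void)
{
	struct shard_table t;
	int tid = 1234;

	table__init(&t);
	table__insert(&t, tid, "worker");
	printf("%s\n", (char *)table__find(&t, tid));	/* prints "worker" */
	return 0;
}

One difference worth noting: the real threads__find() also refreshes last_match
after a hashmap hit by retaking the shard lock in write mode; the sketch only
seeds the cache on insert, so its read path stays strictly read-locked.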