diff options
Diffstat (limited to 'tools/perf/Documentation')
-rw-r--r-- | tools/perf/Documentation/perf-amd-ibs.txt | 68 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-c2c.txt | 11 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-config.txt | 4 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-list.txt | 9 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-lock.txt | 15 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-mem.txt | 82 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-record.txt | 16 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-report.txt | 1 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-stat.txt | 7 | ||||
-rw-r--r-- | tools/perf/Documentation/perf-trace.txt | 9 | ||||
-rw-r--r-- | tools/perf/Documentation/perf.data-file-format.txt | 24 |
11 files changed, 218 insertions, 28 deletions
diff --git a/tools/perf/Documentation/perf-amd-ibs.txt b/tools/perf/Documentation/perf-amd-ibs.txt index 2fd31d9d7b71..548549935760 100644 --- a/tools/perf/Documentation/perf-amd-ibs.txt +++ b/tools/perf/Documentation/perf-amd-ibs.txt @@ -85,6 +85,15 @@ System-wide profile, uOps event, sampling period: 100000, L3MissOnly (Zen4 onwar # perf record -e ibs_op/cnt_ctl=1,l3missonly=1/ -c 100000 -a +System-wide profile, cycles event, sampling period: 100000, LdLat filtering (Zen5 +onward) + + # perf record -e ibs_op/ldlat=128/ -c 100000 -a + + Supported load latency threshold values are 128 to 2048 (both inclusive). + Latency value which is a multiple of 128 incurs a little less profiling + overhead compared to other values. + Per process(upstream v6.2 onward), uOps event, sampling period: 100000 # perf record -e ibs_op/cnt_ctl=1/ -c 100000 -p 1234 @@ -162,23 +171,48 @@ Below is a simple example of the perf mem tool. # perf mem report A normal perf mem report output will provide detailed memory access profile. -However, it can also be aggregated based on output fields. For example: - - # perf mem report -F mem,sample,snoop - Samples: 3M of event 'ibs_op//', Event count (approx.): 23524876 - Memory access Samples Snoop - N/A 1903343 N/A - L1 hit 1056754 N/A - L2 hit 75231 N/A - L3 hit 9496 HitM - L3 hit 2270 N/A - RAM hit 8710 N/A - Remote node, same socket RAM hit 3241 N/A - Remote core, same node Any cache hit 1572 HitM - Remote core, same node Any cache hit 514 N/A - Remote node, same socket Any cache hit 1216 HitM - Remote node, same socket Any cache hit 350 N/A - Uncached hit 18 N/A +New output fields will show related access info together. For example: + + # perf mem report -F overhead,cache,snoop,comm + ... + # Samples: 92K of event 'ibs_op//' + # Total weight : 531104 + # + # ---------- Cache ----------- --- Snoop ---- + # Overhead L1 L2 L1-buf Other HitM Other Command + # ........ ............................ .............. .......... 
+ # + 76.07% 5.8% 35.7% 0.0% 34.6% 23.3% 52.8% cc1 + 5.79% 0.2% 0.0% 0.0% 5.6% 0.1% 5.7% make + 5.78% 0.1% 4.4% 0.0% 1.2% 0.5% 5.3% gcc + 5.33% 0.3% 3.9% 0.0% 1.1% 0.2% 5.2% as + 5.00% 0.1% 3.8% 0.0% 1.0% 0.3% 4.7% sh + 1.56% 0.1% 0.1% 0.0% 1.4% 0.6% 0.9% ld + 0.28% 0.1% 0.0% 0.0% 0.2% 0.1% 0.2% pkg-config + 0.09% 0.0% 0.0% 0.0% 0.1% 0.0% 0.1% git + 0.03% 0.0% 0.0% 0.0% 0.0% 0.0% 0.0% rm + ... + +Also, it can be aggregated based on various memory access info using the +sort keys. For example: + + # perf mem report -s mem,snoop + ... + # Samples: 92K of event 'ibs_op//' + # Total weight : 531104 + # Sort order : mem,snoop + # + # Overhead Samples Memory access Snoop + # ........ ............ ....................................... ............ + # + 47.99% 1509 L2 hit N/A + 25.08% 338 core, same node Any cache hit HitM + 10.24% 54374 N/A N/A + 6.77% 35938 L1 hit N/A + 6.39% 101 core, same node Any cache hit N/A + 3.50% 69 RAM hit N/A + 0.03% 158 LFB/MAB hit N/A + 0.00% 2 Uncached hit N/A Please refer to their man page for more detail. diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index 856f0dfb8e5a..f4af2dd6ab31 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -54,8 +54,15 @@ RECORD OPTIONS -l:: --ldlat:: - Configure mem-loads latency. Supported on Intel and Arm64 processors - only. Ignored on other archs. + Configure mem-loads latency. Supported on Intel, Arm64 and some AMD + processors. Ignored on other archs. + + On supported AMD processors: + - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'. + - Supported latency values are 128 to 2048 (both inclusive). + - Latency value which is a multiple of 128 incurs a little less profiling + overhead compared to other values. + - Load latency filtering is disabled by default. 
-k:: --all-kernel:: diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 36ebebc875ea..c6f335659667 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -708,6 +708,10 @@ intel-pt.*:: the maximum is exceeded there will be a "Never-ending loop" error. The default is 100000. + intel-pt.all-switch-events:: + If the user has permission to do so, always record all context + switch events on all CPUs. + auxtrace.*:: auxtrace.dumpdir:: diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 8914f12d2b85..ce0735021473 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -289,6 +289,15 @@ Sums up the event counts for all hardware threads in a core, e.g.: perf stat -e cpu/event=0,umask=0x3,percore=1/ +cpu: + +Specifies the CPU to open the event upon. The value may be repeated to +specify opening the event on multiple CPUs: + + + perf stat -e instructions/cpu=0,cpu=2/,cycles/cpu=1,cpu=2/ -a sleep 1 + perf stat -e data_read/cpu=0/,data_write/cpu=1/ -a sleep 1 + EVENT GROUPS ------------ diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 859dc11a7372..c17b3e318169 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -216,6 +216,21 @@ CONTENTION OPTIONS --cgroup-filter=<value>:: Show lock contention only in the given cgroups (comma separated list). +-J:: +--inject-delay=<time@function>:: + Add delays to the given lock. It's added to the contention-end part so + that the (new) owner of the lock will be delayed. But by slowing down + the owner, the waiters will be delayed as well. This works + only with -b/--use-bpf. + + The 'time' is specified in nsec but it can have a unit suffix. Available + units are "ms", "us" and "ns". Currently it accepts up to 10ms of delays + for safety reasons. 
+ + Note that it will busy-wait after it gets the lock. Delaying locks can + have significant consequences including potential kernel crashes. Please + use it at your own risk. + SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 8a1bd9ff0f86..4d164836d094 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -28,6 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide. Due to the statistical nature of SPE sampling, not every memory operation will be sampled. +On AMD this uses the IBS Op PMU to sample load-store operations. + COMMON OPTIONS -------------- -f:: @@ -67,8 +69,15 @@ RECORD OPTIONS Configure all used events to run in user space. --ldlat <n>:: - Specify desired latency for loads event. Supported on Intel and Arm64 - processors only. Ignored on other archs. + Specify desired latency for loads event. Supported on Intel, Arm64 and + some AMD processors. Ignored on other archs. + + On supported AMD processors: + - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'. + - Supported latency values are 128 to 2048 (both inclusive). + - Latency value which is a multiple of 128 incurs a little less profiling + overhead compared to other values. + - Load latency filtering is disabled by default. REPORT OPTIONS -------------- @@ -110,6 +119,22 @@ REPORT OPTIONS And the default sort keys are changed to local_weight, mem, sym, dso, symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat. +-F:: +--fields=:: + Specify output field - multiple keys can be specified in CSV format. + Please see linkperf:perf-report[1] for details. + + In addition to the default fields, 'perf mem report' will provide the + following fields to break down sample periods. + + - op: operation in the sample instruction (load, store, prefetch, ...) + - cache: location in CPU cache (L1, L2, ...) 
where the sample hit + - mem: location in memory or other places the sample hit + - dtlb: location in Data TLB (L1, L2) where the sample hit + - snoop: snoop result for the sampled data access + + Please take a look at the OUTPUT FIELD SELECTION section for caveats. + -T:: --type-profile:: Show data-type profile result instead of code symbols. This requires @@ -128,6 +153,59 @@ REPORT OPTIONS In addition, for report all perf report options are valid, and for record all perf record options. +OVERHEAD CALCULATION +-------------------- +Unlike linkperf:perf-report[1], which calculates overhead from the actual +sample period, perf-mem overhead is calculated using sample weight. E.g. +there are two samples in perf.data file, both with the same sample period, +but one sample with weight 180 and the other with weight 20: + + $ perf script -F period,data_src,weight,ip,sym + 100000 629080842 |OP LOAD|LVL L3 hit|... 20 7e69b93ca524 strcmp + 100000 1a29081042 |OP LOAD|LVL RAM hit|... 180 ffffffff82429168 memcpy + + $ perf report -F overhead,symbol + 50% [.] strcmp + 50% [k] memcpy + + $ perf mem report -F overhead,symbol + 90% [k] memcpy + 10% [.] strcmp + +OUTPUT FIELD SELECTION +---------------------- +"perf mem report" adds a number of new output fields specific to data source +information in the sample. Some of them have the same name as the existing +sort keys ("mem" and "snoop"). So unlike other fields and sort keys, they'll +behave differently when they're used with -F/--fields or -s/--sort. + +Using those two as output fields will aggregate all samples together and show +a breakdown. + + $ perf mem report -F mem,snoop + ... + # ------ Memory ------- --- Snoop ---- + # RAM Uncach Other HitM Other + # ..................... .............. + # + 3.5% 0.0% 96.5% 25.1% 74.9% + +But using the same name for sort keys will aggregate samples for each type +separately. + + $ perf mem report -s mem,snoop + # Overhead Samples Memory access Snoop + # ........ ............ 
....................................... ............ + # + 47.99% 1509 L2 hit N/A + 25.08% 338 core, same node Any cache hit HitM + 10.24% 54374 N/A N/A + 6.77% 35938 L1 hit N/A + 6.39% 101 core, same node Any cache hit N/A + 3.50% 69 RAM hit N/A + 0.03% 158 LFB/MAB hit N/A + 0.00% 2 Uncached hit N/A + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-arm-spe[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index c7fc1ba265e2..612612fa2d80 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -340,7 +340,7 @@ OPTIONS -d:: --data:: - Record the sample virtual addresses. + Record the sample virtual addresses. Implies --sample-mem-info. --phys-data:: Record the sample physical addresses. @@ -368,6 +368,11 @@ OPTIONS the sample_type member of the struct perf_event_attr argument to the perf_event_open system call. +--sample-mem-info:: + Record the sample data source information for memory operations. + It requires hardware support and may work on specific events only. + Please consider using 'perf mem record' instead if you're not sure. + -n:: --no-samples:: Don't sample. @@ -837,6 +842,15 @@ filtered through the mask provided by -C option. only, as of now. So the applications built without the frame pointer might see bogus addresses. + off-cpu profiling consists of two types of samples: direct samples, which + share the same behavior as regular samples, and the accumulated + samples, stored in BPF stack trace map, presented after all the regular + samples. + +--off-cpu-thresh:: + Once a task's off-cpu time reaches this threshold (in milliseconds), it + generates a direct off-cpu sample. The default is 500ms. + --setup-filter=<action>:: Prepare BPF filter to be used by regular users. The action should be either "pin" or "unpin". The filter can be used after it's pinned. 
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 3376c4710575..acef3ff4178e 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -94,6 +94,7 @@ OPTIONS - comm: command (name) of the task which can be read via /proc/<pid>/comm - pid: command and tid of the task + - tgid: command and tgid of the task - dso: name of library or module executed at the time of sample - dso_size: size of library or module executed at the time of sample - symbol: name of function executed at the time of sample diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 2bc063672486..61d091670dee 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -506,6 +506,13 @@ this option is not set. The TPEBS hardware feature starts from Intel Granite Rapids microarchitecture. This option only exists in X86_64 and is meaningful on Intel platforms with TPEBS feature. +--tpebs-mode=[mean|min|max|last]:: +Set how retirement latency events have their sample times +combined. The default "mean" gives the average of retirement +latency. "min" or "max" give the smallest or largest retirement latency +times respectively. "last" uses the last retirement latency sample's +time. + --td-level:: Print the top-down statistics that equal the input level. It allows users to print the interested top-down metrics level instead of the diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 887dc37773d0..c1fb6056a0d3 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --summary-mode=mode:: To be used with -s or -S, to select how to show summary. By default it'll - show the syscall summary by thread. Possible values are: thread, total. 
+ show the syscall summary by thread. Possible values are: thread, total, + cgroup. --tool_stats:: Show tool stats such as number of times fd->pathname was discovered thru @@ -251,6 +252,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can better pretty-print integer flags and struct pointers. +--bpf-summary:: + Collect system call statistics in BPF. This is only for live mode and + works well with -s/--summary option where no argument information is + required. + + PAGEFAULTS ---------- diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index 010a4edcd384..cd95ba09f727 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -370,7 +370,7 @@ struct { u32 mmap_len; }; -Indicates that trace contains records of PERF_RECORD_COMPRESSED type +Indicates that trace contains records of PERF_RECORD_COMPRESSED2 type that have perf_events records in compressed form. HEADER_CPU_PMU_CAPS = 28, @@ -602,7 +602,14 @@ struct auxtrace_error_event { Describes a header feature. These are records used in pipe-mode that contain information that otherwise would be in perf.data file's header. - PERF_RECORD_COMPRESSED = 81, + PERF_RECORD_COMPRESSED = 81, /* deprecated */ + +The header is followed by compressed data frame that can be decompressed +into array of perf trace records. The size of the entire compressed event +record including the header is limited by the max value of header.size. + +It is deprecated and new files should use PERF_RECORD_COMPRESSED2 to guarantee +8-byte alignment. struct compressed_event { struct perf_event_header header; @@ -618,10 +625,17 @@ This is used, for instance, to 'perf inject' events after init and before regular events, those emitted by the kernel, to support combining guest and host records. 
+ PERF_RECORD_COMPRESSED2 = 83, -The header is followed by compressed data frame that can be decompressed -into array of perf trace records. The size of the entire compressed event -record including the header is limited by the max value of header.size. +8-byte aligned version of `PERF_RECORD_COMPRESSED`. `header.size` indicates the +total record size, including padding for 8-byte alignment, and `data_size` +specifies the actual size of the compressed data. + +struct perf_record_compressed2 { + struct perf_event_header header; + __u64 data_size; + char data[]; +}; Event types |