diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-14 16:31:23 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-14 16:31:23 -0700 |
commit | 1bbeaf83dd7b5e3628b98bec66ff8fe2646e14aa (patch) | |
tree | a391eed8ae206613b48e02e56e6ad5c4432d8767 /tools/perf/util/annotate.h | |
parent | 63bd30f249dcf0a7ce16967935cecee8feec24bb (diff) | |
parent | 0f66dfe7b91d2743cc71dfff37af503215b204ef (diff) |
Merge tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools
Pull perf tools updates from Namhyung Kim:
"perf stat:
- Support new 'cluster' aggregation mode for shared resources
depending on the hardware configuration:
$ sudo perf stat -a --per-cluster -e cycles,instructions sleep 1
Performance counter stats for 'system wide':
S0-D0-CLS0 2 85,051,822 cycles
S0-D0-CLS0 2 73,909,908 instructions # 0.87 insn per cycle
S0-D0-CLS2 2 93,365,918 cycles
S0-D0-CLS2 2 83,006,158 instructions # 0.89 insn per cycle
S0-D0-CLS4 2 104,157,523 cycles
S0-D0-CLS4 2 53,234,396 instructions # 0.51 insn per cycle
S0-D0-CLS6 2 65,891,079 cycles
S0-D0-CLS6 2 41,478,273 instructions # 0.63 insn per cycle
1.002407989 seconds time elapsed
- Various fixes and cleanups for event metrics including NaN handling
perf script:
- Use libcapstone if available to disassemble the instructions. This
enables 'perf script -F disasm' and 'perf script --insn-trace=disasm'
(for Intel-PT):
$ perf script -F event,ip,disasm
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa9839d25 movq %rax, %r14
cycles:P: ffffffffa9cdcaf0 endbr64
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa99c4de5 movq 0x30(%rcx), %r8
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa9907983 movl 0x68(%rbx), %eax
cycles:P: ffffffffa988d428 wrmsr
- Expose sample ID / stream ID to python scripts
perf test:
- Add more perf test cases from Redhat internal test suites. This
time it adds the base infra and a few perf probe tests. More to
come. :)
- Add 'perf test -p' for parallel execution and fix some issues found
by the parallel test
- Support symbol test to print symbols in given (active) module:
$ perf test -F -v Symbols --dso /lib/modules/$(uname -r)/kernel/fs/ext4/ext4.ko
--- start ---
Testing /lib/modules/6.5.13-1rodete2-amd64/kernel/fs/ext4/ext4.ko
Overlapping symbols:
7a990-7a9a0 l __pfx_ext4_exit_fs
7a990-7a9a0 g __pfx_cleanup_module
Overlapping symbols:
7a9a0-7aa1c l ext4_exit_fs
7a9a0-7aa1c g cleanup_module
...
JSON metric updates:
- A new round of Intel metric updates
- Support Power11 PVR (compatible to Power10)
- Fix cache latency events on Zen 4 to set SliceId properly
Internal:
- Fix reference counting for 'map' data structure, tireless work from
Ian!
- More memory optimization for struct thread and annotate histogram.
Now, 'perf report' (TUI) and 'perf annotate' should be much
lighter-weight in terms of memory footprint
- Support cross-arch perf register access. Clean up the build
configuration so that it can detect arch-register support at
runtime. This can allow to parse register data in sample which was
recorded in a different arch
Others:
- Sync task state in 'perf sched' to kernel using trace event fields.
The task states have been changed so tools cannot assume a fixed
encoding
- Clean up 'perf mem' to generalize the arch-specific events
- Add support for local and global variables to data type profiling.
This would increase the success rate of type resolution with DWARF
- Add short option -H for --hierarchy in 'perf report' and 'perf top'"
* tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (154 commits)
perf annotate: Add comments in the data structures
perf annotate: Remove sym_hist.addr[] array
perf annotate: Calculate instruction overhead using hashmap
perf annotate: Add a hashmap for symbol histogram
perf threads: Reduce table size from 256 to 8
perf threads: Switch from rbtree to hashmap
perf threads: Move threads to its own files
perf machine: Move machine's threads into its own abstraction
perf machine: Move fprintf to for_each loop and a callback
perf trace: Ignore thread hashing in summary
perf report: Sort child tasks by tid
perf vendor events amd: Fix Zen 4 cache latency events
perf version: Display availability of OpenCSD support
perf vendor events intel: Add umasks/occ_sel to PCU events.
perf map: Fix map reference count issues
libperf evlist: Avoid out-of-bounds access
perf lock contention: Account contending locks too
perf metrics: Fix segv for metrics with no events
perf metrics: Fix metric matching
perf pmu: Fix a potential memory leak in perf_pmu__lookup()
...
Diffstat (limited to 'tools/perf/util/annotate.h')
-rw-r--r-- | tools/perf/util/annotate.h | 98 |
1 files changed, 86 insertions, 12 deletions
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index dba50762c6e8..13cc659e508c 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -12,6 +12,7 @@ #include "symbol_conf.h" #include "mutex.h" #include "spark.h" +#include "hashmap.h" struct hist_browser_timer; struct hist_entry; @@ -238,12 +239,42 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r size_t disasm__fprintf(struct list_head *head, FILE *fp); void symbol__calc_percent(struct symbol *sym, struct evsel *evsel); +/** + * struct sym_hist - symbol histogram information for an event + * + * @nr_samples: Total number of samples. + * @period: Sum of sample periods. + */ struct sym_hist { u64 nr_samples; u64 period; - struct sym_hist_entry addr[]; }; +/** + * struct cyc_hist - (CPU) cycle histogram for a basic block + * + * @start: Start address of current block (if known). + * @cycles: Sum of cycles for the longest basic block. + * @cycles_aggr: Total cycles for this address. + * @cycles_max: Max cycles for this address. + * @cycles_min: Min cycles for this address. + * @cycles_spark: History of cycles for the longest basic block. + * @num: Number of samples for the longest basic block. + * @num_aggr: Total number of samples for this address. + * @have_start: Whether the current branch info has a start address. + * @reset: Number of resets due to a different start address. + * + * If sample has branch_stack and cycles info, it can construct basic blocks + * between two adjacent branches. It'd have start and end addresses but + * sometimes the start address may not be available. So the cycles are + * accounted at the end address. If multiple basic blocks end at the same + * address, it will take the longest one. + * + * The @start, @cycles, @cycles_spark and @num fields are used for the longest + * block only. Other fields are used for all cases. + * + * See __symbol__account_cycles(). + */ struct cyc_hist { u64 start; u64 cycles; @@ -258,18 +289,24 @@ struct cyc_hist { u16 reset; }; -/** struct annotated_source - symbols with hits have this attached as in sannotation +/** + * struct annotated_source - symbols with hits have this attached as in annotation * - * @histograms: Array of addr hit histograms per event being monitored - * nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if + * @source: List head for annotated_line (embeded in disasm_line). + * @histograms: Array of symbol histograms per event to maintain the total number + * of samples and period. + * @nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if * we have more than a group in a evlist, where we will want * to see each group separately, that is why symbol__annotate2() * sets src->nr_histograms to evsel->nr_members. - * @lines: If 'print_lines' is specified, per source code line percentages - * @source: source parsed from a disassembler like objdump -dS - * @cyc_hist: Average cycles per basic block + * @offsets: Array of annotation_line to be accessed by offset. + * @samples: Hash map of sym_hist_entry. Keyed by event index and offset in symbol. + * @nr_entries: Number of annotated_line in the source list. + * @nr_asm_entries: Number of annotated_line with actual asm instruction in the + * source list. + * @max_line_len: Maximum length of objdump output in an annotated_line. * - * lines is allocated, percentages calculated and all sorted by percentage + * disasm_lines are allocated, percentages calculated and all sorted by percentage * when the annotation is about to be presented, so the percentages are for * one of the entries in the histogram array, i.e. for the event/counter being * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate @@ -277,15 +314,33 @@ struct cyc_hist { */ struct annotated_source { struct list_head source; - size_t sizeof_sym_hist; struct sym_hist *histograms; struct annotation_line **offsets; + struct hashmap *samples; int nr_histograms; int nr_entries; int nr_asm_entries; u16 max_line_len; }; +/** + * struct annotated_branch - basic block and IPC information for a symbol. + * + * @hit_cycles: Total executed cycles. + * @hit_insn: Total number of instructions executed. + * @total_insn: Number of instructions in the function. + * @cover_insn: Number of distinct, actually executed instructions. + * @cycles_hist: Array of cyc_hist for each instruction. + * @max_coverage: Maximum number of covered basic block (used for block-range). + * + * This struct is used by two different codes when the sample has branch stack + * and cycles information. annotation__compute_ipc() calculates average IPC + * using @hit_insn / @hit_cycles. The actual coverage can be calculated using + * @cover_insn / @total_insn. The @cycles_hist can give IPC for each (longest) + * basic block ends at the given address. + * process_basic_block() calculates coverage of instructions (or basic blocks) + * in the function. + */ struct annotated_branch { u64 hit_cycles; u64 hit_insn; @@ -346,7 +401,7 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m static inline struct sym_hist *annotated_source__histogram(struct annotated_source *src, int idx) { - return ((void *)src->histograms) + (src->sizeof_sym_hist * idx); + return &src->histograms[idx]; } static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx) @@ -354,6 +409,17 @@ static inline struct sym_hist *annotation__histogram(struct annotation *notes, i return annotated_source__histogram(notes->src, idx); } +static inline struct sym_hist_entry * +annotated_source__hist_entry(struct annotated_source *src, int idx, u64 offset) +{ + struct sym_hist_entry *entry; + long key = offset << 16 | idx; + + if (!hashmap__find(src->samples, key, &entry)) + return NULL; + return entry; +} + static inline struct annotation *symbol__annotation(struct symbol *sym) { return (void *)sym - symbol_conf.priv_size; @@ -442,14 +508,18 @@ int annotate_check_args(void); /** * struct annotated_op_loc - Location info of instruction operand - * @reg: Register in the operand + * @reg1: First register in the operand + * @reg2: Second register in the operand * @offset: Memory access offset in the operand * @mem_ref: Whether the operand accesses memory + * @multi_regs: Whether the second register is used */ struct annotated_op_loc { - int reg; + int reg1; + int reg2; int offset; bool mem_ref; + bool multi_regs; }; enum annotated_insn_ops { @@ -487,4 +557,8 @@ struct annotated_item_stat { }; extern struct list_head ann_insn_stat; +/* Calculate PC-relative address */ +u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, + struct disasm_line *dl); + #endif /* __PERF_ANNOTATE_H */ |