Hello,
This is another series of memory optimization in perf annotate.
v2 changes:
* fix a bug when offset is bigger than 16 bits
When perf annotate (or perf report/top with TUI) processes samples, it
needs to save the sample period (overhead) at instruction level. For
now, it allocates an array to do that for the whole symbol when it
hits any new symbol. This comes with a lot of waste since samples can
be very few and instructions span to multiple bytes.
For example, when a sample hits symbol 'foo' that has size of 100 and
that's the only sample falls into the symbol. Then it needs to
allocate a symbol histogram (sym_hist) and the its size would be
16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
But actually it just needs 32 (header + sym_hist_entry) bytes. Things
get worse if the symbol size is bigger (and it doesn't have many
samples in different places). Also note that it needs a separate
histogram for each event.
Let's split the sym_hist_entry and have it in a hash table so that it
can allocate only necessary entries.
No functional change intended.
Thanks,
Namhyung
Namhyung Kim (4):
perf annotate: Add a hashmap for symbol histogram
perf annotate: Calculate instruction overhead using hashmap
perf annotate: Remove sym_hist.addr[] array
perf annotate: Add comments in the data structures
tools/perf/ui/gtk/annotate.c | 14 ++++-
tools/perf/util/annotate.c | 116 ++++++++++++++++++++++-------------
tools/perf/util/annotate.h | 86 +++++++++++++++++++++++---
3 files changed, 159 insertions(+), 57 deletions(-)
--
2.44.0.rc1.240.g4c46232300-goog
It's not used anymore and the code is coverted to use a hash map. Now
sym_hist has a static size, so no need to have sizeof_sym_hist in the
struct annotated_source.
Reviewed-by: Ian Rogers <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate.c | 36 +++++-------------------------------
tools/perf/util/annotate.h | 4 +---
2 files changed, 6 insertions(+), 34 deletions(-)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 1451699d931d..ac002d907d81 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -896,33 +896,10 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src
}
static int annotated_source__alloc_histograms(struct annotated_source *src,
- size_t size, int nr_hists)
+ int nr_hists)
{
- size_t sizeof_sym_hist;
-
- /*
- * Add buffer of one element for zero length symbol.
- * When sample is taken from first instruction of
- * zero length symbol, perf still resolves it and
- * shows symbol name in perf report and allows to
- * annotate it.
- */
- if (size == 0)
- size = 1;
-
- /* Check for overflow when calculating sizeof_sym_hist */
- if (size > (SIZE_MAX - sizeof(struct sym_hist)) / sizeof(struct sym_hist_entry))
- return -1;
-
- sizeof_sym_hist = (sizeof(struct sym_hist) + size * sizeof(struct sym_hist_entry));
-
- /* Check for overflow in zalloc argument */
- if (sizeof_sym_hist > SIZE_MAX / nr_hists)
- return -1;
-
- src->sizeof_sym_hist = sizeof_sym_hist;
src->nr_histograms = nr_hists;
- src->histograms = calloc(nr_hists, sizeof_sym_hist) ;
+ src->histograms = calloc(nr_hists, sizeof(*src->histograms));
if (src->histograms == NULL)
return -1;
@@ -941,7 +918,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym)
annotation__lock(notes);
if (notes->src != NULL) {
memset(notes->src->histograms, 0,
- notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+ notes->src->nr_histograms * sizeof(*notes->src->histograms));
hashmap__clear(notes->src->samples);
}
if (notes->branch && notes->branch->cycles_hist) {
@@ -1039,9 +1016,7 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
}
h->nr_samples++;
- h->addr[offset].nr_samples++;
h->period += sample->period;
- h->addr[offset].period += sample->period;
entry->nr_samples++;
entry->period += sample->period;
@@ -1094,8 +1069,7 @@ struct annotated_source *symbol__hists(struct symbol *sym, int nr_hists)
if (notes->src->histograms == NULL) {
alloc_histograms:
- annotated_source__alloc_histograms(notes->src, symbol__size(sym),
- nr_hists);
+ annotated_source__alloc_histograms(notes->src, nr_hists);
}
return notes->src;
@@ -2854,7 +2828,7 @@ void symbol__annotate_zero_histogram(struct symbol *sym, int evidx)
struct annotation *notes = symbol__annotation(sym);
struct sym_hist *h = annotation__histogram(notes, evidx);
- memset(h, 0, notes->src->sizeof_sym_hist);
+ memset(h, 0, sizeof(*notes->src->histograms) * notes->src->nr_histograms);
}
void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 3362980a5d3d..4bdc70a9d376 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -242,7 +242,6 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel);
struct sym_hist {
u64 nr_samples;
u64 period;
- struct sym_hist_entry addr[];
};
struct cyc_hist {
@@ -278,7 +277,6 @@ struct cyc_hist {
*/
struct annotated_source {
struct list_head source;
- size_t sizeof_sym_hist;
struct sym_hist *histograms;
struct annotation_line **offsets;
struct hashmap *samples;
@@ -348,7 +346,7 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m
static inline struct sym_hist *annotated_source__histogram(struct annotated_source *src, int idx)
{
- return ((void *)src->histograms) + (src->sizeof_sym_hist * idx);
+ return &src->histograms[idx];
}
static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx)
--
2.44.0.rc1.240.g4c46232300-goog
Reviewed-by: Ian Rogers <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate.h | 69 ++++++++++++++++++++++++++++++++++----
1 file changed, 62 insertions(+), 7 deletions(-)
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 4bdc70a9d376..13cc659e508c 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -239,11 +239,42 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r
size_t disasm__fprintf(struct list_head *head, FILE *fp);
void symbol__calc_percent(struct symbol *sym, struct evsel *evsel);
+/**
+ * struct sym_hist - symbol histogram information for an event
+ *
+ * @nr_samples: Total number of samples.
+ * @period: Sum of sample periods.
+ */
struct sym_hist {
u64 nr_samples;
u64 period;
};
+/**
+ * struct cyc_hist - (CPU) cycle histogram for a basic block
+ *
+ * @start: Start address of current block (if known).
+ * @cycles: Sum of cycles for the longest basic block.
+ * @cycles_aggr: Total cycles for this address.
+ * @cycles_max: Max cycles for this address.
+ * @cycles_min: Min cycles for this address.
+ * @cycles_spark: History of cycles for the longest basic block.
+ * @num: Number of samples for the longest basic block.
+ * @num_aggr: Total number of samples for this address.
+ * @have_start: Whether the current branch info has a start address.
+ * @reset: Number of resets due to a different start address.
+ *
+ * If sample has branch_stack and cycles info, it can construct basic blocks
+ * between two adjacent branches. It'd have start and end addresses but
+ * sometimes the start address may not be available. So the cycles are
+ * accounted at the end address. If multiple basic blocks end at the same
+ * address, it will take the longest one.
+ *
+ * The @start, @cycles, @cycles_spark and @num fields are used for the longest
+ * block only. Other fields are used for all cases.
+ *
+ * See __symbol__account_cycles().
+ */
struct cyc_hist {
u64 start;
u64 cycles;
@@ -258,18 +289,24 @@ struct cyc_hist {
u16 reset;
};
-/** struct annotated_source - symbols with hits have this attached as in sannotation
+/**
+ * struct annotated_source - symbols with hits have this attached as in annotation
*
- * @histograms: Array of addr hit histograms per event being monitored
- * nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if
+ * @source: List head for annotated_line (embeded in disasm_line).
+ * @histograms: Array of symbol histograms per event to maintain the total number
+ * of samples and period.
+ * @nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if
* we have more than a group in a evlist, where we will want
* to see each group separately, that is why symbol__annotate2()
* sets src->nr_histograms to evsel->nr_members.
- * @lines: If 'print_lines' is specified, per source code line percentages
- * @source: source parsed from a disassembler like objdump -dS
- * @cyc_hist: Average cycles per basic block
+ * @offsets: Array of annotation_line to be accessed by offset.
+ * @samples: Hash map of sym_hist_entry. Keyed by event index and offset in symbol.
+ * @nr_entries: Number of annotated_line in the source list.
+ * @nr_asm_entries: Number of annotated_line with actual asm instruction in the
+ * source list.
+ * @max_line_len: Maximum length of objdump output in an annotated_line.
*
- * lines is allocated, percentages calculated and all sorted by percentage
+ * disasm_lines are allocated, percentages calculated and all sorted by percentage
* when the annotation is about to be presented, so the percentages are for
* one of the entries in the histogram array, i.e. for the event/counter being
* presented. It is deallocated right after symbol__{tui,tty,etc}_annotate
@@ -286,6 +323,24 @@ struct annotated_source {
u16 max_line_len;
};
+/**
+ * struct annotated_branch - basic block and IPC information for a symbol.
+ *
+ * @hit_cycles: Total executed cycles.
+ * @hit_insn: Total number of instructions executed.
+ * @total_insn: Number of instructions in the function.
+ * @cover_insn: Number of distinct, actually executed instructions.
+ * @cycles_hist: Array of cyc_hist for each instruction.
+ * @max_coverage: Maximum number of covered basic block (used for block-range).
+ *
+ * This struct is used by two different codes when the sample has branch stack
+ * and cycles information. annotation__compute_ipc() calculates average IPC
+ * using @hit_insn / @hit_cycles. The actual coverage can be calculated using
+ * @cover_insn / @total_insn. The @cycles_hist can give IPC for each (longest)
+ * basic block ends at the given address.
+ * process_basic_block() calculates coverage of instructions (or basic blocks)
+ * in the function.
+ */
struct annotated_branch {
u64 hit_cycles;
u64 hit_insn;
--
2.44.0.rc1.240.g4c46232300-goog
Now symbol histogram uses an array to save per-offset sample counts.
But it wastes a lot of memory if the symbol has a few samples only.
Add a hashmap to save values only for actual samples.
For now, it has duplicate histogram (one in the existing array and
another in the new hash map). Once it can convert to use the hash
in all places, we can get rid of the array later.
Reviewed-by: Ian Rogers <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate.c | 42 ++++++++++++++++++++++++++++++++++++--
tools/perf/util/annotate.h | 2 ++
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 107b264fa41e..caaea9421235 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -38,6 +38,7 @@
#include "arch/common.h"
#include "namespaces.h"
#include "thread.h"
+#include "hashmap.h"
#include <regex.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
@@ -863,6 +864,17 @@ bool arch__is(struct arch *arch, const char *name)
return !strcmp(arch->name, name);
}
+/* symbol histogram: key = offset << 16 | evsel->core.idx */
+static size_t sym_hist_hash(long key, void *ctx __maybe_unused)
+{
+ return (key >> 16) + (key & 0xffff);
+}
+
+static bool sym_hist_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+ return key1 == key2;
+}
+
static struct annotated_source *annotated_source__new(void)
{
struct annotated_source *src = zalloc(sizeof(*src));
@@ -877,6 +889,8 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src
{
if (src == NULL)
return;
+
+ hashmap__free(src->samples);
zfree(&src->histograms);
free(src);
}
@@ -909,6 +923,14 @@ static int annotated_source__alloc_histograms(struct annotated_source *src,
src->sizeof_sym_hist = sizeof_sym_hist;
src->nr_histograms = nr_hists;
src->histograms = calloc(nr_hists, sizeof_sym_hist) ;
+
+ if (src->histograms == NULL)
+ return -1;
+
+ src->samples = hashmap__new(sym_hist_hash, sym_hist_equal, NULL);
+ if (src->samples == NULL)
+ zfree(&src->histograms);
+
return src->histograms ? 0 : -1;
}
@@ -920,6 +942,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym)
if (notes->src != NULL) {
memset(notes->src->histograms, 0,
notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+ hashmap__clear(notes->src->samples);
}
if (notes->branch && notes->branch->cycles_hist) {
memset(notes->branch->cycles_hist, 0,
@@ -983,8 +1006,10 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
struct perf_sample *sample)
{
struct symbol *sym = ms->sym;
- unsigned offset;
+ long hash_key;
+ u64 offset;
struct sym_hist *h;
+ struct sym_hist_entry *entry;
pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map__unmap_ip(ms->map, addr));
@@ -1002,15 +1027,28 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
__func__, __LINE__, sym->name, sym->start, addr, sym->end, sym->type == STT_FUNC);
return -ENOMEM;
}
+
+ hash_key = offset << 16 | evidx;
+ if (!hashmap__find(src->samples, hash_key, &entry)) {
+ entry = zalloc(sizeof(*entry));
+ if (entry == NULL)
+ return -ENOMEM;
+
+ if (hashmap__add(src->samples, hash_key, entry) < 0)
+ return -ENOMEM;
+ }
+
h->nr_samples++;
h->addr[offset].nr_samples++;
h->period += sample->period;
h->addr[offset].period += sample->period;
+ entry->nr_samples++;
+ entry->period += sample->period;
pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
", evidx=%d] => nr_samples: %" PRIu64 ", period: %" PRIu64 "\n",
sym->start, sym->name, addr, addr - sym->start, evidx,
- h->addr[offset].nr_samples, h->addr[offset].period);
+ entry->nr_samples, entry->period);
return 0;
}
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 94435607c958..a2b0c8210740 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -12,6 +12,7 @@
#include "symbol_conf.h"
#include "mutex.h"
#include "spark.h"
+#include "hashmap.h"
struct hist_browser_timer;
struct hist_entry;
@@ -280,6 +281,7 @@ struct annotated_source {
size_t sizeof_sym_hist;
struct sym_hist *histograms;
struct annotation_line **offsets;
+ struct hashmap *samples;
int nr_histograms;
int nr_entries;
int nr_asm_entries;
--
2.44.0.rc1.240.g4c46232300-goog
Use annotated_source.samples hashmap instead of addr array in the
struct sym_hist.
Reviewed-by: Ian Rogers <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/ui/gtk/annotate.c | 14 +++++++++---
tools/perf/util/annotate.c | 44 ++++++++++++++++++++++++------------
tools/perf/util/annotate.h | 11 +++++++++
3 files changed, 52 insertions(+), 17 deletions(-)
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index 394861245fd3..93ce3d47e47e 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -28,21 +28,29 @@ static const char *const col_names[] = {
static int perf_gtk__get_percent(char *buf, size_t size, struct symbol *sym,
struct disasm_line *dl, int evidx)
{
+ struct annotation *notes;
struct sym_hist *symhist;
+ struct sym_hist_entry *entry;
double percent = 0.0;
const char *markup;
int ret = 0;
+ u64 nr_samples = 0;
strcpy(buf, "");
if (dl->al.offset == (s64) -1)
return 0;
- symhist = annotation__histogram(symbol__annotation(sym), evidx);
- if (!symbol_conf.event_group && !symhist->addr[dl->al.offset].nr_samples)
+ notes = symbol__annotation(sym);
+ symhist = annotation__histogram(notes, evidx);
+ entry = annotated_source__hist_entry(notes->src, evidx, dl->al.offset);
+ if (entry)
+ nr_samples = entry->nr_samples;
+
+ if (!symbol_conf.event_group && nr_samples == 0)
return 0;
- percent = 100.0 * symhist->addr[dl->al.offset].nr_samples / symhist->nr_samples;
+ percent = 100.0 * nr_samples / symhist->nr_samples;
markup = perf_gtk__get_percent_color(percent);
if (markup)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index caaea9421235..1451699d931d 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -2368,17 +2368,25 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
return err;
}
-static void calc_percent(struct sym_hist *sym_hist,
- struct hists *hists,
+static void calc_percent(struct annotation *notes,
+ struct evsel *evsel,
struct annotation_data *data,
s64 offset, s64 end)
{
+ struct hists *hists = evsel__hists(evsel);
+ int evidx = evsel->core.idx;
+ struct sym_hist *sym_hist = annotation__histogram(notes, evidx);
unsigned int hits = 0;
u64 period = 0;
while (offset < end) {
- hits += sym_hist->addr[offset].nr_samples;
- period += sym_hist->addr[offset].period;
+ struct sym_hist_entry *entry;
+
+ entry = annotated_source__hist_entry(notes->src, evidx, offset);
+ if (entry) {
+ hits += entry->nr_samples;
+ period += entry->period;
+ }
++offset;
}
@@ -2415,16 +2423,13 @@ static void annotation__calc_percent(struct annotation *notes,
end = next ? next->offset : len;
for_each_group_evsel(evsel, leader) {
- struct hists *hists = evsel__hists(evsel);
struct annotation_data *data;
- struct sym_hist *sym_hist;
BUG_ON(i >= al->data_nr);
- sym_hist = annotation__histogram(notes, evsel->core.idx);
data = &al->data[i++];
- calc_percent(sym_hist, hists, data, al->offset, end);
+ calc_percent(notes, evsel, data, al->offset, end);
}
}
}
@@ -2619,14 +2624,19 @@ static void print_summary(struct rb_root *root, const char *filename)
static void symbol__annotate_hits(struct symbol *sym, struct evsel *evsel)
{
+ int evidx = evsel->core.idx;
struct annotation *notes = symbol__annotation(sym);
- struct sym_hist *h = annotation__histogram(notes, evsel->core.idx);
+ struct sym_hist *h = annotation__histogram(notes, evidx);
u64 len = symbol__size(sym), offset;
- for (offset = 0; offset < len; ++offset)
- if (h->addr[offset].nr_samples != 0)
+ for (offset = 0; offset < len; ++offset) {
+ struct sym_hist_entry *entry;
+
+ entry = annotated_source__hist_entry(notes->src, evidx, offset);
+ if (entry && entry->nr_samples != 0)
printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
- sym->start + offset, h->addr[offset].nr_samples);
+ sym->start + offset, entry->nr_samples);
+ }
printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->nr_samples", h->nr_samples);
}
@@ -2855,8 +2865,14 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
h->nr_samples = 0;
for (offset = 0; offset < len; ++offset) {
- h->addr[offset].nr_samples = h->addr[offset].nr_samples * 7 / 8;
- h->nr_samples += h->addr[offset].nr_samples;
+ struct sym_hist_entry *entry;
+
+ entry = annotated_source__hist_entry(notes->src, evidx, offset);
+ if (entry == NULL)
+ continue;
+
+ entry->nr_samples = entry->nr_samples * 7 / 8;
+ h->nr_samples += entry->nr_samples;
}
}
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index a2b0c8210740..3362980a5d3d 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -356,6 +356,17 @@ static inline struct sym_hist *annotation__histogram(struct annotation *notes, i
return annotated_source__histogram(notes->src, idx);
}
+static inline struct sym_hist_entry *
+annotated_source__hist_entry(struct annotated_source *src, int idx, u64 offset)
+{
+ struct sym_hist_entry *entry;
+ long key = offset << 16 | idx;
+
+ if (!hashmap__find(src->samples, key, &entry))
+ return NULL;
+ return entry;
+}
+
static inline struct annotation *symbol__annotation(struct symbol *sym)
{
return (void *)sym - symbol_conf.priv_size;
--
2.44.0.rc1.240.g4c46232300-goog
Hi Arnaldo,
On Mon, Mar 4, 2024 at 3:08 PM Namhyung Kim <[email protected]> wrote:
>
> Hello,
>
> This is another series of memory optimization in perf annotate.
>
> v2 changes:
> * fix a bug when offset is bigger than 16 bits
Are you ok with this now?
Thanks,
Namhyung
>
>
> When perf annotate (or perf report/top with TUI) processes samples, it
> needs to save the sample period (overhead) at instruction level. For
> now, it allocates an array to do that for the whole symbol when it
> hits any new symbol. This comes with a lot of waste since samples can
> be very few and instructions span to multiple bytes.
>
> For example, when a sample hits symbol 'foo' that has size of 100 and
> that's the only sample falls into the symbol. Then it needs to
> allocate a symbol histogram (sym_hist) and the its size would be
>
> 16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
>
> But actually it just needs 32 (header + sym_hist_entry) bytes. Things
> get worse if the symbol size is bigger (and it doesn't have many
> samples in different places). Also note that it needs a separate
> histogram for each event.
>
> Let's split the sym_hist_entry and have it in a hash table so that it
> can allocate only necessary entries.
>
> No functional change intended.
>
> Thanks,
> Namhyung
>
>
> Namhyung Kim (4):
> perf annotate: Add a hashmap for symbol histogram
> perf annotate: Calculate instruction overhead using hashmap
> perf annotate: Remove sym_hist.addr[] array
> perf annotate: Add comments in the data structures
>
> tools/perf/ui/gtk/annotate.c | 14 ++++-
> tools/perf/util/annotate.c | 116 ++++++++++++++++++++++-------------
> tools/perf/util/annotate.h | 86 +++++++++++++++++++++++---
> 3 files changed, 159 insertions(+), 57 deletions(-)
>
> --
> 2.44.0.rc1.240.g4c46232300-goog
>
>
On Wed, Mar 06, 2024 at 10:26:24AM -0800, Namhyung Kim wrote:
> Hi Arnaldo,
>
> On Mon, Mar 4, 2024 at 3:08 PM Namhyung Kim <[email protected]> wrote:
> >
> > Hello,
> >
> > This is another series of memory optimization in perf annotate.
> >
> > v2 changes:
> > * fix a bug when offset is bigger than 16 bits
>
> Are you ok with this now?
I'll try to check soon, thanks for checking out.
- Arnaldo
On Mon, Mar 04, 2024 at 03:08:11PM -0800, Namhyung Kim wrote:
> Hello,
>
> This is another series of memory optimization in perf annotate.
>
> v2 changes:
> * fix a bug when offset is bigger than 16 bits
>
>
> When perf annotate (or perf report/top with TUI) processes samples, it
> needs to save the sample period (overhead) at instruction level. For
> now, it allocates an array to do that for the whole symbol when it
> hits any new symbol. This comes with a lot of waste since samples can
> be very few and instructions span to multiple bytes.
>
> For example, when a sample hits symbol 'foo' that has size of 100 and
> that's the only sample falls into the symbol. Then it needs to
> allocate a symbol histogram (sym_hist) and the its size would be
>
> 16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
>
> But actually it just needs 32 (header + sym_hist_entry) bytes. Things
> get worse if the symbol size is bigger (and it doesn't have many
> samples in different places). Also note that it needs a separate
> histogram for each event.
>
> Let's split the sym_hist_entry and have it in a hash table so that it
> can allocate only necessary entries.
>
> No functional change intended.
>
> Thanks,
> Namhyung
No difference before/after on that 'perf annotate --stdio2' for all
binaries in a perf record of building perf using the default binutils
objdump disassembler, etc.
Reviewed-by: Arnaldo Carvalho de Melo <[email protected]>
Tested-by: Arnaldo Carvalho de Melo <[email protected]>
- Arnaldo
>
> Namhyung Kim (4):
> perf annotate: Add a hashmap for symbol histogram
> perf annotate: Calculate instruction overhead using hashmap
> perf annotate: Remove sym_hist.addr[] array
> perf annotate: Add comments in the data structures
>
> tools/perf/ui/gtk/annotate.c | 14 ++++-
> tools/perf/util/annotate.c | 116 ++++++++++++++++++++++-------------
> tools/perf/util/annotate.h | 86 +++++++++++++++++++++++---
> 3 files changed, 159 insertions(+), 57 deletions(-)
>
> --
> 2.44.0.rc1.240.g4c46232300-goog
On Wed, Mar 6, 2024 at 2:22 PM Arnaldo Carvalho de Melo <[email protected]> wrote:
>
> On Mon, Mar 04, 2024 at 03:08:11PM -0800, Namhyung Kim wrote:
> > Hello,
> >
> > This is another series of memory optimization in perf annotate.
> >
> > v2 changes:
> > * fix a bug when offset is bigger than 16 bits
> >
> >
> > When perf annotate (or perf report/top with TUI) processes samples, it
> > needs to save the sample period (overhead) at instruction level. For
> > now, it allocates an array to do that for the whole symbol when it
> > hits any new symbol. This comes with a lot of waste since samples can
> > be very few and instructions span to multiple bytes.
> >
> > For example, when a sample hits symbol 'foo' that has size of 100 and
> > that's the only sample falls into the symbol. Then it needs to
> > allocate a symbol histogram (sym_hist) and the its size would be
> >
> > 16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
> >
> > But actually it just needs 32 (header + sym_hist_entry) bytes. Things
> > get worse if the symbol size is bigger (and it doesn't have many
> > samples in different places). Also note that it needs a separate
> > histogram for each event.
> >
> > Let's split the sym_hist_entry and have it in a hash table so that it
> > can allocate only necessary entries.
> >
> > No functional change intended.
> >
> > Thanks,
> > Namhyung
>
> No difference before/after on that 'perf annotate --stdio2' for all
> binaries in a perf record of building perf using the default binutils
> objdump disassembler, etc.
>
> Reviewed-by: Arnaldo Carvalho de Melo <[email protected]>
> Tested-by: Arnaldo Carvalho de Melo <[email protected]>
Great, thanks for testing.
Namhyung
On Mon, 4 Mar 2024 15:08:11 -0800, Namhyung Kim wrote:
> This is another series of memory optimization in perf annotate.
>
> v2 changes:
> * fix a bug when offset is bigger than 16 bits
>
>
> When perf annotate (or perf report/top with TUI) processes samples, it
> needs to save the sample period (overhead) at instruction level. For
> now, it allocates an array to do that for the whole symbol when it
> hits any new symbol. This comes with a lot of waste since samples can
> be very few and instructions span to multiple bytes.
>
> [...]
Applied to perf-tools-next, thanks!
Best regards,
--
Namhyung Kim <[email protected]>