Hello,
This is another series of memory optimization in perf annotate.
When perf annotate (or perf report/top with TUI) processes samples, it
needs to save the sample period (overhead) at instruction level. For
now, it allocates an array to do that for the whole symbol when it
hits any new symbol. This comes with a lot of waste since samples can
be very few and instructions span to multiple bytes.
For example, when a sample hits symbol 'foo' that has size of 100 and
that's the only sample that falls into the symbol. Then it needs to
allocate a symbol histogram (sym_hist) and its size would be
16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
But actually it just needs 32 (header + sym_hist_entry) bytes. Things
get worse if the symbol size is bigger (and it doesn't have many
samples in different places). Also note that it needs separate
histogram for each event.
Let's split the sym_hist_entry and have it in a hash table so that it
can allocate only necessary entries.
No functional change intended.
Thanks,
Namhyung
Namhyung Kim (4):
perf annotate: Add a hashmap for symbol histogram
perf annotate: Calculate instruction overhead using hashmap
perf annotate: Remove sym_hist.addr[] array
perf annotate: Add comments in the data structures
tools/perf/ui/gtk/annotate.c | 14 ++++-
tools/perf/util/annotate.c | 114 ++++++++++++++++++++++-------------
tools/perf/util/annotate.h | 86 +++++++++++++++++++++++---
3 files changed, 158 insertions(+), 56 deletions(-)
--
2.44.0.rc1.240.g4c46232300-goog
Now symbol histogram uses an array to save per-offset sample counts.
But it wastes a lot of memory if the symbol has a few samples only.
Add a hashmap to save values only for actual samples.
For now, it keeps a duplicate histogram (one in the existing array and
another in the new hash map). Once all users are converted to the hash
map, we can get rid of the array later.
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate.c | 40 +++++++++++++++++++++++++++++++++++++-
tools/perf/util/annotate.h | 2 ++
2 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 107b264fa41e..7a70e4d35c9b 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -38,6 +38,7 @@
#include "arch/common.h"
#include "namespaces.h"
#include "thread.h"
+#include "hashmap.h"
#include <regex.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
@@ -863,6 +864,17 @@ bool arch__is(struct arch *arch, const char *name)
return !strcmp(arch->name, name);
}
+/* symbol histogram: key = offset << 16 | evsel->core.idx */
+static size_t sym_hist_hash(long key, void *ctx __maybe_unused)
+{
+ return (key >> 16) + (key & 0xffff);
+}
+
+static bool sym_hist_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+ return key1 == key2;
+}
+
static struct annotated_source *annotated_source__new(void)
{
struct annotated_source *src = zalloc(sizeof(*src));
@@ -877,6 +889,8 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src
{
if (src == NULL)
return;
+
+ hashmap__free(src->samples);
zfree(&src->histograms);
free(src);
}
@@ -909,6 +923,14 @@ static int annotated_source__alloc_histograms(struct annotated_source *src,
src->sizeof_sym_hist = sizeof_sym_hist;
src->nr_histograms = nr_hists;
src->histograms = calloc(nr_hists, sizeof_sym_hist);
+
+ if (src->histograms == NULL)
+ return -1;
+
+ src->samples = hashmap__new(sym_hist_hash, sym_hist_equal, NULL);
+ if (src->samples == NULL)
+ zfree(&src->histograms);
+
return src->histograms ? 0 : -1;
}
@@ -920,6 +942,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym)
if (notes->src != NULL) {
memset(notes->src->histograms, 0,
notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+ hashmap__clear(notes->src->samples);
}
if (notes->branch && notes->branch->cycles_hist) {
memset(notes->branch->cycles_hist, 0,
@@ -983,8 +1006,10 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
struct perf_sample *sample)
{
struct symbol *sym = ms->sym;
+ long hash_key;
unsigned offset;
struct sym_hist *h;
+ struct sym_hist_entry *entry;
pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map__unmap_ip(ms->map, addr));
@@ -1002,15 +1027,28 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
__func__, __LINE__, sym->name, sym->start, addr, sym->end, sym->type == STT_FUNC);
return -ENOMEM;
}
+
+ hash_key = offset << 16 | evidx;
+ if (!hashmap__find(src->samples, hash_key, &entry)) {
+ entry = zalloc(sizeof(*entry));
+ if (entry == NULL)
+ return -ENOMEM;
+
+ if (hashmap__add(src->samples, hash_key, entry) < 0)
+ return -ENOMEM;
+ }
+
h->nr_samples++;
h->addr[offset].nr_samples++;
h->period += sample->period;
h->addr[offset].period += sample->period;
+ entry->nr_samples++;
+ entry->period += sample->period;
pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
", evidx=%d] => nr_samples: %" PRIu64 ", period: %" PRIu64 "\n",
sym->start, sym->name, addr, addr - sym->start, evidx,
- h->addr[offset].nr_samples, h->addr[offset].period);
+ entry->nr_samples, entry->period);
return 0;
}
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 94435607c958..a2b0c8210740 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -12,6 +12,7 @@
#include "symbol_conf.h"
#include "mutex.h"
#include "spark.h"
+#include "hashmap.h"
struct hist_browser_timer;
struct hist_entry;
@@ -280,6 +281,7 @@ struct annotated_source {
size_t sizeof_sym_hist;
struct sym_hist *histograms;
struct annotation_line **offsets;
+ struct hashmap *samples;
int nr_histograms;
int nr_entries;
int nr_asm_entries;
--
2.44.0.rc1.240.g4c46232300-goog
On Tue, Feb 27, 2024 at 4:52 PM Namhyung Kim <[email protected]> wrote:
>
> Hello,
>
> This is another series of memory optimization in perf annotate.
>
> When perf annotate (or perf report/top with TUI) processes samples, it
> needs to save the sample period (overhead) at instruction level. For
> now, it allocates an array to do that for the whole symbol when it
> hits any new symbol. This comes with a lot of waste since samples can
> be very few and instructions span to multiple bytes.
>
> For example, when a sample hits symbol 'foo' that has size of 100 and
> that's the only sample falls into the symbol. Then it needs to
> allocate a symbol histogram (sym_hist) and the its size would be
>
> 16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
>
> But actually it just needs 32 (header + sym_hist_entry) bytes. Things
> get worse if the symbol size is bigger (and it doesn't have many
> samples in different places). Also note that it needs separate
> histogram for each event.
>
> Let's split the sym_hist_entry and have it in a hash table so that it
> can allocate only necessary entries.
>
> No functional change intended.
>
> Thanks,
> Namhyung
>
>
> Namhyung Kim (4):
> perf annotate: Add a hashmap for symbol histogram
> perf annotate: Calculate instruction overhead using hashmap
> perf annotate: Remove sym_hist.addr[] array
> perf annotate: Add comments in the data structures
Reviewed-by: Ian Rogers <[email protected]>
Thanks,
Ian
> tools/perf/ui/gtk/annotate.c | 14 ++++-
> tools/perf/util/annotate.c | 114 ++++++++++++++++++++++-------------
> tools/perf/util/annotate.h | 86 +++++++++++++++++++++++---
> 3 files changed, 158 insertions(+), 56 deletions(-)
>
> --
> 2.44.0.rc1.240.g4c46232300-goog
>
On Tue, Feb 27, 2024 at 04:52:26PM -0800, Namhyung Kim wrote:
> This is another series of memory optimization in perf annotate.
> When perf annotate (or perf report/top with TUI) processes samples, it
> needs to save the sample period (overhead) at instruction level. For
> now, it allocates an array to do that for the whole symbol when it
> hits any new symbol. This comes with a lot of waste since samples can
> be very few and instructions span to multiple bytes.
> For example, when a sample hits symbol 'foo' that has size of 100 and
> that's the only sample falls into the symbol. Then it needs to
> allocate a symbol histogram (sym_hist) and the its size would be
> 16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
> But actually it just needs 32 (header + sym_hist_entry) bytes. Things
> get worse if the symbol size is bigger (and it doesn't have many
> samples in different places). Also note that it needs separate
> histogram for each event.
> Let's split the sym_hist_entry and have it in a hash table so that it
> can allocate only necessary entries.
> No functional change intended.
I tried this before/after this series:
$ time perf annotate --stdio2 -i perf.data.annotate
For:
perf record -e '{cycles,instructions,cache-misses}' make -k CORESIGHT=1 O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin
And found these odd cases:
$ diff -u before after
--- before 2024-02-28 15:38:25.086062812 -0300
+++ after 2024-02-29 14:12:05.606652725 -0300
@@ -2450826,7 +2450826,7 @@
↓ je 1c62
→ call operator delete(void*)@plt
{ return _M_dataplus._M_p; }
- 1c62: mov 0x13c0(%rsp),%rdi
+ 0.00 0.00 100.00 1c62: mov 0x13c0(%rsp),%rdi
if (_M_data() == _M_local_data())
lea 0x13d0(%rsp),%rax
cmp %rax,%rdi
@@ -2470648,7 +2470648,7 @@
mov %rbx,%rdi
→ call operator delete(void*)@plt
using reference = T &;
- 0.00 0.00 100.00 11c65: mov 0x8(%r12),%rax
+ 11c65: mov 0x8(%r12),%rax
size_t size() const { return Size; }
mov 0x10(%r12),%ecx
mov %rax,%rbp
$
This is a large function:
Samples: 574K of events 'anon group { cpu_core/cycles/u, cpu_core/instructions/u, cpu_core/cache-misses/u }', 4000 Hz, Event count (approx.): 614695751751, [percent: local period]$
clang::CompilerInvocation::ParseCodeGenArgs(clang::CodeGenOptions&, llvm::opt::ArgList&, clang::InputKind, clang::DiagnosticsEngine&, llvm::Triple const&, std::__cxx11::basic_string<char, std
Percent
Probably when building the BPF skels in tools/perf/
- Arnaldo
Hi Arnaldo,
On Mon, Mar 4, 2024 at 5:58 AM Arnaldo Carvalho de Melo <[email protected]> wrote:
>
> On Tue, Feb 27, 2024 at 04:52:26PM -0800, Namhyung Kim wrote:
> > This is another series of memory optimization in perf annotate.
>
> > When perf annotate (or perf report/top with TUI) processes samples, it
> > needs to save the sample period (overhead) at instruction level. For
> > now, it allocates an array to do that for the whole symbol when it
> > hits any new symbol. This comes with a lot of waste since samples can
> > be very few and instructions span to multiple bytes.
>
> > For example, when a sample hits symbol 'foo' that has size of 100 and
> > that's the only sample falls into the symbol. Then it needs to
> > allocate a symbol histogram (sym_hist) and the its size would be
>
> > 16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
>
> > But actually it just needs 32 (header + sym_hist_entry) bytes. Things
> > get worse if the symbol size is bigger (and it doesn't have many
> > samples in different places). Also note that it needs separate
> > histogram for each event.
>
> > Let's split the sym_hist_entry and have it in a hash table so that it
> > can allocate only necessary entries.
>
> > No functional change intended.
>
> I tried this before/after this series:
>
> $ time perf annotate --stdio2 -i perf.data.annotate
>
> For:
>
> perf record -e '{cycles,instructions,cache-misses}' make -k CORESIGHT=1 O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin
>
> And found these odd cases:
>
> $ diff -u before after
> --- before 2024-02-28 15:38:25.086062812 -0300
> +++ after 2024-02-29 14:12:05.606652725 -0300
> @@ -2450826,7 +2450826,7 @@
> ↓ je 1c62
> → call operator delete(void*)@plt
> { return _M_dataplus._M_p; }
> - 1c62: mov 0x13c0(%rsp),%rdi
> + 0.00 0.00 100.00 1c62: mov 0x13c0(%rsp),%rdi
> if (_M_data() == _M_local_data())
> lea 0x13d0(%rsp),%rax
> cmp %rax,%rdi
> @@ -2470648,7 +2470648,7 @@
> mov %rbx,%rdi
> → call operator delete(void*)@plt
> using reference = T &;
> - 0.00 0.00 100.00 11c65: mov 0x8(%r12),%rax
> + 11c65: mov 0x8(%r12),%rax
> size_t size() const { return Size; }
> mov 0x10(%r12),%ecx
> mov %rax,%rbp
> $
>
>
> This is a large function:
Thanks for the test! I think it missed the cast to 64-bit somewhere.
I'll check and send v2 soon.
Thanks,
Namhyung
>
> Samples: 574K of events 'anon group { cpu_core/cycles/u, cpu_core/instructions/u, cpu_core/cache-misses/u }', 4000 Hz, Event count (approx.): 614695751751, [percent: local period]$
> clang::CompilerInvocation::ParseCodeGenArgs(clang::CodeGenOptions&, llvm::opt::ArgList&, clang::InputKind, clang::DiagnosticsEngine&, llvm::Triple const&, std::__cxx11::basic_string<char, std
> Percent
>
> Probably when building the BPF skels in tools/perf/
>
> - Arnaldo
On Mon, Mar 4, 2024 at 8:36 AM Namhyung Kim <[email protected]> wrote:
>
> Hi Arnaldo,
>
> On Mon, Mar 4, 2024 at 5:58 AM Arnaldo Carvalho de Melo <[email protected]> wrote:
> >
> > On Tue, Feb 27, 2024 at 04:52:26PM -0800, Namhyung Kim wrote:
> > > This is another series of memory optimization in perf annotate.
> >
> > > When perf annotate (or perf report/top with TUI) processes samples, it
> > > needs to save the sample period (overhead) at instruction level. For
> > > now, it allocates an array to do that for the whole symbol when it
> > > hits any new symbol. This comes with a lot of waste since samples can
> > > be very few and instructions span to multiple bytes.
> >
> > > For example, when a sample hits symbol 'foo' that has size of 100 and
> > > that's the only sample falls into the symbol. Then it needs to
> > > allocate a symbol histogram (sym_hist) and the its size would be
> >
> > > 16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
> >
> > > But actually it just needs 32 (header + sym_hist_entry) bytes. Things
> > > get worse if the symbol size is bigger (and it doesn't have many
> > > samples in different places). Also note that it needs separate
> > > histogram for each event.
> >
> > > Let's split the sym_hist_entry and have it in a hash table so that it
> > > can allocate only necessary entries.
> >
> > > No functional change intended.
> >
> > I tried this before/after this series:
> >
> > $ time perf annotate --stdio2 -i perf.data.annotate
> >
> > For:
> >
> > perf record -e '{cycles,instructions,cache-misses}' make -k CORESIGHT=1 O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin
> >
> > And found these odd cases:
> >
> > $ diff -u before after
> > --- before 2024-02-28 15:38:25.086062812 -0300
> > +++ after 2024-02-29 14:12:05.606652725 -0300
> > @@ -2450826,7 +2450826,7 @@
> > ↓ je 1c62
> > → call operator delete(void*)@plt
> > { return _M_dataplus._M_p; }
> > - 1c62: mov 0x13c0(%rsp),%rdi
> > + 0.00 0.00 100.00 1c62: mov 0x13c0(%rsp),%rdi
> > if (_M_data() == _M_local_data())
> > lea 0x13d0(%rsp),%rax
> > cmp %rax,%rdi
> > @@ -2470648,7 +2470648,7 @@
> > mov %rbx,%rdi
> > → call operator delete(void*)@plt
> > using reference = T &;
> > - 0.00 0.00 100.00 11c65: mov 0x8(%r12),%rax
> > + 11c65: mov 0x8(%r12),%rax
> > size_t size() const { return Size; }
> > mov 0x10(%r12),%ecx
> > mov %rax,%rbp
> > $
> >
> >
> > This is a large function:
>
> Thanks for the test! I think it missed the cast to 64-bit somewhere.
> I'll check and send v2 soon.
Yep, the offset variable in __symbol__inc_addr_samples()
should be u64.
Thanks,
Namhyung