2013-06-23 03:22:33

by Greg Price

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

For example, in an application with an expensive function
implemented with deeply nested recursive calls, the default
call-graph presentation is dominated by the different callchains
within that function. By treating the function as a black box,
we can collect the callchains leading into the function and
compactly identify what to blame for expensive calls.

For example, in this report the callers of garbage_collect() are
scattered across the tree:
$ perf report -d ruby 2>- | grep -m10 ^[^#]*[a-z]
22.03% ruby [.] gc_mark
--- gc_mark
|--59.40%-- mark_keyvalue
| st_foreach
| gc_mark_children
| |--99.75%-- rb_gc_mark
| | rb_vm_mark
| | gc_mark_children
| | gc_marks
| | |--99.00%-- garbage_collect

If we make garbage_collect() a black box, its callers are coalesced:
$ perf report --blackbox garbage_collect -d ruby 2>- | grep -m10 ^[^#]*[a-z]
72.92% ruby [.] garbage_collect
--- garbage_collect
vm_xmalloc
|--47.08%-- ruby_xmalloc
| st_insert2
| rb_hash_aset
| |--98.45%-- features_index_add
| | rb_provide_feature
| | rb_require_safe
| | vm_call_method

Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: David Ahern <[email protected]>
Signed-off-by: Greg Price <[email protected]>
---

On Fri, Jan 11, 2013 at 02:27:36AM -0300, Arnaldo Carvalho de Melo wrote:
> Looks like an interesting feature, will review this soon,

Rebased on top of v3.10-rc7, please take a look at your convenience.


tools/perf/builtin-report.c | 19 ++++++++++++++++---
tools/perf/builtin-top.c | 3 +--
tools/perf/util/machine.c | 26 +++++++++++++++++---------
tools/perf/util/machine.h | 9 ++++++++-
tools/perf/util/session.c | 3 +--
5 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index bd0ca81..bf0d860 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -83,7 +83,7 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain) &&
sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent);
+ sample, &parent, al);
if (err)
return err;
}
@@ -174,7 +174,7 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain)
&& sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent);
+ sample, &parent, al);
if (err)
return err;
}
@@ -245,7 +245,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,

if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent);
+ sample, &parent, al);
if (err)
return err;
}
@@ -764,6 +764,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
"Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt),
OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
"alias for inverted call graph"),
+ OPT_STRING(0, "blackbox", &blackbox_pattern, "regex",
+ "functions to treat as black boxes in call graphs, collapsing callees"),
OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
"only consider symbols in these dsos"),
OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
@@ -918,6 +920,17 @@ repeat:
} else
symbol_conf.exclude_other = false;

+ if (blackbox_pattern) {
+ int err = regcomp(&blackbox_regex, blackbox_pattern, REG_EXTENDED);
+ if (err) {
+ char buf[BUFSIZ];
+ regerror(err, &blackbox_regex, buf, sizeof(buf));
+ pr_err("Invalid blackbox regex: %s\n%s", blackbox_pattern, buf);
+ goto error;
+ }
+ have_blackbox = 1;
+ }
+
if (argc) {
/*
* Special case: if there's an argument left then assume that
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 67bdb9f..abec83d 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -775,8 +775,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
sample->callchain) {
err = machine__resolve_callchain(machine, evsel,
al.thread, sample,
- &parent);
-
+ &parent, NULL);
if (err)
return;
}
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b2ecad6..a14489c 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -11,6 +11,10 @@
#include <stdbool.h>
#include "unwind.h"

+regex_t blackbox_regex;
+const char *blackbox_pattern;
+int have_blackbox = 0;
+
int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
{
map_groups__init(&machine->kmaps);
@@ -1058,11 +1062,10 @@ int machine__process_event(struct machine *machine, union perf_event *event)
return ret;
}

-static bool symbol__match_parent_regex(struct symbol *sym)
+static bool symbol__match_regex(struct symbol *sym, regex_t *regex)
{
- if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0))
+ if (sym->name && !regexec(regex, sym->name, 0, NULL, 0))
return 1;
-
return 0;
}

@@ -1159,8 +1162,8 @@ struct branch_info *machine__resolve_bstack(struct machine *machine,
static int machine__resolve_callchain_sample(struct machine *machine,
struct thread *thread,
struct ip_callchain *chain,
- struct symbol **parent)
-
+ struct symbol **parent,
+ struct addr_location *root_al)
{
u8 cpumode = PERF_RECORD_MISC_USER;
unsigned int i;
@@ -1211,8 +1214,13 @@ static int machine__resolve_callchain_sample(struct machine *machine,
MAP__FUNCTION, ip, &al, NULL);
if (al.sym != NULL) {
if (sort__has_parent && !*parent &&
- symbol__match_parent_regex(al.sym))
+ symbol__match_regex(al.sym, &parent_regex))
*parent = al.sym;
+ else if (have_blackbox && root_al &&
+ symbol__match_regex(al.sym, &blackbox_regex)) {
+ *root_al = al;
+ callchain_cursor_reset(&callchain_cursor);
+ }
if (!symbol_conf.use_callchain)
break;
}
@@ -1237,15 +1245,15 @@ int machine__resolve_callchain(struct machine *machine,
struct perf_evsel *evsel,
struct thread *thread,
struct perf_sample *sample,
- struct symbol **parent)
-
+ struct symbol **parent,
+ struct addr_location *root_al)
{
int ret;

callchain_cursor_reset(&callchain_cursor);

ret = machine__resolve_callchain_sample(machine, thread,
- sample->callchain, parent);
+ sample->callchain, parent, root_al);
if (ret)
return ret;

diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 7794068..6f0310a 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -3,8 +3,14 @@

#include <sys/types.h>
#include <linux/rbtree.h>
+#include <regex.h>
#include "map.h"

+extern regex_t blackbox_regex;
+extern const char *blackbox_pattern;
+extern int have_blackbox;
+
+struct addr_location;
struct branch_stack;
struct perf_evsel;
struct perf_sample;
@@ -83,7 +89,8 @@ int machine__resolve_callchain(struct machine *machine,
struct perf_evsel *evsel,
struct thread *thread,
struct perf_sample *sample,
- struct symbol **parent);
+ struct symbol **parent,
+ struct addr_location *root_al);

/*
* Default guest kernel is defined by parameter --guestkallsyms
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index cf1fe01..7024950 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1397,9 +1397,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,

if (symbol_conf.use_callchain && sample->callchain) {

-
if (machine__resolve_callchain(machine, evsel, al.thread,
- sample, NULL) != 0) {
+ sample, NULL, NULL) != 0) {
if (verbose)
error("Failed to resolve callchain. Skipping\n");
return;
--
1.8.2


2013-06-23 21:53:42

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

On Sat, Jun 22, 2013 at 11:17:20PM -0400, Greg Price wrote:
> For example, in an application with an expensive function
> implemented with deeply nested recursive calls, the default
> call-graph presentation is dominated by the different callchains
> within that function. By treating the function as a black box,
> we can collect the callchains leading into the function and
> compactly identify what to blame for expensive calls.
>
> For example, in this report the callers of garbage_collect() are
> scattered across the tree:
> $ perf report -d ruby 2>- | grep -m10 ^[^#]*[a-z]
> 22.03% ruby [.] gc_mark
> --- gc_mark
> |--59.40%-- mark_keyvalue
> | st_foreach
> | gc_mark_children
> | |--99.75%-- rb_gc_mark
> | | rb_vm_mark
> | | gc_mark_children
> | | gc_marks
> | | |--99.00%-- garbage_collect
>
> If we make garbage_collect() a black box, its callers are coalesced:
> $ perf report --blackbox garbage_collect -d ruby 2>- | grep -m10 ^[^#]*[a-z]
> 72.92% ruby [.] garbage_collect
> --- garbage_collect
> vm_xmalloc
> |--47.08%-- ruby_xmalloc
> | st_insert2
> | rb_hash_aset
> | |--98.45%-- features_index_add
> | | rb_provide_feature
> | | rb_require_safe
> | | vm_call_method

Seems useful, sort of oposite to parent option (-p)

few comments below

SNIP

> * Special case: if there's an argument left then assume that
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 67bdb9f..abec83d 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -775,8 +775,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
> sample->callchain) {
> err = machine__resolve_callchain(machine, evsel,
> al.thread, sample,
> - &parent);
> -
> + &parent, NULL);
> if (err)
> return;
> }

Any reason why not add this for top?

> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index b2ecad6..a14489c 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -11,6 +11,10 @@
> #include <stdbool.h>
> #include "unwind.h"
>
> +regex_t blackbox_regex;
> +const char *blackbox_pattern;
> +int have_blackbox = 0;

util/sort.c mich be better place for this

It could also make sense to allow sorting on this
the same way as we do for '-s parent' and report only
'[other]' and 'blackbox' entries.

Also I dont like the 'blackbox' option name, it should
complement the parent option somehow.. but no idea ;-)

I tested this on separate example and numbers seem ok.

jirka

2013-06-24 08:32:59

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph


* Jiri Olsa <[email protected]> wrote:

> It could also make sense to allow sorting on this
> the same way as we do for '-s parent' and report only
> '[other]' and 'blackbox' entries.
>
> Also I dont like the 'blackbox' option name, it should
> complement the parent option somehow.. but no idea ;-)

Looks like a nice feature.

Maybe calling it '--collapse' would be a better name?

By default the call-graphs are all expanded to maximum. With this option
certain function(s) and all their child chains can be collapsed.

--parent filters the call-chains, excluding all others that don't include
this parent. It might make sense to rename it to --filter?

It would also be nice if all these visualization variants were available
in the GTK front-end.

Thanks,

Ingo

2013-06-24 22:50:38

by Greg Price

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

On Sun, Jun 23, 2013 at 11:53:27PM +0200, Jiri Olsa wrote:
> Seems useful, sort of oposite to parent option (-p)

Cool, good to hear.


> Any reason why not add this for top?

Only because I didn't think about it. :) Seems like a good idea; I'll
add that.


> > diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> > index b2ecad6..a14489c 100644
> > --- a/tools/perf/util/machine.c
> > +++ b/tools/perf/util/machine.c
> > @@ -11,6 +11,10 @@
> > #include <stdbool.h>
> > #include "unwind.h"
> >
> > +regex_t blackbox_regex;
> > +const char *blackbox_pattern;
> > +int have_blackbox = 0;
>
> util/sort.c mich be better place for this

Sure, happy to put it there.


> It could also make sense to allow sorting on this
> the same way as we do for '-s parent' and report only
> '[other]' and 'blackbox' entries.

Could be. My inclination would be to wait for someone to show up
wanting to use that functionality, because it's not yet obvious to me
why it will be useful.


> Also I dont like the 'blackbox' option name, it should
> complement the parent option somehow.. but no idea ;-)

OK, I'll think of some alternative possible names.


Thanks for the review!

Cheers,
Greg

2013-06-24 23:14:39

by Greg Price

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

On Mon, Jun 24, 2013 at 10:32:53AM +0200, Ingo Molnar wrote:
> * Jiri Olsa <[email protected]> wrote:
> > It could also make sense to allow sorting on this
> > the same way as we do for '-s parent' and report only
> > '[other]' and 'blackbox' entries.
> >
> > Also I dont like the 'blackbox' option name, it should
> > complement the parent option somehow.. but no idea ;-)
>
> Looks like a nice feature.

Thanks, good to hear!


> Maybe calling it '--collapse' would be a better name?
>
> By default the call-graphs are all expanded to maximum. With this option
> certain function(s) and all their child chains can be collapsed.

Maybe. But what this does is actually a bit of an inverse of the
collapsing one does in the front-end (with 'C' or with enter, etc.);
unless we have --inverted/-G, the collapsing one can do interactively
is of the various *callers*, recursively, of a function, whereas this
collapses the recursive *callees*. Or in other words, interactively
one may collapse or expand a subtree of the display; with this option
one collects together and merges all the subtrees that have a given
symbol name at their respective roots.

So this is doing something that can't be achieved by just doing some
grunt work in the front-end as a user, and it might be confusing to
give it the same name as the interactive thing.

If you're investigating the kind of question that --inverted/-G is
good for answering, then this is the same thing one can do with that
grunt work, but in that case the option is less interesting. :) The
example in my commit message, which comes from a real use case, is
without -G.

Perhaps --coalesce, but that's a little mysterious. --trim-callees?
--ignore-callees? --ignore-inside? --strip-callees? --trim-inside?


> --parent filters the call-chains, excluding all others that don't include
> this parent. It might make sense to rename it to --filter?

Sure, maybe so. I don't have a firm opinion on the name or exact
semantics of --parent.


> It would also be nice if all these visualization variants were available
> in the GTK front-end.

TBH I'm not really familiar with the GTK front-end, as I mainly use
the TUI. At a quick trial, it looks like --blackbox has the expected
effect on the display there; though with or without --blackbox I can't
seem to get the entries to expand to show me a call-graph profile, so
it's hard to demonstrate it fully. Not sure what I may have done
wrong in building or running perf to make that not work (or is that
expected?)

What changes do you have in mind to make these available in the GTK
front-end?


Cheers,
Greg

2013-06-25 07:47:39

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph


* Greg Price <[email protected]> wrote:

> > It would also be nice if all these visualization variants were available
> > in the GTK front-end.
>
> TBH I'm not really familiar with the GTK front-end, as I mainly use
> the TUI. At a quick trial, it looks like --blackbox has the expected
> effect on the display there; though with or without --blackbox I can't
> seem to get the entries to expand to show me a call-graph profile, so
> it's hard to demonstrate it fully. Not sure what I may have done
> wrong in building or running perf to make that not work (or is that
> expected?)
>
> What changes do you have in mind to make these available in the GTK
> front-end?

I was thinking of something obvious like right-clicking it to make that
function back boxed away or so? Have no firm ideas - maybe the GTK gents
on Cc: know how to best integrate such features.

Thanks,

Ingo

2013-06-25 08:01:50

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

Hi,

2013-06-25 PM 4:47, Ingo Molnar wrote:
>
> * Greg Price <[email protected]> wrote:
>
>>> It would also be nice if all these visualization variants were available
>>> in the GTK front-end.
>>
>> TBH I'm not really familiar with the GTK front-end, as I mainly use
>> the TUI. At a quick trial, it looks like --blackbox has the expected
>> effect on the display there; though with or without --blackbox I can't
>> seem to get the entries to expand to show me a call-graph profile, so
>> it's hard to demonstrate it fully. Not sure what I may have done
>> wrong in building or running perf to make that not work (or is that
>> expected?)

Currently perf GTK front-end does not support callchains at all.
There's a floating patchset to enable it though.

https://lkml.org/lkml/2013/6/4/181

>>
>> What changes do you have in mind to make these available in the GTK
>> front-end?
>
> I was thinking of something obvious like right-clicking it to make that
> function back boxed away or so? Have no firm ideas - maybe the GTK gents
> on Cc: know how to best integrate such features.

We don't support context menu yet. Frankly, perf GTK code is still
premature and needs some love. Any contribution should be welcomed!

Thanks,
Namhyung

2013-06-26 01:29:00

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

Hi Greg,

On Sat, 22 Jun 2013 23:17:20 -0400, Greg Price wrote:
> For example, in an application with an expensive function
> implemented with deeply nested recursive calls, the default
> call-graph presentation is dominated by the different callchains
> within that function. By treating the function as a black box,
> we can collect the callchains leading into the function and
> compactly identify what to blame for expensive calls.
>
> For example, in this report the callers of garbage_collect() are

s/callers/callees/ ?

And it'd be better it shows more lines after garbage_collect so that one
can see its callers also to understand what it does more clearly.


> scattered across the tree:
> $ perf report -d ruby 2>- | grep -m10 ^[^#]*[a-z]
> 22.03% ruby [.] gc_mark
> --- gc_mark
> |--59.40%-- mark_keyvalue
> | st_foreach
> | gc_mark_children
> | |--99.75%-- rb_gc_mark
> | | rb_vm_mark
> | | gc_mark_children
> | | gc_marks
> | | |--99.00%-- garbage_collect
>
> If we make garbage_collect() a black box, its callers are coalesced:

Again, s/callers/callees/ ?


> $ perf report --blackbox garbage_collect -d ruby 2>- | grep -m10 ^[^#]*[a-z]
> 72.92% ruby [.] garbage_collect
> --- garbage_collect
> vm_xmalloc
> |--47.08%-- ruby_xmalloc
> | st_insert2
> | rb_hash_aset
> | |--98.45%-- features_index_add
> | | rb_provide_feature
> | | rb_require_safe
> | | vm_call_method
>
> Cc: Arnaldo Carvalho de Melo <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Paul Mackerras <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Jiri Olsa <[email protected]>
> Cc: David Ahern <[email protected]>
> Signed-off-by: Greg Price <[email protected]>
> ---
>
> On Fri, Jan 11, 2013 at 02:27:36AM -0300, Arnaldo Carvalho de Melo wrote:
>> Looks like an interesting feature, will review this soon,
>
> Rebased on top of v3.10-rc7, please take a look at your convenience.
>
>
> tools/perf/builtin-report.c | 19 ++++++++++++++++---
> tools/perf/builtin-top.c | 3 +--
> tools/perf/util/machine.c | 26 +++++++++++++++++---------
> tools/perf/util/machine.h | 9 ++++++++-
> tools/perf/util/session.c | 3 +--

You need to update the doc too.

> 5 files changed, 43 insertions(+), 17 deletions(-)
[SNIP]
> @@ -1211,8 +1214,13 @@ static int machine__resolve_callchain_sample(struct machine *machine,
> MAP__FUNCTION, ip, &al, NULL);
> if (al.sym != NULL) {
> if (sort__has_parent && !*parent &&
> - symbol__match_parent_regex(al.sym))
> + symbol__match_regex(al.sym, &parent_regex))
> *parent = al.sym;
> + else if (have_blackbox && root_al &&
> + symbol__match_regex(al.sym, &blackbox_regex)) {
> + *root_al = al;
> + callchain_cursor_reset(&callchain_cursor);

Okay, this is where the magic happens. :)

So it overwrites the original 'al' in process_sample_event() to
blackboxed symbol and drop the callchain. Wouldn't it deserve a
comment? :)


> + }
> if (!symbol_conf.use_callchain)
> break;
pp
This is unrelated to this patch, but why is this line needed? I guess
this check should be done before calling this function.

Thanks,
Namhyung

2013-06-26 22:30:12

by Greg Price

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

Hi Namhyung,

Thanks for the detailed review!


On Wed, Jun 26, 2013 at 10:28:56AM +0900, Namhyung Kim wrote:
> On Sat, 22 Jun 2013 23:17:20 -0400, Greg Price wrote:
> > For example, in an application with an expensive function
> > implemented with deeply nested recursive calls, the default
> > call-graph presentation is dominated by the different callchains
> > within that function. By treating the function as a black box,
> > we can collect the callchains leading into the function and
> > compactly identify what to blame for expensive calls.
> >
> > For example, in this report the callers of garbage_collect() are
>
> s/callers/callees/ ?

No, 'callers' is right. This report is made without -G/--inverted, so
the trees are rooted at the inmost callees (the actual values of the
IP) and each node's children are its callers, rather than vice versa.
Here we want to see who is making these expensive calls to
garbage_collect, but the answer to this question is obscured because
the relevant callchains are separated according to which internal
helper functions to garbage_collect were on the stack.


> And it'd be better it shows more lines after garbage_collect so that one
> can see its callers also to understand what it does more clearly.

If you mean the commit message, the 'after' example that follows shows
this. In the actual output (without the 'grep -m10' I've used here)
the patch doesn't affect those lines, and plenty more are in fact shown.

I could make the 'before' example longer too, and then we'd see what
the callers were in the callchains that look like
gc_mark <- mark_keyvalue <- st_foreach <- gc_mark_children
<- rb_gc_mark <- rb_vm_mark <- gc_mark_children <- gc_marks
<- garbage_collect <- (the rest of a callchain).
But the callchains that have something else instead of that particular
sequence of eight helper functions (gc_mark called by ... called by
gc_marks) inside garbage_collect won't be included -- they're shown
in other places in the tree. So it's actually precisely by using this
option that it's possible to see the callers completely, rather than
scattered across many places.


> > scattered across the tree:
> > $ perf report -d ruby 2>- | grep -m10 ^[^#]*[a-z]
> > 22.03% ruby [.] gc_mark
> > --- gc_mark
> > |--59.40%-- mark_keyvalue
> > | st_foreach
> > | gc_mark_children
> > | |--99.75%-- rb_gc_mark
> > | | rb_vm_mark
> > | | gc_mark_children
> > | | gc_marks
> > | | |--99.00%-- garbage_collect
> >
> > If we make garbage_collect() a black box, its callers are coalesced:
>
> Again, s/callers/callees/ ?

Same as above.


> > $ perf report --blackbox garbage_collect -d ruby 2>- | grep -m10 ^[^#]*[a-z]
> > 72.92% ruby [.] garbage_collect
> > --- garbage_collect
> > vm_xmalloc
> > |--47.08%-- ruby_xmalloc
> > | st_insert2
> > | rb_hash_aset
> > | |--98.45%-- features_index_add
> > | | rb_provide_feature
> > | | rb_require_safe
> > | | vm_call_method


[snip]
> > tools/perf/builtin-report.c | 19 ++++++++++++++++---
> > tools/perf/builtin-top.c | 3 +--
> > tools/perf/util/machine.c | 26 +++++++++++++++++---------
> > tools/perf/util/machine.h | 9 ++++++++-
> > tools/perf/util/session.c | 3 +--
>
> You need to update the doc too.

Ah, thanks. Will do.


> > 5 files changed, 43 insertions(+), 17 deletions(-)
> [SNIP]
> > @@ -1211,8 +1214,13 @@ static int machine__resolve_callchain_sample(struct machine *machine,
> > MAP__FUNCTION, ip, &al, NULL);
> > if (al.sym != NULL) {
> > if (sort__has_parent && !*parent &&
> > - symbol__match_parent_regex(al.sym))
> > + symbol__match_regex(al.sym, &parent_regex))
> > *parent = al.sym;
> > + else if (have_blackbox && root_al &&
> > + symbol__match_regex(al.sym, &blackbox_regex)) {
> > + *root_al = al;
> > + callchain_cursor_reset(&callchain_cursor);
>
> Okay, this is where the magic happens. :)

Indeed! :)

> So it overwrites the original 'al' in process_sample_event() to
> blackboxed symbol and drop the callchain. Wouldn't it deserve a
> comment? :)

I could do that. Perhaps something like
/* ignore the callchain we had so far, i.e. this symbol's callees */
Sound like what you had in mind?



> > + }
> > if (!symbol_conf.use_callchain)
> > break;
> pp
> This is unrelated to this patch, but why is this line needed? I guess
> this check should be done before calling this function.

Hmm. We actually can get into this function when
!symbol_conf.use_callchain, if we're using say --sort=parent. But I'm
still somewhat puzzled, because in that case it looks like we'll break
the loop after the first address with a symbol, even if we didn't find
the 'parent' there, which seems like it wouldn't serve the purpose.
Probably I'm still missing something.

FWIW, this logic has worked essentially the same way since v2.6.31-rc4~3^2~63.

Cheers,
Greg

2013-06-26 22:41:47

by Greg Price

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

Hi Namhyung, hi Ingo,

On Tue, Jun 25, 2013 at 05:01:47PM +0900, Namhyung Kim wrote:
> >>TBH I'm not really familiar with the GTK front-end, as I mainly use
> >>the TUI. At a quick trial, it looks like --blackbox has the expected
> >>effect on the display there; though with or without --blackbox I can't
> >>seem to get the entries to expand to show me a call-graph profile, so
> >>it's hard to demonstrate it fully. Not sure what I may have done
> >>wrong in building or running perf to make that not work (or is that
> >>expected?)
>
> Currently perf GTK front-end does not support callchains at all.
> There's a floating patchset to enable it though.
>
> https://lkml.org/lkml/2013/6/4/181

OK, good to know I didn't simply mess something up. So probably best
to leave any specific --blackbox-related support aside until that
patchset lands.


> >>What changes do you have in mind to make these available in the GTK
> >>front-end?
> >
> >I was thinking of something obvious like right-clicking it to make that
> >function back boxed away or so? Have no firm ideas - maybe the GTK gents
> >on Cc: know how to best integrate such features.
>
> We don't support context menu yet. Frankly, perf GTK code is still
> premature and needs some love. Any contribution should be welcomed!

Noted.

Cheers,
Greg

2013-06-27 04:58:19

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

On Wed, 26 Jun 2013 18:25:01 -0400, Greg Price wrote:
> Hi Namhyung,
>
> Thanks for the detailed review!
>
>
> On Wed, Jun 26, 2013 at 10:28:56AM +0900, Namhyung Kim wrote:
>> On Sat, 22 Jun 2013 23:17:20 -0400, Greg Price wrote:
>> > For example, in an application with an expensive function
>> > implemented with deeply nested recursive calls, the default
>> > call-graph presentation is dominated by the different callchains
>> > within that function. By treating the function as a black box,
>> > we can collect the callchains leading into the function and
>> > compactly identify what to blame for expensive calls.
>> >
>> > For example, in this report the callers of garbage_collect() are
>>
>> s/callers/callees/ ?
>
> No, 'callers' is right. This report is made without -G/--inverted, so
> the trees are rooted at the inmost callees (the actual values of the
> IP) and each node's children are its callers, rather than vice versa.
> Here we want to see who is making these expensive calls to
> garbage_collect, but the answer to this question is obscured because
> the relevant callchains are separated according to which internal
> helper functions to garbage_collect were on the stack.
>
>
>> And it'd be better it shows more lines after garbage_collect so that one
>> can see its callers also to understand what it does more clearly.
>
> If you mean the commit message, the 'after' example that follows shows
> this. In the actual output (without the 'grep -m10' I've used here)
> the patch doesn't affect those lines, and plenty more are in fact shown.
>
> I could make the 'before' example longer too, and then we'd see what
> the callers were in the callchains that look like
> gc_mark <- mark_keyvalue <- st_foreach <- gc_mark_children
> <- rb_gc_mark <- rb_vm_mark <- gc_mark_children <- gc_marks
> <- garbage_collect <- (the rest of a callchain).
> But the callchains that have something else instead of that particular
> sequence of eight helper functions (gc_mark called by ... called by
> gc_marks) inside garbage_collect won't be included -- they're shown
> in other places in the tree. So it's actually precisely by using this
> option that it's possible to see the callers completely, rather than
> scattered across many places.

Okay, I can see your point now. Thanks for the explanation.


>> [SNIP]
>> > @@ -1211,8 +1214,13 @@ static int machine__resolve_callchain_sample(struct machine *machine,
>> > MAP__FUNCTION, ip, &al, NULL);
>> > if (al.sym != NULL) {
>> > if (sort__has_parent && !*parent &&
>> > - symbol__match_parent_regex(al.sym))
>> > + symbol__match_regex(al.sym, &parent_regex))
>> > *parent = al.sym;
>> > + else if (have_blackbox && root_al &&
>> > + symbol__match_regex(al.sym, &blackbox_regex)) {
>> > + *root_al = al;
>> > + callchain_cursor_reset(&callchain_cursor);
>>
>> Okay, this is where the magic happens. :)
>
> Indeed! :)
>
>> So it overwrites the original 'al' in process_sample_event() to
>> blackboxed symbol and drop the callchain. Wouldn't it deserve a
>> comment? :)
>
> I could do that. Perhaps something like
> /* ignore the callchain we had so far, i.e. this symbol's callees */
> Sound like what you had in mind?

More important thing is, I think, it updates the original al (root_al)
so that the perf will see the new symbol as if it was sampled in the
first place and it will collect all of its callers in one place.

>
>
>> > + }
>> > if (!symbol_conf.use_callchain)
>> > break;
>> pp
>> This is unrelated to this patch, but why is this line needed? I guess
>> this check should be done before calling this function.
>
> Hmm. We actually can get into this function when
> !symbol_conf.use_callchain, if we're using say --sort=parent. But I'm
> still somewhat puzzled, because in that case it looks like we'll break
> the loop after the first address with a symbol, even if we didn't find
> the 'parent' there, which seems like it wouldn't serve the purpose.

Right, that's what I want to say.

We already have the check before calling this function so breaking the
loop after checking only first callchain node looks strange. If we
don't want to see callchains but only parents, it should either check
every callchain nodes or fail out as an unsupported operation IMHO.

Thanks,
Namhyung


> Probably I'm still missing something.
>
> FWIW, this logic has worked essentially the same way since v2.6.31-rc4~3^2~63.
>
> Cheers,
> Greg

2013-07-01 14:07:25

by Greg Price

[permalink] [raw]
Subject: Re: [PATCH] perf report: Add option to collapse undesired parts of call graph

On Thu, Jun 27, 2013 at 01:58:16PM +0900, Namhyung Kim wrote:
> On Wed, 26 Jun 2013 18:25:01 -0400, Greg Price wrote:
> > On Wed, Jun 26, 2013 at 10:28:56AM +0900, Namhyung Kim wrote:
> >> On Sat, 22 Jun 2013 23:17:20 -0400, Greg Price wrote:
> >> > @@ -1211,8 +1214,13 @@ static int machine__resolve_callchain_sample(struct machine *machine,
> >> > MAP__FUNCTION, ip, &al, NULL);
> >> > if (al.sym != NULL) {
> >> > if (sort__has_parent && !*parent &&
> >> > - symbol__match_parent_regex(al.sym))
> >> > + symbol__match_regex(al.sym, &parent_regex))
> >> > *parent = al.sym;
> >> > + else if (have_blackbox && root_al &&
> >> > + symbol__match_regex(al.sym, &blackbox_regex)) {
> >> > + *root_al = al;
> >> > + callchain_cursor_reset(&callchain_cursor);
> >>
> >> Okay, this is where the magic happens. :)
> >
> > Indeed! :)
> >
> >> So it overwrites the original 'al' in process_sample_event() to
> >> blackboxed symbol and drop the callchain. Wouldn't it deserve a
> >> comment? :)
> >
> > I could do that. Perhaps something like
> > /* ignore the callchain we had so far, i.e. this symbol's callees */
> > Sound like what you had in mind?
>
> More important thing is, I think, it updates the original al (root_al)
> so that the perf will see the new symbol as if it was sampled in the
> first place and it will collect all of its callers in one place.

OK, I'll try to capture that in a comment in v2.



> >> > + }
> >> > if (!symbol_conf.use_callchain)
> >> > break;
> >> pp
> >> This is unrelated to this patch, but why is this line needed? I guess
> >> this check should be done before calling this function.
> >
> > Hmm. We actually can get into this function when
> > !symbol_conf.use_callchain, if we're using say --sort=parent. But I'm
> > still somewhat puzzled, because in that case it looks like we'll break
> > the loop after the first address with a symbol, even if we didn't find
> > the 'parent' there, which seems like it wouldn't serve the purpose.
>
> Right, that's what I want to say.
>
> We already have the check before calling this function so breaking the
> loop after checking only first callchain node looks strange. If we
> don't want to see callchains but only parents, it should either check
> every callchain nodes or fail out as an unsupported operation IMHO.

I agree. Will reply momentarily with a patch.

I actually wrote a version that tries to keep the optimization, but
decided it was too complicated for what it gains us.

Cheers,
Greg

2013-07-01 14:08:55

by Greg Price

[permalink] [raw]
Subject: [PATCH] perf report: Fix bug in case "--no-call-graph -p foo"

The loop in machine__resolve_callchain_sample, which examines each
element of the callchain for several purposes, contains an optimization
which is intended to bail out early in case we have found the "parent"
and we aren't using the callchain for anything else.

But since v2.6.31-rc4~3^2~45 we have displayed callchains by default
when they are present, which makes that situation unusual. And in
fact in v2.6.31-rc4~3^2~63 we had already broken this optimization so
that it completely messes up the output if it applies: if we're
looking for the parent and not otherwise using the callchain, we bail
out after the first entry which we can resolve to a symbol, even if it
doesn't match the parent pattern (as it usually won't.) So it must
really be unusual to see this optimization apply, or someone would
have fixed the bug by now.

Therefore just kill the optimization.

Example before this patch:

$ perf report -p ^vm_ 2>- | grep Event
# Event count (approx.): 6392784226

$ perf report -p ^vm_ --no-call-graph 2>- | grep Event
# Event count (approx.): 54639399

After this patch:

$ perf report -p ^vm_ 2>- | grep Event
# Event count (approx.): 6392784226

$ perf report -p ^vm_ --no-call-graph 2>- | grep Event
# Event count (approx.): 6392784226

Reported-by: Namhyung Kim <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Signed-off-by: Greg Price <[email protected]>
---

The relevant previous thread:

On Thu, Jun 27, 2013 at 01:58:16PM +0900, Namhyung Kim wrote:
> On Wed, 26 Jun 2013 18:25:01 -0400, Greg Price wrote:
> > On Wed, Jun 26, 2013 at 10:28:56AM +0900, Namhyung Kim wrote:
> >> On Sat, 22 Jun 2013 23:17:20 -0400, Greg Price wrote:
> >> > + }
> >> > if (!symbol_conf.use_callchain)
> >> > break;
> >> pp
> >> This is unrelated to this patch, but why is this line needed? I guess
> >> this check should be done before calling this function.
> >
> > Hmm. We actually can get into this function when
> > !symbol_conf.use_callchain, if we're using say --sort=parent. But I'm
> > still somewhat puzzled, because in that case it looks like we'll break
> > the loop after the first address with a symbol, even if we didn't find
> > the 'parent' there, which seems like it wouldn't serve the purpose.
>
> Right, that's what I want to say.
>
> We already have the check before calling this function so breaking the
> loop after checking only first callchain node looks strange. If we
> don't want to see callchains but only parents, it should either check
> every callchain nodes or fail out as an unsupported operation IMHO.

Cheers,
Greg


tools/perf/util/machine.c | 2 --
1 file changed, 2 deletions(-)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 4618e03..3bd8912 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1217,8 +1217,6 @@ static int machine__resolve_callchain_sample(struct machine *machine,
*root_al = al;
callchain_cursor_reset(&callchain_cursor);
}
- if (!symbol_conf.use_callchain)
- break;
}

err = callchain_cursor_append(&callchain_cursor,
--
1.8.2