v1->v2:
- Include a compilation fix patch and a code streamlining patch
into the patch set.
- Use the __stringify() macro in stringify.h instead of adding a
duplicate macro.
- Add the --max-stack option to perf-top as well.
This perf patch set contains the following changes:
Patch 1 - Fix a perf tool compilation error that happens on a SLES 11
SP3 system.
Patch 2 - Streamline the append_chain() function to make it run a bit
faster.
Patch 3 - Add a --max-stack option to perf-report to speed up its
processing at the expense of less backtrace information
available.
Patch 4 - Add a similar --max-stack option to perf-top.
Waiman Long (4):
perf: Fix potential compilation error with some compilers
perf: streamline append_chain() function
perf-report: add --max-stack option to limit callchain stack scan
perf-top: add --max-stack option to limit callchain stack scan
tools/perf/Documentation/perf-report.txt | 8 +++++++
tools/perf/Documentation/perf-top.txt | 8 +++++++
tools/perf/builtin-report.c | 22 +++++++++++++++----
tools/perf/builtin-top.c | 9 ++++++-
tools/perf/util/callchain.c | 9 +++----
tools/perf/util/machine.c | 14 ++++++++----
tools/perf/util/machine.h | 3 +-
.../perf/util/scripting-engines/trace-event-perl.c | 6 ++++-
tools/perf/util/session.c | 3 +-
tools/perf/util/top.h | 1 +
10 files changed, 63 insertions(+), 20 deletions(-)
Building the perf tool failed on a SLES 11 SP3 system with the
following compilation error:
cc1: warnings being treated as errors
util/scripting-engines/trace-event-perl.c: In function
‘perl_process_tracepoint’:
util/scripting-engines/trace-event-perl.c:285: error: format ‘%lu’
expects type ‘long unsigned int’, but argument 2 has type ‘__u64’
This patch replaces PRIu64, which expands to "lu" here, with an explicit
"llu" to fix this problem, as __u64 is of type "unsigned long long".
Signed-off-by: Waiman Long <[email protected]>
---
.../perf/util/scripting-engines/trace-event-perl.c | 6 +++++-
1 files changed, 5 insertions(+), 1 deletions(-)
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index a85e4ae..d6eb9c5 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -281,8 +281,12 @@ static void perl_process_tracepoint(union perf_event *perf_event __maybe_unused,
return;
event = find_cache_event(evsel);
+ /*
+ * attr.config is a __u64 which requires "%llu" to avoid compilation
+ * error/warning with some compilers.
+ */
if (!event)
- die("ug! no event found for type %" PRIu64, evsel->attr.config);
+ die("ug! no event found for type %llu", evsel->attr.config);
pid = raw_field_value(event, "common_pid", data);
--
1.7.1
When callgraph data is included in the perf data file, it may take a
long time to scan all of that data and merge it together, especially
if the stored callchains are long and the perf data file itself is
large, like a Gbyte or so.
The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
This is a large value. Usually the callgraph data that developers are
most interested in are the first few levels; the rest is usually
not looked at.
This patch adds a new --max-stack option to perf-report to limit the
depth of callchain stack data to look at, reducing the time it takes
for perf-report to finish its processing. It trades the presence of
trailing stack information for faster speed.
The following table shows the elapsed time of doing perf-report on a
perf.data file of size 985,531,828 bytes.
--max-stack Elapsed Time Output data size
----------- ------------ ----------------
not set 88.0s 124,422,651
64 87.5s 116,303,213
32 87.2s 112,023,804
16 86.6s 94,326,380
8 59.9s 33,697,248
4 40.7s 10,116,637
-g none 27.1s 2,555,810
Signed-off-by: Waiman Long <[email protected]>
---
tools/perf/Documentation/perf-report.txt | 8 ++++++++
tools/perf/builtin-report.c | 22 +++++++++++++++++-----
tools/perf/builtin-top.c | 3 ++-
tools/perf/util/machine.c | 14 +++++++++-----
tools/perf/util/machine.h | 3 ++-
tools/perf/util/session.c | 3 ++-
6 files changed, 40 insertions(+), 13 deletions(-)
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 2b8097e..be3f196 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -135,6 +135,14 @@ OPTIONS
Default: fractal,0.5,callee,function.
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. This is a trade-off
+ between information loss and faster processing especially for
+ workloads that can have a very long callchain stack.
+
+ Default: 127
+
-G::
--inverted::
alias for inverted caller based call graph.
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 72eae74..d0c9504 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -47,6 +47,7 @@ struct perf_report {
bool show_threads;
bool inverted_callchain;
bool mem_mode;
+ int max_stack;
struct perf_read_values show_threads_values;
const char *pretty_printing_style;
const char *cpu_list;
@@ -88,7 +89,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain) &&
sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent, al);
+ sample, &parent, al,
+ rep->max_stack);
if (err)
return err;
}
@@ -179,7 +181,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain)
&& sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent, al);
+ sample, &parent, al,
+ rep->max_stack);
if (err)
return err;
}
@@ -242,18 +245,21 @@ out:
return err;
}
-static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
+static int perf_evsel__add_hist_entry(struct perf_tool *tool,
+ struct perf_evsel *evsel,
struct addr_location *al,
struct perf_sample *sample,
struct machine *machine)
{
+ struct perf_report *rep = container_of(tool, struct perf_report, tool);
struct symbol *parent = NULL;
int err = 0;
struct hist_entry *he;
if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent, al);
+ sample, &parent, al,
+ rep->max_stack);
if (err)
return err;
}
@@ -330,7 +336,8 @@ static int process_sample_event(struct perf_tool *tool,
if (al.map != NULL)
al.map->dso->hit = 1;
- ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine);
+ ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
+ machine);
if (ret < 0)
pr_debug("problem incrementing symbol period, skipping event\n");
}
@@ -757,6 +764,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
.ordered_samples = true,
.ordering_requires_timestamps = true,
},
+ .max_stack = PERF_MAX_STACK_DEPTH,
.pretty_printing_style = "normal",
};
const struct option options[] = {
@@ -797,6 +805,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
"Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
"Default: fractal,0.5,callee,function", &parse_callchain_opt, callchain_default_opt),
+ OPT_INTEGER(0, "max-stack", &report.max_stack,
+ "Set the maximum stack depth when parsing the callchain, "
+ "anything beyond the specified depth will be ignored. "
+ "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
"alias for inverted call graph"),
OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 2122141..2725aca 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -771,7 +771,8 @@ static void perf_event__process_sample(struct perf_tool *tool,
sample->callchain) {
err = machine__resolve_callchain(machine, evsel,
al.thread, sample,
- &parent, &al);
+ &parent, &al,
+ PERF_MAX_STACK_DEPTH);
if (err)
return;
}
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 6188d28..9617c4a 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1267,10 +1267,12 @@ static int machine__resolve_callchain_sample(struct machine *machine,
struct thread *thread,
struct ip_callchain *chain,
struct symbol **parent,
- struct addr_location *root_al)
+ struct addr_location *root_al,
+ int max_stack)
{
u8 cpumode = PERF_RECORD_MISC_USER;
- unsigned int i;
+ int chain_nr = min(max_stack, (int)chain->nr);
+ int i;
int err;
callchain_cursor_reset(&callchain_cursor);
@@ -1280,7 +1282,7 @@ static int machine__resolve_callchain_sample(struct machine *machine,
return 0;
}
- for (i = 0; i < chain->nr; i++) {
+ for (i = 0; i < chain_nr; i++) {
u64 ip;
struct addr_location al;
@@ -1352,12 +1354,14 @@ int machine__resolve_callchain(struct machine *machine,
struct thread *thread,
struct perf_sample *sample,
struct symbol **parent,
- struct addr_location *root_al)
+ struct addr_location *root_al,
+ int max_stack)
{
int ret;
ret = machine__resolve_callchain_sample(machine, thread,
- sample->callchain, parent, root_al);
+ sample->callchain, parent,
+ root_al, max_stack);
if (ret)
return ret;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 58a6be1..d09cce0 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -91,7 +91,8 @@ int machine__resolve_callchain(struct machine *machine,
struct thread *thread,
struct perf_sample *sample,
struct symbol **parent,
- struct addr_location *root_al);
+ struct addr_location *root_al,
+ int max_stack);
/*
* Default guest kernel is defined by parameter --guestkallsyms
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 568b750..96e5449 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1525,7 +1525,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
if (symbol_conf.use_callchain && sample->callchain) {
if (machine__resolve_callchain(machine, evsel, al.thread,
- sample, NULL, NULL) != 0) {
+ sample, NULL, NULL,
+ PERF_MAX_STACK_DEPTH) != 0) {
if (verbose)
error("Failed to resolve callchain. Skipping\n");
return;
--
1.7.1
When the callgraph function is enabled (-G), it may take a long time to
scan all the stack data and merge them accordingly.
This patch adds a new --max-stack option to perf-top to limit the depth
of callchain stack data to look at to reduce the time it takes for
perf-top to finish its processing. It reduces the amount of information
provided to the user in exchange for faster speed.
Signed-off-by: Waiman Long <[email protected]>
---
tools/perf/Documentation/perf-top.txt | 8 ++++++++
tools/perf/builtin-top.c | 8 ++++++--
tools/perf/util/top.h | 1 +
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 58d6598..3fd911c 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -155,6 +155,14 @@ Default is to monitor all CPUS.
Default: fractal,0.5,callee.
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. This is a trade-off
+ between information loss and faster processing especially for
+ workloads that can have a very long callchain stack.
+
+ Default: 127
+
--ignore-callees=<regex>::
Ignore callees of the function(s) matching the given regex.
This has the effect of collecting the callers of each such
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 2725aca..14902b0 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -772,7 +772,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
err = machine__resolve_callchain(machine, evsel,
al.thread, sample,
&parent, &al,
- PERF_MAX_STACK_DEPTH);
+ top->max_stack);
if (err)
return;
}
@@ -1052,10 +1052,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
.user_freq = UINT_MAX,
.user_interval = ULLONG_MAX,
.freq = 4000, /* 4 KHz */
- .target = {
+ .target = {
.uses_mmap = true,
},
},
+ .max_stack = PERF_MAX_STACK_DEPTH,
.sym_pcnt_filter = 5,
};
struct perf_record_opts *opts = &top.record_opts;
@@ -1110,6 +1111,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_CALLBACK_DEFAULT('G', "call-graph", &top.record_opts,
"mode[,dump_size]", record_callchain_help,
&parse_callchain_opt, "fp"),
+ OPT_INTEGER(0, "max-stack", &top.max_stack,
+ "Set the maximum stack depth when parsing the callchain. "
+ "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
"ignore callees of these functions in call graphs",
report_parse_ignore_callees_opt),
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index b554ffc..88cfeaf 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -24,6 +24,7 @@ struct perf_top {
u64 exact_samples;
u64 guest_us_samples, guest_kernel_samples;
int print_entries, count_filter, delay_secs;
+ int max_stack;
bool hide_kernel_symbols, hide_user_symbols, zero;
bool use_tui, use_stdio;
bool kptr_restrict_warned;
--
1.7.1
When callgraph is enabled, the append_chain() function consumes a major
portion of the total CPU time. This patch tries to streamline the
append_chain() function by removing an unneeded conditional test as well
as using the ?: operator, which can be more efficient than a regular if
statement on some architectures.
Signed-off-by: Waiman Long <[email protected]>
---
tools/perf/util/callchain.c | 9 ++++-----
1 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 482f680..1e79001 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -315,6 +315,7 @@ append_chain(struct callchain_node *root,
struct callchain_list *cnode;
u64 start = cursor->pos;
bool found = false;
+ bool func_mode = (callchain_param.key == CCKEY_FUNCTION);
u64 matches;
/*
@@ -331,17 +332,15 @@ append_chain(struct callchain_node *root,
if (!node)
break;
- sym = node->sym;
+ sym = func_mode ? node->sym : NULL;
- if (cnode->ms.sym && sym &&
- callchain_param.key == CCKEY_FUNCTION) {
+ if (cnode->ms.sym && sym) {
if (cnode->ms.sym->start != sym->start)
break;
} else if (cnode->ip != node->ip)
break;
- if (!found)
- found = true;
+ found = true;
callchain_cursor_advance(cursor);
}
--
1.7.1
Em Fri, Oct 18, 2013 at 10:38:48AM -0400, Waiman Long escreveu:
> When callgraph data was included in the perf data file, it may take a
> long time to scan all those data and merge them together especially
> if the stored callchains are long and the perf data file itself is
> large, like a Gbyte or so.
>
> The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
> This is a large value. Usually the callgraph data that developers are
> most interested in are the first few levels, the rests are usually
> not looked at.
>
> This patch adds a new --max-stack option to perf-report to limit the
> depth of callchain stack data to look at to reduce the time it takes
> for perf-report to finish its processing. It trades the presence of
> trailing stack information with faster speed.
>
> The following table shows the elapsed time of doing perf-report on a
> perf.data file of size 985,531,828 bytes.
>
> --max_stack Elapsed Time Output data size
> ----------- ------------ ----------------
Please prefix lines like this (------) with a space, otherwise 'git am'
will chop off everything from that line onwards. Fixing it up now.
- Arnaldo
> not set 88.0s 124,422,651
> 64 87.5s 116,303,213
> 32 87.2s 112,023,804
> 16 86.6s 94,326,380
> 8 59.9s 33,697,248
> 4 40.7s 10,116,637
> -g none 27.1s 2,555,810
>
> Signed-off-by: Waiman Long <[email protected]>
> ---
> tools/perf/Documentation/perf-report.txt | 8 ++++++++
> tools/perf/builtin-report.c | 22 +++++++++++++++++-----
> tools/perf/builtin-top.c | 3 ++-
> tools/perf/util/machine.c | 14 +++++++++-----
> tools/perf/util/machine.h | 3 ++-
> tools/perf/util/session.c | 3 ++-
> 6 files changed, 40 insertions(+), 13 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
> index 2b8097e..be3f196 100644
> --- a/tools/perf/Documentation/perf-report.txt
> +++ b/tools/perf/Documentation/perf-report.txt
> @@ -135,6 +135,14 @@ OPTIONS
>
> Default: fractal,0.5,callee,function.
>
> +--max-stack::
> + Set the stack depth limit when parsing the callchain, anything
> + beyond the specified depth will be ignored. This is a trade-off
> + between information loss and faster processing especially for
> + workloads that can have a very long callchain stack.
> +
> + Default: 127
> +
> -G::
> --inverted::
> alias for inverted caller based call graph.
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 72eae74..d0c9504 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -47,6 +47,7 @@ struct perf_report {
> bool show_threads;
> bool inverted_callchain;
> bool mem_mode;
> + int max_stack;
> struct perf_read_values show_threads_values;
> const char *pretty_printing_style;
> const char *cpu_list;
> @@ -88,7 +89,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool,
> if ((sort__has_parent || symbol_conf.use_callchain) &&
> sample->callchain) {
> err = machine__resolve_callchain(machine, evsel, al->thread,
> - sample, &parent, al);
> + sample, &parent, al,
> + rep->max_stack);
> if (err)
> return err;
> }
> @@ -179,7 +181,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
> if ((sort__has_parent || symbol_conf.use_callchain)
> && sample->callchain) {
> err = machine__resolve_callchain(machine, evsel, al->thread,
> - sample, &parent, al);
> + sample, &parent, al,
> + rep->max_stack);
> if (err)
> return err;
> }
> @@ -242,18 +245,21 @@ out:
> return err;
> }
>
> -static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
> +static int perf_evsel__add_hist_entry(struct perf_tool *tool,
> + struct perf_evsel *evsel,
> struct addr_location *al,
> struct perf_sample *sample,
> struct machine *machine)
> {
> + struct perf_report *rep = container_of(tool, struct perf_report, tool);
> struct symbol *parent = NULL;
> int err = 0;
> struct hist_entry *he;
>
> if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
> err = machine__resolve_callchain(machine, evsel, al->thread,
> - sample, &parent, al);
> + sample, &parent, al,
> + rep->max_stack);
> if (err)
> return err;
> }
> @@ -330,7 +336,8 @@ static int process_sample_event(struct perf_tool *tool,
> if (al.map != NULL)
> al.map->dso->hit = 1;
>
> - ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine);
> + ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
> + machine);
> if (ret < 0)
> pr_debug("problem incrementing symbol period, skipping event\n");
> }
> @@ -757,6 +764,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
> .ordered_samples = true,
> .ordering_requires_timestamps = true,
> },
> + .max_stack = PERF_MAX_STACK_DEPTH,
> .pretty_printing_style = "normal",
> };
> const struct option options[] = {
> @@ -797,6 +805,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
> OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
> "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
> "Default: fractal,0.5,callee,function", &parse_callchain_opt, callchain_default_opt),
> + OPT_INTEGER(0, "max-stack", &report.max_stack,
> + "Set the maximum stack depth when parsing the callchain, "
> + "anything beyond the specified depth will be ignored. "
> + "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
> OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
> "alias for inverted call graph"),
> OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 2122141..2725aca 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -771,7 +771,8 @@ static void perf_event__process_sample(struct perf_tool *tool,
> sample->callchain) {
> err = machine__resolve_callchain(machine, evsel,
> al.thread, sample,
> - &parent, &al);
> + &parent, &al,
> + PERF_MAX_STACK_DEPTH);
> if (err)
> return;
> }
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 6188d28..9617c4a 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -1267,10 +1267,12 @@ static int machine__resolve_callchain_sample(struct machine *machine,
> struct thread *thread,
> struct ip_callchain *chain,
> struct symbol **parent,
> - struct addr_location *root_al)
> + struct addr_location *root_al,
> + int max_stack)
> {
> u8 cpumode = PERF_RECORD_MISC_USER;
> - unsigned int i;
> + int chain_nr = min(max_stack, (int)chain->nr);
> + int i;
> int err;
>
> callchain_cursor_reset(&callchain_cursor);
> @@ -1280,7 +1282,7 @@ static int machine__resolve_callchain_sample(struct machine *machine,
> return 0;
> }
>
> - for (i = 0; i < chain->nr; i++) {
> + for (i = 0; i < chain_nr; i++) {
> u64 ip;
> struct addr_location al;
>
> @@ -1352,12 +1354,14 @@ int machine__resolve_callchain(struct machine *machine,
> struct thread *thread,
> struct perf_sample *sample,
> struct symbol **parent,
> - struct addr_location *root_al)
> + struct addr_location *root_al,
> + int max_stack)
> {
> int ret;
>
> ret = machine__resolve_callchain_sample(machine, thread,
> - sample->callchain, parent, root_al);
> + sample->callchain, parent,
> + root_al, max_stack);
> if (ret)
> return ret;
>
> diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
> index 58a6be1..d09cce0 100644
> --- a/tools/perf/util/machine.h
> +++ b/tools/perf/util/machine.h
> @@ -91,7 +91,8 @@ int machine__resolve_callchain(struct machine *machine,
> struct thread *thread,
> struct perf_sample *sample,
> struct symbol **parent,
> - struct addr_location *root_al);
> + struct addr_location *root_al,
> + int max_stack);
>
> /*
> * Default guest kernel is defined by parameter --guestkallsyms
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 568b750..96e5449 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1525,7 +1525,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
> if (symbol_conf.use_callchain && sample->callchain) {
>
> if (machine__resolve_callchain(machine, evsel, al.thread,
> - sample, NULL, NULL) != 0) {
> + sample, NULL, NULL,
> + PERF_MAX_STACK_DEPTH) != 0) {
> if (verbose)
> error("Failed to resolve callchain. Skipping\n");
> return;
> --
> 1.7.1
On 10/18/13 8:38 AM, Waiman Long wrote:
> When callgraph data was included in the perf data file, it may take a
> long time to scan all those data and merge them together especially
> if the stored callchains are long and the perf data file itself is
> large, like a Gbyte or so.
>
> The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
> This is a large value. Usually the callgraph data that developers are
> most interested in are the first few levels, the rests are usually
> not looked at.
>
> This patch adds a new --max-stack option to perf-report to limit the
> depth of callchain stack data to look at to reduce the time it takes
> for perf-report to finish its processing. It trades the presence of
> trailing stack information with faster speed.
>
> The following table shows the elapsed time of doing perf-report on a
> perf.data file of size 985,531,828 bytes.
>
> --max_stack Elapsed Time Output data size
> ----------- ------------ ----------------
> not set 88.0s 124,422,651
> 64 87.5s 116,303,213
> 32 87.2s 112,023,804
> 16 86.6s 94,326,380
> 8 59.9s 33,697,248
> 4 40.7s 10,116,637
> -g none 27.1s 2,555,810
>
> Signed-off-by: Waiman Long <[email protected]>
> ---
> tools/perf/Documentation/perf-report.txt | 8 ++++++++
> tools/perf/builtin-report.c | 22 +++++++++++++++++-----
> tools/perf/builtin-top.c | 3 ++-
> tools/perf/util/machine.c | 14 +++++++++-----
> tools/perf/util/machine.h | 3 ++-
> tools/perf/util/session.c | 3 ++-
> 6 files changed, 40 insertions(+), 13 deletions(-)
>
Looks good to me. Acked-by: David Ahern <[email protected]>
On 10/18/13 8:38 AM, Waiman Long wrote:
> When the callgraph function is enabled (-G), it may take a long time to
> scan all the stack data and merge them accordingly.
>
> This patch adds a new --max-stack option to perf-top to limit the depth
> of callchain stack data to look at to reduce the time it takes for
> perf-top to finish its processing. It reduces the amount of information
> provided to the user in exchange for faster speed.
>
> Signed-off-by: Waiman Long <[email protected]>
> ---
> tools/perf/Documentation/perf-top.txt | 8 ++++++++
> tools/perf/builtin-top.c | 8 ++++++--
> tools/perf/util/top.h | 1 +
> 3 files changed, 15 insertions(+), 2 deletions(-)
>
Looks good to me. Acked-by: David Ahern <[email protected]>
Waiman Long <[email protected]> writes:
> as well as
> using ?: statement which can be more efficient than the regular if
> statement in some architectures.
I don't think that's true, the compiler does if conversion anyways for both.
But change seems reasonable.
-Andi
--
[email protected] -- Speaking for myself only
On Fri, 2013-10-18 at 10:38 -0400, Waiman Long wrote:
> When the callgraph function is enabled (-G), it may take a long time to
> scan all the stack data and merge them accordingly.
>
> This patch adds a new --max-stack option to perf-top to limit the depth
> of callchain stack data to look at to reduce the time it takes for
> perf-top to finish its processing. It reduces the amount of information
> provided to the user in exchange for faster speed.
>
> Signed-off-by: Waiman Long <[email protected]>
Tested-by: Davidlohr Bueso <[email protected]>
> ---
> tools/perf/Documentation/perf-top.txt | 8 ++++++++
> tools/perf/builtin-top.c | 8 ++++++--
> tools/perf/util/top.h | 1 +
> 3 files changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
> index 58d6598..3fd911c 100644
> --- a/tools/perf/Documentation/perf-top.txt
> +++ b/tools/perf/Documentation/perf-top.txt
> @@ -155,6 +155,14 @@ Default is to monitor all CPUS.
>
> Default: fractal,0.5,callee.
>
> +--max-stack::
> + Set the stack depth limit when parsing the callchain, anything
> + beyond the specified depth will be ignored. This is a trade-off
> + between information loss and faster processing especially for
> + workloads that can have a very long callchain stack.
> +
> + Default: 127
> +
> --ignore-callees=<regex>::
> Ignore callees of the function(s) matching the given regex.
> This has the effect of collecting the callers of each such
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 2725aca..14902b0 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -772,7 +772,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
> err = machine__resolve_callchain(machine, evsel,
> al.thread, sample,
> &parent, &al,
> - PERF_MAX_STACK_DEPTH);
> + top->max_stack);
> if (err)
> return;
> }
> @@ -1052,10 +1052,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
> .user_freq = UINT_MAX,
> .user_interval = ULLONG_MAX,
> .freq = 4000, /* 4 KHz */
> - .target = {
> + .target = {
> .uses_mmap = true,
> },
> },
> + .max_stack = PERF_MAX_STACK_DEPTH,
> .sym_pcnt_filter = 5,
> };
> struct perf_record_opts *opts = &top.record_opts;
> @@ -1110,6 +1111,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
> OPT_CALLBACK_DEFAULT('G', "call-graph", &top.record_opts,
> "mode[,dump_size]", record_callchain_help,
> &parse_callchain_opt, "fp"),
> + OPT_INTEGER(0, "max-stack", &top.max_stack,
> + "Set the maximum stack depth when parsing the callchain. "
> + "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
> OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
> "ignore callees of these functions in call graphs",
> report_parse_ignore_callees_opt),
> diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
> index b554ffc..88cfeaf 100644
> --- a/tools/perf/util/top.h
> +++ b/tools/perf/util/top.h
> @@ -24,6 +24,7 @@ struct perf_top {
> u64 exact_samples;
> u64 guest_us_samples, guest_kernel_samples;
> int print_entries, count_filter, delay_secs;
> + int max_stack;
> bool hide_kernel_symbols, hide_user_symbols, zero;
> bool use_tui, use_stdio;
> bool kptr_restrict_warned;
On 10/19/2013 08:29 PM, Andi Kleen wrote:
> Waiman Long<[email protected]> writes:
>
>> as well as
>> using ?: statement which can be more efficient than the regular if
>> statement in some architectures.
> I don't think that's true, the compiler does if conversion anyways for both.
>
> But change seems reasonable.
>
> -Andi
>
>
That may be true for a simple if statement. However, the condition was
checked as the last of 3 tests. I doubt if the compiler is able to
optimize that effectively.
-Longman
On 10/18/2013 01:17 PM, Arnaldo Carvalho de Melo wrote:
> Em Fri, Oct 18, 2013 at 10:38:48AM -0400, Waiman Long escreveu:
>> When callgraph data was included in the perf data file, it may take a
>> long time to scan all those data and merge them together especially
>> if the stored callchains are long and the perf data file itself is
>> large, like a Gbyte or so.
>>
>> The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
>> This is a large value. Usually the callgraph data that developers are
>> most interested in are the first few levels, the rests are usually
>> not looked at.
>>
>> This patch adds a new --max-stack option to perf-report to limit the
>> depth of callchain stack data to look at to reduce the time it takes
>> for perf-report to finish its processing. It trades the presence of
>> trailing stack information with faster speed.
>>
>> The following table shows the elapsed time of doing perf-report on a
>> perf.data file of size 985,531,828 bytes.
>>
>> --max_stack Elapsed Time Output data size
>> ----------- ------------ ----------------
> Please prefix lines like this (------) with a space, otherwise 'git am'
> will chop off everything from that line onwards. Fixing it up now.
>
> - Arnaldo
>
>
Thanks for spotting the problem, will fix that in the next version.
-Longman
Commit-ID: 5dbb6e81d85e55ee2b4cf523c1738e16f63e5400
Gitweb: http://git.kernel.org/tip/5dbb6e81d85e55ee2b4cf523c1738e16f63e5400
Author: Waiman Long <[email protected]>
AuthorDate: Fri, 18 Oct 2013 10:38:49 -0400
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 21 Oct 2013 17:36:25 -0300
perf top: Add --max-stack option to limit callchain stack scan
When the callgraph function is enabled (-G), it may take a long time to
scan all the stack data and merge them accordingly.
This patch adds a new --max-stack option to perf-top to limit the depth
of callchain stack data to look at to reduce the time it takes for
perf-top to finish its processing. It reduces the amount of information
provided to the user in exchange for faster speed.
Signed-off-by: Waiman Long <[email protected]>
Acked-by: David Ahern <[email protected]>
Tested-by: Davidlohr Bueso <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Stephane Eranian <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/Documentation/perf-top.txt | 8 ++++++++
tools/perf/builtin-top.c | 8 ++++++--
tools/perf/util/top.h | 1 +
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index f65777c..c16a09e 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -158,6 +158,14 @@ Default is to monitor all CPUS.
Default: fractal,0.5,callee.
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. This is a trade-off
+ between information loss and faster processing especially for
+ workloads that can have a very long callchain stack.
+
+ Default: 127
+
--ignore-callees=<regex>::
Ignore callees of the function(s) matching the given regex.
This has the effect of collecting the callers of each such
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 112cb7d..386d833 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -771,7 +771,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
err = machine__resolve_callchain(machine, evsel,
al.thread, sample,
&parent, &al,
- PERF_MAX_STACK_DEPTH);
+ top->max_stack);
if (err)
return;
}
@@ -1048,10 +1048,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
.user_freq = UINT_MAX,
.user_interval = ULLONG_MAX,
.freq = 4000, /* 4 KHz */
- .target = {
+ .target = {
.uses_mmap = true,
},
},
+ .max_stack = PERF_MAX_STACK_DEPTH,
.sym_pcnt_filter = 5,
};
struct perf_record_opts *opts = &top.record_opts;
@@ -1110,6 +1111,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_CALLBACK_DEFAULT('G', "call-graph", &top.record_opts,
"mode[,dump_size]", record_callchain_help,
&parse_callchain_opt, "fp"),
+ OPT_INTEGER(0, "max-stack", &top.max_stack,
+ "Set the maximum stack depth when parsing the callchain. "
+ "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
"ignore callees of these functions in call graphs",
report_parse_ignore_callees_opt),
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index b554ffc..88cfeaf 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -24,6 +24,7 @@ struct perf_top {
u64 exact_samples;
u64 guest_us_samples, guest_kernel_samples;
int print_entries, count_filter, delay_secs;
+ int max_stack;
bool hide_kernel_symbols, hide_user_symbols, zero;
bool use_tui, use_stdio;
bool kptr_restrict_warned;
Commit-ID: 91e95617429cb272fd908b1928a1915b37b9655f
Gitweb: http://git.kernel.org/tip/91e95617429cb272fd908b1928a1915b37b9655f
Author: Waiman Long <[email protected]>
AuthorDate: Fri, 18 Oct 2013 10:38:48 -0400
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 21 Oct 2013 17:36:25 -0300
perf report: Add --max-stack option to limit callchain stack scan
When callgraph data is included in the perf data file, it may take a
long time to scan all of that data and merge it together, especially if
the stored callchains are long and the perf data file itself is large,
like a Gbyte or so.
The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
This is a large value. Usually the callgraph data that developers are
most interested in are the first few levels; the rest are usually not
looked at.
This patch adds a new --max-stack option to perf-report to limit the
depth of callchain stack data to look at to reduce the time it takes for
perf-report to finish its processing. It trades the presence of trailing
stack information for faster speed.
The following table shows the elapsed time of doing perf-report on a
perf.data file of size 985,531,828 bytes.
--max-stack Elapsed Time Output data size
----------- ------------ ----------------
not set 88.0s 124,422,651
64 87.5s 116,303,213
32 87.2s 112,023,804
16 86.6s 94,326,380
8 59.9s 33,697,248
4 40.7s 10,116,637
-g none 27.1s 2,555,810
Signed-off-by: Waiman Long <[email protected]>
Acked-by: David Ahern <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Stephane Eranian <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/Documentation/perf-report.txt | 8 ++++++++
tools/perf/builtin-report.c | 22 +++++++++++++++++-----
tools/perf/builtin-top.c | 3 ++-
tools/perf/util/machine.c | 14 +++++++++-----
tools/perf/util/machine.h | 3 ++-
tools/perf/util/session.c | 3 ++-
6 files changed, 40 insertions(+), 13 deletions(-)
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index be5ad87..10a2798 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -141,6 +141,14 @@ OPTIONS
Default: fractal,0.5,callee,function.
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. This is a trade-off
+ between information loss and faster processing especially for
+ workloads that can have a very long callchain stack.
+
+ Default: 127
+
-G::
--inverted::
alias for inverted caller based call graph.
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index fa68a36..81addca 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -49,6 +49,7 @@ struct perf_report {
bool show_threads;
bool inverted_callchain;
bool mem_mode;
+ int max_stack;
struct perf_read_values show_threads_values;
const char *pretty_printing_style;
const char *cpu_list;
@@ -90,7 +91,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain) &&
sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent, al);
+ sample, &parent, al,
+ rep->max_stack);
if (err)
return err;
}
@@ -181,7 +183,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain)
&& sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent, al);
+ sample, &parent, al,
+ rep->max_stack);
if (err)
return err;
}
@@ -244,18 +247,21 @@ out:
return err;
}
-static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
+static int perf_evsel__add_hist_entry(struct perf_tool *tool,
+ struct perf_evsel *evsel,
struct addr_location *al,
struct perf_sample *sample,
struct machine *machine)
{
+ struct perf_report *rep = container_of(tool, struct perf_report, tool);
struct symbol *parent = NULL;
int err = 0;
struct hist_entry *he;
if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread,
- sample, &parent, al);
+ sample, &parent, al,
+ rep->max_stack);
if (err)
return err;
}
@@ -332,7 +338,8 @@ static int process_sample_event(struct perf_tool *tool,
if (al.map != NULL)
al.map->dso->hit = 1;
- ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine);
+ ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
+ machine);
if (ret < 0)
pr_debug("problem incrementing symbol period, skipping event\n");
}
@@ -772,6 +779,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
.ordered_samples = true,
.ordering_requires_timestamps = true,
},
+ .max_stack = PERF_MAX_STACK_DEPTH,
.pretty_printing_style = "normal",
};
const struct option options[] = {
@@ -812,6 +820,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
"Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
"Default: fractal,0.5,callee,function", &parse_callchain_opt, callchain_default_opt),
+ OPT_INTEGER(0, "max-stack", &report.max_stack,
+ "Set the maximum stack depth when parsing the callchain, "
+ "anything beyond the specified depth will be ignored. "
+ "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
"alias for inverted call graph"),
OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d934f70..112cb7d 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -770,7 +770,8 @@ static void perf_event__process_sample(struct perf_tool *tool,
sample->callchain) {
err = machine__resolve_callchain(machine, evsel,
al.thread, sample,
- &parent, &al);
+ &parent, &al,
+ PERF_MAX_STACK_DEPTH);
if (err)
return;
}
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 6b861ae..ea93425 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1253,10 +1253,12 @@ static int machine__resolve_callchain_sample(struct machine *machine,
struct thread *thread,
struct ip_callchain *chain,
struct symbol **parent,
- struct addr_location *root_al)
+ struct addr_location *root_al,
+ int max_stack)
{
u8 cpumode = PERF_RECORD_MISC_USER;
- unsigned int i;
+ int chain_nr = min(max_stack, (int)chain->nr);
+ int i;
int err;
callchain_cursor_reset(&callchain_cursor);
@@ -1266,7 +1268,7 @@ static int machine__resolve_callchain_sample(struct machine *machine,
return 0;
}
- for (i = 0; i < chain->nr; i++) {
+ for (i = 0; i < chain_nr; i++) {
u64 ip;
struct addr_location al;
@@ -1338,12 +1340,14 @@ int machine__resolve_callchain(struct machine *machine,
struct thread *thread,
struct perf_sample *sample,
struct symbol **parent,
- struct addr_location *root_al)
+ struct addr_location *root_al,
+ int max_stack)
{
int ret;
ret = machine__resolve_callchain_sample(machine, thread,
- sample->callchain, parent, root_al);
+ sample->callchain, parent,
+ root_al, max_stack);
if (ret)
return ret;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index d44c09b..4c1f5d5 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -92,7 +92,8 @@ int machine__resolve_callchain(struct machine *machine,
struct thread *thread,
struct perf_sample *sample,
struct symbol **parent,
- struct addr_location *root_al);
+ struct addr_location *root_al,
+ int max_stack);
/*
* Default guest kernel is defined by parameter --guestkallsyms
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 19fc716..854c5aa 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1512,7 +1512,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
if (symbol_conf.use_callchain && sample->callchain) {
if (machine__resolve_callchain(machine, evsel, al.thread,
- sample, NULL, NULL) != 0) {
+ sample, NULL, NULL,
+ PERF_MAX_STACK_DEPTH) != 0) {
if (verbose)
error("Failed to resolve callchain. Skipping\n");
return;