2015-11-17 15:05:45

by Jiri Olsa

[permalink] [raw]
Subject: [PATCH 0/3] perf tools DWARF libunwind: Add callchain order support

hi,
as reported by Milian, currently for DWARF unwind (both libdw
and libunwind) we display callchain in callee order only.

Adding the support to follow callchain order setup to libunwind
DWARF unwinder, so we could get following output for report:

$ perf record --call-graph dwarf ls
...
$ perf report --no-children --stdio

39.26% ls libc-2.21.so [.] __strcoll_l
|
---__strcoll_l
mpsort_with_tmp
mpsort_with_tmp
sort_files
main
__libc_start_main
_start
0

$ perf report -g caller --no-children --stdio
...
39.26% ls libc-2.21.so [.] __strcoll_l
|
---0
_start
__libc_start_main
main
sort_files
mpsort_with_tmp
mpsort_with_tmp
__strcoll_l

Tested on x86_64. The change is in generic code only,
so it should not affect other archs. Still it would be
nice to have some confirmation.. Wang Nan? ;-)

It'd be nice to have this for libdw unwind as well,
but it looks like it's out of reach for perf code.. Jan?

Also available in:
git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
perf/callchain_1

thanks,
jirka


Cc: Jan Kratochvil <[email protected]>
---
Jiri Olsa (3):
perf tools: Move initial entry call into get_entries function
perf tools: Add callchain order support for libunwind DWARF unwinder
perf test: Add callchain order setup for DWARF unwinder test

tools/perf/tests/dwarf-unwind.c | 22 +++++++++++++++++++---
tools/perf/util/unwind-libunwind.c | 60 +++++++++++++++++++++++++++++++++++++++---------------------
2 files changed, 58 insertions(+), 24 deletions(-)


2015-11-17 15:05:49

by Jiri Olsa

[permalink] [raw]
Subject: [PATCH 1/3] perf tools: Move initial entry call into get_entries function

Moving initial entry call into get_entries function
so all entries processing is on one place. It will
be useful for next change that adds ordering logic.

Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Jiri Olsa <[email protected]>
---
tools/perf/util/unwind-libunwind.c | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index c83832b555e5..0ae8844fe7a6 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -614,10 +614,22 @@ void unwind__finish_access(struct thread *thread)
static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
void *arg, int max_stack)
{
+ u64 val;
unw_addr_space_t addr_space;
unw_cursor_t c;
int ret;

+ ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP);
+ if (ret)
+ return ret;
+
+ ret = entry(val, ui->thread, cb, arg);
+ if (ret)
+ return -ENOMEM;
+
+ if (--max_stack == 0)
+ return 0;
+
addr_space = thread__priv(ui->thread);
if (addr_space == NULL)
return -1;
@@ -640,24 +652,17 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
struct thread *thread,
struct perf_sample *data, int max_stack)
{
- u64 ip;
struct unwind_info ui = {
.sample = data,
.thread = thread,
.machine = thread->mg->machine,
};
- int ret;

if (!data->user_regs.regs)
return -EINVAL;

- ret = perf_reg_value(&ip, &data->user_regs, PERF_REG_IP);
- if (ret)
- return ret;
-
- ret = entry(ip, thread, cb, arg);
- if (ret)
- return -ENOMEM;
+ if (max_stack <= 0)
+ return -EINVAL;

- return --max_stack > 0 ? get_entries(&ui, cb, arg, max_stack) : 0;
+ return get_entries(&ui, cb, arg, max_stack);
}
--
2.4.3

2015-11-17 15:06:29

by Jiri Olsa

[permalink] [raw]
Subject: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

From: Jiri Olsa <[email protected]>

As reported by Milian, currently for DWARF unwind (both libdw
and libunwind) we display callchain in callee order only.

Adding the support to follow callchain order setup to libunwind
DWARF unwinder, so we could get following output for report:

$ perf record --call-graph dwarf ls
...
$ perf report --no-children --stdio

39.26% ls libc-2.21.so [.] __strcoll_l
|
---__strcoll_l
mpsort_with_tmp
mpsort_with_tmp
sort_files
main
__libc_start_main
_start
0

$ perf report -g caller --no-children --stdio
...
39.26% ls libc-2.21.so [.] __strcoll_l
|
---0
_start
__libc_start_main
main
sort_files
mpsort_with_tmp
mpsort_with_tmp
__strcoll_l

Reported-by: Milian Wolff <[email protected]>
Based-on-patch-by: Milian Wolff <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Jiri Olsa <[email protected]>
---
tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 0ae8844fe7a6..705e1c19f1ea 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -615,34 +615,47 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
void *arg, int max_stack)
{
u64 val;
+ unw_word_t ips[max_stack];
unw_addr_space_t addr_space;
unw_cursor_t c;
- int ret;
+ int ret, i = 0;

ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP);
if (ret)
return ret;

- ret = entry(val, ui->thread, cb, arg);
- if (ret)
- return -ENOMEM;
+ ips[i++] = (unw_word_t) val;

- if (--max_stack == 0)
- return 0;
-
- addr_space = thread__priv(ui->thread);
- if (addr_space == NULL)
- return -1;
+ /*
+ * If we need more than one entry, do the DWARF
+ * unwind itself.
+ */
+ if (max_stack - 1 > 0) {
+ addr_space = thread__priv(ui->thread);
+ if (addr_space == NULL)
+ return -1;
+
+ ret = unw_init_remote(&c, addr_space, ui);
+ if (ret)
+ display_error(ret);
+
+ while (!ret && (unw_step(&c) > 0) && i < max_stack) {
+ unw_get_reg(&c, UNW_REG_IP, &ips[i]);
+ ++i;
+ }

- ret = unw_init_remote(&c, addr_space, ui);
- if (ret)
- display_error(ret);
+ max_stack = i;
+ }

- while (!ret && (unw_step(&c) > 0) && max_stack--) {
- unw_word_t ip;
+ /*
+ * Display what we got based on the order setup.
+ */
+ for (i = 0; i < max_stack && !ret; i++) {
+ int j = i;

- unw_get_reg(&c, UNW_REG_IP, &ip);
- ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
+ if (callchain_param.order == ORDER_CALLER)
+ j = max_stack - i - 1;
+ ret = entry(ips[j], ui->thread, cb, arg);
}

return ret;
--
2.4.3

2015-11-17 15:06:01

by Jiri Olsa

[permalink] [raw]
Subject: [PATCH 3/3] perf test: Add callchain order setup for DWARF unwinder test

Adding callchain order setup for DWARF unwinder test. The test
now runs unwinder for both callee and caller orders.

Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Jiri Olsa <[email protected]>
---
tools/perf/tests/dwarf-unwind.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c
index 07221793a3ac..05d9ba387c43 100644
--- a/tools/perf/tests/dwarf-unwind.c
+++ b/tools/perf/tests/dwarf-unwind.c
@@ -51,6 +51,12 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
"krava_1",
"test__dwarf_unwind"
};
+ /*
+ * The funcs[MAX_STACK] array index, based on the
+ * callchain order setup.
+ */
+ int idx = callchain_param.order == ORDER_CALLER ?
+ MAX_STACK - *cnt - 1 : *cnt;

if (*cnt >= MAX_STACK) {
pr_debug("failed: crossed the max stack value %d\n", MAX_STACK);
@@ -63,8 +69,10 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
return -1;
}

- pr_debug("got: %s 0x%" PRIx64 "\n", symbol, entry->ip);
- return strcmp((const char *) symbol, funcs[(*cnt)++]);
+ (*cnt)++;
+ pr_debug("got: %s 0x%" PRIx64 ", expecting %s\n",
+ symbol, entry->ip, funcs[idx]);
+ return strcmp((const char *) symbol, funcs[idx]);
}

__attribute__ ((noinline))
@@ -105,8 +113,16 @@ static int compare(void *p1, void *p2)
/* Any possible value should be 'thread' */
struct thread *thread = *(struct thread **)p1;

- if (global_unwind_retval == -INT_MAX)
+ if (global_unwind_retval == -INT_MAX) {
+ /* Call unwinder twice for both callchain orders. */
+ callchain_param.order = ORDER_CALLER;
+
global_unwind_retval = unwind_thread(thread);
+ if (!global_unwind_retval) {
+ callchain_param.order = ORDER_CALLEE;
+ global_unwind_retval = unwind_thread(thread);
+ }
+ }

return p1 - p2;
}
--
2.4.3

2015-11-18 04:13:40

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder



On 2015/11/17 23:05, Jiri Olsa wrote:
> From: Jiri Olsa <[email protected]>
>
> As reported by Milian, currently for DWARF unwind (both libdw
> and libunwind) we display callchain in callee order only.
>
> Adding the support to follow callchain order setup to libunwind
> DWARF unwinder, so we could get following output for report:
>
> $ perf record --call-graph dwarf ls
> ...
> $ perf report --no-children --stdio
>
> 39.26% ls libc-2.21.so [.] __strcoll_l
> |
> ---__strcoll_l
> mpsort_with_tmp
> mpsort_with_tmp
> sort_files
> main
> __libc_start_main
> _start
> 0
>
> $ perf report -g caller --no-children --stdio
> ...
> 39.26% ls libc-2.21.so [.] __strcoll_l
> |
> ---0
> _start
> __libc_start_main
> main
> sort_files
> mpsort_with_tmp
> mpsort_with_tmp
> __strcoll_l
>
> Reported-by: Milian Wolff <[email protected]>
> Based-on-patch-by: Milian Wolff <[email protected]>
> Link: http://lkml.kernel.org/n/[email protected]
> Signed-off-by: Jiri Olsa <[email protected]>
> ---
> tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
> 1 file changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
> index 0ae8844fe7a6..705e1c19f1ea 100644
> --- a/tools/perf/util/unwind-libunwind.c
> +++ b/tools/perf/util/unwind-libunwind.c

[SNIP]

>
> - unw_get_reg(&c, UNW_REG_IP, &ip);
> - ret = ip ? entry(ip, ui->thread, cb, arg) : 0;

In original code if ip == 0 entry() won't be called.

> + if (callchain_param.order == ORDER_CALLER)
> + j = max_stack - i - 1;
> + ret = entry(ips[j], ui->thread, cb, arg);

But in new code event if ips[j] == 0 an entry will be built, which causes
a behavior changes user noticable:

Before this patch:


# perf report --no-children --stdio --call-graph=callee
...
3.38% a.out a.out [.] funcc
|
---funcc
|
--2.70%-- funcb
funca
main
__libc_start_main
_start

After this patch:

# perf report --no-children --stdio --call-graph=callee
...
3.38% a.out a.out [.] funcc
|
---funcc
|
|--2.70%-- funcb
| funca
| main
| __libc_start_main
| _start
|
--0.68%-- 0


I'm not sure whether we can regard this behavior changing as a bugfix? I
think
there may be some reason the original code explicitly avoid creating an '0'
entry.

Then I tried to find why perf can't get call frame on my case, and
I guess there's something wrong whe dealing with 'call' command, because
the instruction on it I can't get callchain from libunwind is a 'callq':

...
4005bf: be 00 00 00 00 mov $0x0,%esi
4005c4: 48 89 c7 mov %rax,%rdi
4005c7: e8 74 fe ff ff callq 400440 <gettimeofday@plt>
us2 = tv2.tv_sec * 1000000 + tv2.tv_usec;
4005cc: 48 8b 04 24 mov (%rsp),%rax
...

But this is another problem, we can discuss it in a new thread.

Thank you.


2015-11-18 04:22:49

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 0/3] perf tools DWARF libunwind: Add callchain order support

Hi Jiri,


On 2015/11/17 23:05, Jiri Olsa wrote:
> hi,
> as reported by Milian, currently for DWARF unwind (both libdw
> and libunwind) we display callchain in callee order only.
>
> Adding the support to follow callchain order setup to libunwind
> DWARF unwinder, so we could get following output for report:
>
> $ perf record --call-graph dwarf ls
> ...
> $ perf report --no-children --stdio
>
> 39.26% ls libc-2.21.so [.] __strcoll_l
> |
> ---__strcoll_l
> mpsort_with_tmp
> mpsort_with_tmp
> sort_files
> main
> __libc_start_main
> _start
> 0
>
> $ perf report -g caller --no-children --stdio
> ...
> 39.26% ls libc-2.21.so [.] __strcoll_l
> |
> ---0
> _start
> __libc_start_main
> main
> sort_files
> mpsort_with_tmp
> mpsort_with_tmp
> __strcoll_l
>
> Tested on x86_64. The change is in generic code only,
> so it should not affect other archs. Still it would be
> nice to have some confirmation.. Wang Nan? ;-)
>
> It'd be nice to have this for libdw unwind as well,
> but it looks like it's out of reach for perf code.. Jan?
>
> Also available in:
> git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> perf/callchain_1


Thanks for notifying me about this. I have tested it in my environment.

It works well for me except a small behavior changing. Please see below.

Before applying these patch set:

# perf report --no-children --stdio --call-graph=callee
# Overhead Command Shared Object Symbol
# ........ ....... ................ .........................
#
96.61% a.out [vdso] [.] __vdso_gettimeofday
|
---__vdso_gettimeofday
funcc
funcb
funca
main
__libc_start_main
_start

3.38% a.out a.out [.] funcc
|
---funcc
|
--2.70%-- funcb
funca
main
__libc_start_main
_start

0.02% pref_re [kernel.vmlinux] [k] sched_clock
|
---sched_clock
perf_event_nmi_handler
nmi_handle
...

And caller:

# ./perf report --no-children --stdio --call-graph=caller
# Overhead Command Shared Object Symbol
# ........ ....... ................ .........................
#
96.61% a.out [vdso] [.] __vdso_gettimeofday
|
---__vdso_gettimeofday
funcc
funcb
funca
main
__libc_start_main
_start

3.38% a.out a.out [.] funcc
|
---funcc
|
--2.70%-- funcb
funca
main
__libc_start_main
_start

0.02% pref_re [kernel.vmlinux] [k] sched_clock
|
---return_from_execve
sys_execve
do_execveat_common.isra.27


The user code part of output are identical so I confirm the bug.

After applying this patchset:

# ./perf report --no-children --stdio --call-graph=callee
# Overhead Command Shared Object Symbol
# ........ ....... ................ .........................
#
96.61% a.out [vdso] [.] __vdso_gettimeofday
|
---__vdso_gettimeofday
funcc
funcb
funca
main
__libc_start_main
_start

3.38% a.out a.out [.] funcc
|
---funcc
|
|--2.70%-- funcb
| funca
| main
| __libc_start_main
| _start
|
--0.68%-- 0
0.02% pref_re [kernel.vmlinux] [k] sched_clock
|
---sched_clock
perf_event_nmi_handler
...

And caller:

# ./perf report --no-children --stdio --call-graph=caller
# Overhead Command Shared Object Symbol
# ........ ....... ................ .........................
#
96.61% a.out [vdso] [.] __vdso_gettimeofday
|
---_start
__libc_start_main
main
funca
funcb
funcc
__vdso_gettimeofday

3.38% a.out a.out [.] funcc
|
|--2.70%-- _start
| __libc_start_main
| main
| funca
| funcb
| funcc
|
--0.68%-- 0
funcc

0.02% pref_re [kernel.vmlinux] [k] sched_clock
|
---return_from_execve
sys_execve
...

It fixes the bug. However, do you see the extra "0.68%-- 0" in the tree?

I give a message on patch 2/3, please have a look. I think this change
would be okay for me if we treat the old behavior as a bug (for example:
sum of all branches not equal to the overhead of itself). However, the
original code explicitly avoid generating '0' entry so I think we
should make it clear.

Thank you.


> thanks,
> jirka
>
>
> Cc: Jan Kratochvil <[email protected]>
> ---
> Jiri Olsa (3):
> perf tools: Move initial entry call into get_entries function
> perf tools: Add callchain order support for libunwind DWARF unwinder
> perf test: Add callchain order setup for DWARF unwinder test
>
> tools/perf/tests/dwarf-unwind.c | 22 +++++++++++++++++++---
> tools/perf/util/unwind-libunwind.c | 60 +++++++++++++++++++++++++++++++++++++++---------------------
> 2 files changed, 58 insertions(+), 24 deletions(-)

2015-11-18 05:41:18

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:
>
>
> On 2015/11/17 23:05, Jiri Olsa wrote:
> >From: Jiri Olsa <[email protected]>
> >
> >As reported by Milian, currently for DWARF unwind (both libdw
> >and libunwind) we display callchain in callee order only.
> >
> >Adding the support to follow callchain order setup to libunwind
> >DWARF unwinder, so we could get following output for report:
> >
> > $ perf record --call-graph dwarf ls
> > ...
> > $ perf report --no-children --stdio
> >
> > 39.26% ls libc-2.21.so [.] __strcoll_l
> > |
> > ---__strcoll_l
> > mpsort_with_tmp
> > mpsort_with_tmp
> > sort_files
> > main
> > __libc_start_main
> > _start
> > 0
> >
> > $ perf report -g caller --no-children --stdio
> > ...
> > 39.26% ls libc-2.21.so [.] __strcoll_l
> > |
> > ---0
> > _start
> > __libc_start_main
> > main
> > sort_files
> > mpsort_with_tmp
> > mpsort_with_tmp
> > __strcoll_l
> >
> >Reported-by: Milian Wolff <[email protected]>
> >Based-on-patch-by: Milian Wolff <[email protected]>
> >Link: http://lkml.kernel.org/n/[email protected]
> >Signed-off-by: Jiri Olsa <[email protected]>
> >---
> > tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
> > 1 file changed, 30 insertions(+), 17 deletions(-)
> >
> >diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
> >index 0ae8844fe7a6..705e1c19f1ea 100644
> >--- a/tools/perf/util/unwind-libunwind.c
> >+++ b/tools/perf/util/unwind-libunwind.c
>
> [SNIP]
>
> >- unw_get_reg(&c, UNW_REG_IP, &ip);
> >- ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
>
> In original code if ip == 0 entry() won't be called.
>
> >+ if (callchain_param.order == ORDER_CALLER)
> >+ j = max_stack - i - 1;
> >+ ret = entry(ips[j], ui->thread, cb, arg);
>
> But in new code event if ips[j] == 0 an entry will be built, which causes
> a behavior changes user noticable:
>
> Before this patch:
>
>
> # perf report --no-children --stdio --call-graph=callee
> ...
> 3.38% a.out a.out [.] funcc
> |
> ---funcc
> |
> --2.70%-- funcb
> funca
> main
> __libc_start_main
> _start
>
> After this patch:
>
> # perf report --no-children --stdio --call-graph=callee
> ...
> 3.38% a.out a.out [.] funcc
> |
> ---funcc
> |
> |--2.70%-- funcb
> | funca
> | main
> | __libc_start_main
> | _start
> |
> --0.68%-- 0
>
>
> I'm not sure whether we can regard this behavior changing as a bugfix? I
> think
> there may be some reason the original code explicitly avoid creating an '0'
> entry.

I think callchain value being 0 is an error or marker for the end of
callchain. So it'd be better avoiding 0 entry.

But unfortunately, we have many 0 entries (and broken callchain after
them) with fp recording on optimized binaries. I think we should omit
those callchains.

Maybe something like this?



diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 5ef90be2a249..22642c5719ab 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1850,6 +1850,15 @@ static int thread__resolve_callchain_sample(struct thread *thread,
#endif
ip = chain->ips[j];

+ /* callchain value inside zero page means it's broken, stop */
+ if (ip < 4096) {
+ if (callchain_param.order == ORDER_CALLER) {
+ callchain_cursor_reset(&callchain_cursor);
+ continue;
+ } else
+ break;
+ }
+
err = add_callchain_ip(thread, parent, root_al, &cpumode, ip);

if (err)

2015-11-18 07:31:36

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder



On 2015/11/18 13:41, Namhyung Kim wrote:
> On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:
>>
>> On 2015/11/17 23:05, Jiri Olsa wrote:
>>> From: Jiri Olsa <[email protected]>
>>>
>>> As reported by Milian, currently for DWARF unwind (both libdw
>>> and libunwind) we display callchain in callee order only.
>>>
>>> Adding the support to follow callchain order setup to libunwind
>>> DWARF unwinder, so we could get following output for report:
>>>
>>> $ perf record --call-graph dwarf ls
>>> ...
>>> $ perf report --no-children --stdio
>>>
>>> 39.26% ls libc-2.21.so [.] __strcoll_l
>>> |
>>> ---__strcoll_l
>>> mpsort_with_tmp
>>> mpsort_with_tmp
>>> sort_files
>>> main
>>> __libc_start_main
>>> _start
>>> 0
>>>
>>> $ perf report -g caller --no-children --stdio
>>> ...
>>> 39.26% ls libc-2.21.so [.] __strcoll_l
>>> |
>>> ---0
>>> _start
>>> __libc_start_main
>>> main
>>> sort_files
>>> mpsort_with_tmp
>>> mpsort_with_tmp
>>> __strcoll_l
>>>
>>> Reported-by: Milian Wolff <[email protected]>
>>> Based-on-patch-by: Milian Wolff <[email protected]>
>>> Link: http://lkml.kernel.org/n/[email protected]
>>> Signed-off-by: Jiri Olsa <[email protected]>
>>> ---
>>> tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
>>> 1 file changed, 30 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
>>> index 0ae8844fe7a6..705e1c19f1ea 100644
>>> --- a/tools/perf/util/unwind-libunwind.c
>>> +++ b/tools/perf/util/unwind-libunwind.c
>> [SNIP]
>>
>>> - unw_get_reg(&c, UNW_REG_IP, &ip);
>>> - ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
>> In original code if ip == 0 entry() won't be called.
>>
>>> + if (callchain_param.order == ORDER_CALLER)
>>> + j = max_stack - i - 1;
>>> + ret = entry(ips[j], ui->thread, cb, arg);
>> But in new code event if ips[j] == 0 an entry will be built, which causes
>> a behavior changes user noticable:
>>
>> Before this patch:
>>
>>
>> # perf report --no-children --stdio --call-graph=callee
>> ...
>> 3.38% a.out a.out [.] funcc
>> |
>> ---funcc
>> |
>> --2.70%-- funcb
>> funca
>> main
>> __libc_start_main
>> _start
>>
>> After this patch:
>>
>> # perf report --no-children --stdio --call-graph=callee
>> ...
>> 3.38% a.out a.out [.] funcc
>> |
>> ---funcc
>> |
>> |--2.70%-- funcb
>> | funca
>> | main
>> | __libc_start_main
>> | _start
>> |
>> --0.68%-- 0
>>
>>
>> I'm not sure whether we can regard this behavior changing as a bugfix? I
>> think
>> there may be some reason the original code explicitly avoid creating an '0'
>> entry.
> I think callchain value being 0 is an error or marker for the end of
> callchain. So it'd be better avoiding 0 entry.
>
> But unfortunately, we have many 0 entries (and broken callchain after
> them) with fp recording on optimized binaries. I think we should omit
> those callchains.
>
> Maybe something like this?
>
>
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 5ef90be2a249..22642c5719ab 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -1850,6 +1850,15 @@ static int thread__resolve_callchain_sample(struct thread *thread,
> #endif
> ip = chain->ips[j];
>
> + /* callchain value inside zero page means it's broken, stop */
> + if (ip < 4096) {
> + if (callchain_param.order == ORDER_CALLER) {
> + callchain_cursor_reset(&callchain_cursor);
> + continue;
> + } else
> + break;
> + }
> +
> err = add_callchain_ip(thread, parent, root_al, &cpumode, ip);
>
> if (err)

Then we totally get rid of 0 entries, but how can we explain
the sum of overhead of different branches?

Is it possible to explicitly tell user the place where perf
failed to unwind call stack? For example:

3.38% a.out a.out [.] funcc
|
---funcc
|
|--2.70%-- funcb
| funca
| main
| __libc_start_main
| _start
|
--0.68%-- (unwind failure)


Thank you.

2015-11-18 07:52:53

by Jiri Olsa

[permalink] [raw]
Subject: [PATCHv2 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:
>
>
> On 2015/11/17 23:05, Jiri Olsa wrote:
> >From: Jiri Olsa <[email protected]>
> >
> >As reported by Milian, currently for DWARF unwind (both libdw
> >and libunwind) we display callchain in callee order only.
> >
> >Adding the support to follow callchain order setup to libunwind
> >DWARF unwinder, so we could get following output for report:
> >
> > $ perf record --call-graph dwarf ls
> > ...
> > $ perf report --no-children --stdio
> >
> > 39.26% ls libc-2.21.so [.] __strcoll_l
> > |
> > ---__strcoll_l
> > mpsort_with_tmp
> > mpsort_with_tmp
> > sort_files
> > main
> > __libc_start_main
> > _start
> > 0
> >
> > $ perf report -g caller --no-children --stdio
> > ...
> > 39.26% ls libc-2.21.so [.] __strcoll_l
> > |
> > ---0
> > _start
> > __libc_start_main
> > main
> > sort_files
> > mpsort_with_tmp
> > mpsort_with_tmp
> > __strcoll_l
> >
> >Reported-by: Milian Wolff <[email protected]>
> >Based-on-patch-by: Milian Wolff <[email protected]>
> >Link: http://lkml.kernel.org/n/[email protected]
> >Signed-off-by: Jiri Olsa <[email protected]>
> >---
> > tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
> > 1 file changed, 30 insertions(+), 17 deletions(-)
> >
> >diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
> >index 0ae8844fe7a6..705e1c19f1ea 100644
> >--- a/tools/perf/util/unwind-libunwind.c
> >+++ b/tools/perf/util/unwind-libunwind.c
>
> [SNIP]
>
> >- unw_get_reg(&c, UNW_REG_IP, &ip);
> >- ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
>
> In original code if ip == 0 entry() won't be called.
>
> >+ if (callchain_param.order == ORDER_CALLER)
> >+ j = max_stack - i - 1;
> >+ ret = entry(ips[j], ui->thread, cb, arg);

ouch.. the intention was not to breake current behaviour ;-)

v2 is attached, it's also in perf/callchain_2 branch

thanks,
jirka


---
As reported by Milian, currently for DWARF unwind (both libdw
and libunwind) we display callchain in callee order only.

Adding the support to follow callchain order setup to libunwind
DWARF unwinder, so we could get following output for report:

$ perf record --call-graph dwarf ls
...
$ perf report --no-children --stdio

39.26% ls libc-2.21.so [.] __strcoll_l
|
---__strcoll_l
mpsort_with_tmp
mpsort_with_tmp
sort_files
main
__libc_start_main
_start
0

$ perf report -g caller --no-children --stdio
...
39.26% ls libc-2.21.so [.] __strcoll_l
|
---0
_start
__libc_start_main
main
sort_files
mpsort_with_tmp
mpsort_with_tmp
__strcoll_l

Reported-by: Milian Wolff <[email protected]>
Based-on-patch-by: Milian Wolff <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Jiri Olsa <[email protected]>
---
tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 0ae8844fe7a6..3c258a0e4092 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -615,34 +615,47 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
void *arg, int max_stack)
{
u64 val;
+ unw_word_t ips[max_stack];
unw_addr_space_t addr_space;
unw_cursor_t c;
- int ret;
+ int ret, i = 0;

ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP);
if (ret)
return ret;

- ret = entry(val, ui->thread, cb, arg);
- if (ret)
- return -ENOMEM;
+ ips[i++] = (unw_word_t) val;

- if (--max_stack == 0)
- return 0;
-
- addr_space = thread__priv(ui->thread);
- if (addr_space == NULL)
- return -1;
+ /*
+ * If we need more than one entry, do the DWARF
+ * unwind itself.
+ */
+ if (max_stack - 1 > 0) {
+ addr_space = thread__priv(ui->thread);
+ if (addr_space == NULL)
+ return -1;
+
+ ret = unw_init_remote(&c, addr_space, ui);
+ if (ret)
+ display_error(ret);
+
+ while (!ret && (unw_step(&c) > 0) && i < max_stack) {
+ unw_get_reg(&c, UNW_REG_IP, &ips[i]);
+ ++i;
+ }

- ret = unw_init_remote(&c, addr_space, ui);
- if (ret)
- display_error(ret);
+ max_stack = i;
+ }

- while (!ret && (unw_step(&c) > 0) && max_stack--) {
- unw_word_t ip;
+ /*
+ * Display what we got based on the order setup.
+ */
+ for (i = 0; i < max_stack && !ret; i++) {
+ int j = i;

- unw_get_reg(&c, UNW_REG_IP, &ip);
- ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
+ if (callchain_param.order == ORDER_CALLER)
+ j = max_stack - i - 1;
+ ret = ips[j] ? entry(ips[j], ui->thread, cb, arg) : 0;
}

return ret;
--
2.4.3

2015-11-18 07:54:36

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:

SNIP

> 3.38% a.out a.out [.] funcc
> |
> ---funcc
> |
> |--2.70%-- funcb
> | funca
> | main
> | __libc_start_main
> | _start
> |
> --0.68%-- 0
>
>
> I'm not sure whether we can regard this behavior changing as a bugfix? I
> think
> there may be some reason the original code explicitly avoid creating an '0'
> entry.
>
> Then I tried to find why perf can't get call frame on my case, and
> I guess there's something wrong whe dealing with 'call' command, because
> the instruction on it I can't get callchain from libunwind is a 'callq':
>
> ...
> 4005bf: be 00 00 00 00 mov $0x0,%esi
> 4005c4: 48 89 c7 mov %rax,%rdi
> 4005c7: e8 74 fe ff ff callq 400440 <gettimeofday@plt>
> us2 = tv2.tv_sec * 1000000 + tv2.tv_usec;
> 4005cc: 48 8b 04 24 mov (%rsp),%rax
> ...
>
> But this is another problem, we can discuss it in a new thread.

so the problem is you dont see the gettimeofday call at the end?

could you share the test code?

thanks,
jirka

2015-11-18 08:03:34

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder



On 2015/11/18 15:54, Jiri Olsa wrote:
> On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:
>
> SNIP
>
>> 3.38% a.out a.out [.] funcc
>> |
>> ---funcc
>> |
>> |--2.70%-- funcb
>> | funca
>> | main
>> | __libc_start_main
>> | _start
>> |
>> --0.68%-- 0
>>
>>
>> I'm not sure whether we can regard this behavior changing as a bugfix? I
>> think
>> there may be some reason the original code explicitly avoid creating an '0'
>> entry.
>>
>> Then I tried to find why perf can't get call frame on my case, and
>> I guess there's something wrong whe dealing with 'call' command, because
>> the instruction on it I can't get callchain from libunwind is a 'callq':
>>
>> ...
>> 4005bf: be 00 00 00 00 mov $0x0,%esi
>> 4005c4: 48 89 c7 mov %rax,%rdi
>> 4005c7: e8 74 fe ff ff callq 400440 <gettimeofday@plt>
>> us2 = tv2.tv_sec * 1000000 + tv2.tv_usec;
>> 4005cc: 48 8b 04 24 mov (%rsp),%rax
>> ...
>>
>> But this is another problem, we can discuss it in a new thread.
> so the problem is you dont see the gettimeofday call at the end?

No. The problem is when sample is taken at 'callq' perf is unable to
unwind correctly, even with dwarf and user stack.

> could you share the test code?


I have posted a detail analysis in [1] (rechecked, you are in cc-list).
Test code is here:

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

static volatile int x = 0;

int funcc(void)
{
struct timeval tv1, tv2;
unsigned long us1, us2;

gettimeofday(&tv1, NULL);

us1 = tv1.tv_sec * 1000000 + tv1.tv_usec;

while(1) {
x = x + 100;
gettimeofday(&tv2, NULL);
us2 = tv2.tv_sec * 1000000 + tv2.tv_usec;
if (us2 - us1 >= 3000000)
break;
}
return x;
}
int funcb(void) { return funcc();}
int funca(void) { return funcb();}
int main() { funca(); return 0;}

Thank you.

[1] http://lkml.kernel.org/r/[email protected]


>
> thanks,
> jirka

2015-11-18 08:12:11

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

On Wed, Nov 18, 2015 at 08:54:31AM +0100, Jiri Olsa wrote:
> On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:
>
> SNIP
>
> > 3.38% a.out a.out [.] funcc
> > |
> > ---funcc
> > |
> > |--2.70%-- funcb
> > | funca
> > | main
> > | __libc_start_main
> > | _start
> > |
> > --0.68%-- 0
> >
> >
> > I'm not sure whether we can regard this behavior changing as a bugfix? I
> > think
> > there may be some reason the original code explicitly avoid creating an '0'
> > entry.
> >
> > Then I tried to find why perf can't get call frame on my case, and
> > I guess there's something wrong whe dealing with 'call' command, because
> > the instruction on it I can't get callchain from libunwind is a 'callq':
> >
> > ...
> > 4005bf: be 00 00 00 00 mov $0x0,%esi
> > 4005c4: 48 89 c7 mov %rax,%rdi
> > 4005c7: e8 74 fe ff ff callq 400440 <gettimeofday@plt>
> > us2 = tv2.tv_sec * 1000000 + tv2.tv_usec;
> > 4005cc: 48 8b 04 24 mov (%rsp),%rax
> > ...
> >
> > But this is another problem, we can discuss it in a new thread.
>
> so the problem is you dont see the gettimeofday call at the end?
>
> could you share the test code?

just saw your other email on this issue.. let's continue there

jirka

2015-11-18 08:25:33

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

On Wed, Nov 18, 2015 at 02:41:14PM +0900, Namhyung Kim wrote:

SNIP

> > I'm not sure whether we can regard this behavior changing as a bugfix? I
> > think
> > there may be some reason the original code explicitly avoid creating an '0'
> > entry.
>
> I think callchain value being 0 is an error or marker for the end of
> callchain. So it'd be better avoiding 0 entry.
>
> But unfortunately, we have many 0 entries (and broken callchain after
> them) with fp recording on optimized binaries. I think we should omit
> those callchains.
>
> Maybe something like this?
>
>
>
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 5ef90be2a249..22642c5719ab 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -1850,6 +1850,15 @@ static int thread__resolve_callchain_sample(struct thread *thread,
> #endif
> ip = chain->ips[j];
>
> + /* callchain value inside zero page means it's broken, stop */
> + if (ip < 4096) {
> + if (callchain_param.order == ORDER_CALLER) {
> + callchain_cursor_reset(&callchain_cursor);

hum, do we want to throw away whatever we have till now?

jirka

2015-11-18 08:26:59

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

On Wed, Nov 18, 2015 at 02:41:14PM +0900, Namhyung Kim wrote:

SNIP

>
> I think callchain value being 0 is an error or marker for the end of
> callchain. So it'd be better avoiding 0 entry.
>
> But unfortunately, we have many 0 entries (and broken callchain after
> them) with fp recording on optimized binaries. I think we should omit
> those callchains.
>
> Maybe something like this?
>
>
>
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 5ef90be2a249..22642c5719ab 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -1850,6 +1850,15 @@ static int thread__resolve_callchain_sample(struct thread *thread,
> #endif
> ip = chain->ips[j];
>
> + /* callchain value inside zero page means it's broken, stop */
> + if (ip < 4096) {

you could use page_size in here

jirka

2015-11-18 09:25:37

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

On November 18, 2015 5:25:25 PM GMT+09:00, Jiri Olsa <[email protected]> wrote:
>On Wed, Nov 18, 2015 at 02:41:14PM +0900, Namhyung Kim wrote:
>
>SNIP
>
>> > I'm not sure whether we can regard this behavior changing as a
>bugfix? I
>> > think
>> > there may be some reason the original code explicitly avoid
>creating an '0'
>> > entry.
>>
>> I think callchain value being 0 is an error or marker for the end of
>> callchain. So it'd be better avoiding 0 entry.
>>
>> But unfortunately, we have many 0 entries (and broken callchain after
>> them) with fp recording on optimized binaries. I think we should
>omit
>> those callchains.
>>
>> Maybe something like this?
>>
>>
>>
>> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
>> index 5ef90be2a249..22642c5719ab 100644
>> --- a/tools/perf/util/machine.c
>> +++ b/tools/perf/util/machine.c
>> @@ -1850,6 +1850,15 @@ static int
>thread__resolve_callchain_sample(struct thread *thread,
>> #endif
>> ip = chain->ips[j];
>>
>> + /* callchain value inside zero page means it's broken, stop */
>> + if (ip < 4096) {
>> + if (callchain_param.order == ORDER_CALLER) {
>> + callchain_cursor_reset(&callchain_cursor);
>
>hum, do we want to throw away whatever we have till now?

For caller order, yes.

For callee order, everything after 0 value is garbage. So we need to discard any chains before the 0 for caller IMHO.

Thanks
Namhyung


--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2015-11-18 09:46:13

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCHv2 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder



On 2015/11/18 15:52, Jiri Olsa wrote:
> On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:
>>
>> On 2015/11/17 23:05, Jiri Olsa wrote:
>>> From: Jiri Olsa <[email protected]>
>>>
>>> As reported by Milian, currently for DWARF unwind (both libdw
>>> and libunwind) we display callchain in callee order only.
>>>
>>> Adding the support to follow callchain order setup to libunwind
>>> DWARF unwinder, so we could get following output for report:
>>>
>>> $ perf record --call-graph dwarf ls
>>> ...
>>> $ perf report --no-children --stdio
>>>
>>> 39.26% ls libc-2.21.so [.] __strcoll_l
>>> |
>>> ---__strcoll_l
>>> mpsort_with_tmp
>>> mpsort_with_tmp
>>> sort_files
>>> main
>>> __libc_start_main
>>> _start
>>> 0
>>>
>>> $ perf report -g caller --no-children --stdio
>>> ...
>>> 39.26% ls libc-2.21.so [.] __strcoll_l
>>> |
>>> ---0
>>> _start
>>> __libc_start_main
>>> main
>>> sort_files
>>> mpsort_with_tmp
>>> mpsort_with_tmp
>>> __strcoll_l
>>>
>>> Reported-by: Milian Wolff <[email protected]>
>>> Based-on-patch-by: Milian Wolff <[email protected]>
>>> Link: http://lkml.kernel.org/n/[email protected]
>>> Signed-off-by: Jiri Olsa <[email protected]>
>>> ---
>>> tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
>>> 1 file changed, 30 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
>>> index 0ae8844fe7a6..705e1c19f1ea 100644
>>> --- a/tools/perf/util/unwind-libunwind.c
>>> +++ b/tools/perf/util/unwind-libunwind.c
>> [SNIP]
>>
>>> - unw_get_reg(&c, UNW_REG_IP, &ip);
>>> - ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
>> In original code if ip == 0 entry() won't be called.
>>
>>> + if (callchain_param.order == ORDER_CALLER)
>>> + j = max_stack - i - 1;
>>> + ret = entry(ips[j], ui->thread, cb, arg);
> ouch.. the intention was not to breake current behaviour ;-)
>
> v2 is attached, it's also in perf/callchain_2 branch
>
> thanks,
> jirka
>
>
> ---
> As reported by Milian, currently for DWARF unwind (both libdw
> and libunwind) we display callchain in callee order only.
>
> Adding the support to follow callchain order setup to libunwind
> DWARF unwinder, so we could get following output for report:
>
> $ perf record --call-graph dwarf ls
> ...
> $ perf report --no-children --stdio
>
> 39.26% ls libc-2.21.so [.] __strcoll_l
> |
> ---__strcoll_l
> mpsort_with_tmp
> mpsort_with_tmp
> sort_files
> main
> __libc_start_main
> _start
> 0
>
> $ perf report -g caller --no-children --stdio
> ...
> 39.26% ls libc-2.21.so [.] __strcoll_l
> |
> ---0
> _start
> __libc_start_main
> main
> sort_files
> mpsort_with_tmp
> mpsort_with_tmp
> __strcoll_l
>
> Reported-by: Milian Wolff <[email protected]>
> Based-on-patch-by: Milian Wolff <[email protected]>
> Link: http://lkml.kernel.org/n/[email protected]
> Signed-off-by: Jiri Olsa <[email protected]>

This time behavior is not changed. Thank you.

Tested-by: Wang Nan <[email protected]>

2015-11-19 11:22:40

by Jiri Olsa

[permalink] [raw]
Subject: [PATCH 4/3] perf tools: Add callchain order support for libdw DWARF unwinder

On Tue, Nov 17, 2015 at 04:05:36PM +0100, Jiri Olsa wrote:

SNIP

> Tested on x86_64. The change is in generic code only,
> so it should not affect other archs. Still it would be
> nice to have some confirmation.. Wang Nan? ;-)
>
> It'd be nice to have this for libdw unwind as well,
> but it looks like it's out of reach for perf code.. Jan?
>
> Also available in:
> git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> perf/callchain_1

adding also libdw support.. test with 'make NO_LIBUNWIND=1'

Also available in:
git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
perf/callchain_3

thanks,
jirka


---
As reported by Milian, currently for DWARF unwind (both libdw
and libunwind) we display callchain in callee order only.

Adding the support to follow callchain order setup to libdw
DWARF unwinder, so we could get following output for report:

$ perf record --call-graph dwarf ls
...

$ perf report --no-children --stdio

13.63% ls [kernel.vmlinux] [k] __rb_insert_augmented
|
---__rb_insert_augmented
__vma_link_rb
vma_link
do_brk
vm_brk
load_elf_binary
search_binary_handler
do_execveat_common.isra.29
sys_execve
return_from_execve

$ perf report --stdio --no-children -g caller

13.63% ls [kernel.vmlinux] [k] __rb_insert_augmented
|
---return_from_execve
sys_execve
do_execveat_common.isra.29
search_binary_handler
load_elf_binary
vm_brk
do_brk
vma_link
__vma_link_rb
__rb_insert_augmented

Reported-by: Milian Wolff <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Jiri Olsa <[email protected]>
---
tools/perf/util/unwind-libdw.c | 53 ++++++++++++++++++++++++++++++------------
tools/perf/util/unwind-libdw.h | 2 ++
2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 2dcfe9a7c8d0..db8142ba7cb9 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include "event.h"
#include "perf_regs.h"
+#include "callchain.h"

static char *debuginfo_path;

@@ -52,25 +53,28 @@ static int report_module(u64 ip, struct unwind_info *ui)
return __report_module(&al, ip, ui);
}

+/*
+ * Store all entries within entries array,
+ * we will process it after we finish unwind.
+ */
static int entry(u64 ip, struct unwind_info *ui)

{
- struct unwind_entry e;
+ struct unwind_entry *e = &ui->entries[ui->idx++];
struct addr_location al;

if (__report_module(&al, ip, ui))
return -1;

- e.ip = ip;
- e.map = al.map;
- e.sym = al.sym;
+ e->ip = ip;
+ e->map = al.map;
+ e->sym = al.sym;

pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
al.sym ? al.sym->name : "''",
ip,
al.map ? al.map->map_ip(al.map, ip) : (u64) 0);
-
- return ui->cb(&e, ui->arg);
+ return 0;
}

static pid_t next_thread(Dwfl *dwfl, void *arg, void **thread_argp)
@@ -168,7 +172,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
struct perf_sample *data,
int max_stack)
{
- struct unwind_info ui = {
+ struct unwind_info *ui, ui_buf = {
.sample = data,
.thread = thread,
.machine = thread->mg->machine,
@@ -177,35 +181,54 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
.max_stack = max_stack,
};
Dwarf_Word ip;
- int err = -EINVAL;
+ int err = -EINVAL, i;

if (!data->user_regs.regs)
return -EINVAL;

- ui.dwfl = dwfl_begin(&offline_callbacks);
- if (!ui.dwfl)
+ ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack);
+ if (!ui)
+ return -ENOMEM;
+
+ *ui = ui_buf;
+
+ ui->dwfl = dwfl_begin(&offline_callbacks);
+ if (!ui->dwfl)
goto out;

err = perf_reg_value(&ip, &data->user_regs, PERF_REG_IP);
if (err)
goto out;

- err = report_module(ip, &ui);
+ err = report_module(ip, ui);
if (err)
goto out;

- if (!dwfl_attach_state(ui.dwfl, EM_NONE, thread->tid, &callbacks, &ui))
+ if (!dwfl_attach_state(ui->dwfl, EM_NONE, thread->tid, &callbacks, ui))
goto out;

- err = dwfl_getthread_frames(ui.dwfl, thread->tid, frame_callback, &ui);
+ err = dwfl_getthread_frames(ui->dwfl, thread->tid, frame_callback, ui);

- if (err && !ui.max_stack)
+ if (err && !ui->max_stack)
err = 0;

+ /*
+ * Display what we got based on the order setup.
+ */
+ for (i = 0; i < ui->idx && !err; i++) {
+ int j = i;
+
+ if (callchain_param.order == ORDER_CALLER)
+ j = ui->idx - i - 1;
+
+ err = ui->entries[j].ip ? ui->cb(&ui->entries[j], ui->arg) : 0;
+ }
+
out:
if (err)
pr_debug("unwind: failed with '%s'\n", dwfl_errmsg(-1));

- dwfl_end(ui.dwfl);
+ dwfl_end(ui->dwfl);
+ free(ui);
return 0;
}
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 417a1426f3ad..58328669ed16 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -16,6 +16,8 @@ struct unwind_info {
unwind_entry_cb_t cb;
void *arg;
int max_stack;
+ int idx;
+ struct unwind_entry entries[];
};

#endif /* __PERF_UNWIND_LIBDW_H */
--
2.4.3

2015-11-19 12:11:16

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 4/3] perf tools: Add callchain order support for libdw DWARF unwinder



On 2015/11/19 19:22, Jiri Olsa wrote:
> On Tue, Nov 17, 2015 at 04:05:36PM +0100, Jiri Olsa wrote:
>
> SNIP
>
>> Tested on x86_64. The change is in generic code only,
>> so it should not affect other archs. Still it would be
>> nice to have some confirmation.. Wang Nan? ;-)
>>
>> It'd be nice to have this for libdw unwind as well,
>> but it looks like it's out of reach for perf code.. Jan?
>>
>> Also available in:
>> git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
>> perf/callchain_1
> adding also libdw support.. test with 'make NO_LIBUNWIND=1'
>
> Also available in:
> git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> perf/callchain_3
>
> thanks,
> jirka
>
>
> ---
> As reported by Milian, currently for DWARF unwind (both libdw
> and libunwind) we display callchain in callee order only.
>
> Adding the support to follow callchain order setup to libdw
> DWARF unwinder, so we could get following output for report:
>
> $ perf record --call-graph dwarf ls
> ...
>
> $ perf report --no-children --stdio
>
> 13.63% ls [kernel.vmlinux] [k] __rb_insert_augmented
> |
> ---__rb_insert_augmented
> __vma_link_rb
> vma_link
> do_brk
> vm_brk
> load_elf_binary
> search_binary_handler
> do_execveat_common.isra.29
> sys_execve
> return_from_execve
>
> $ perf report --stdio --no-children -g caller
>
> 13.63% ls [kernel.vmlinux] [k] __rb_insert_augmented
> |
> ---return_from_execve
> sys_execve
> do_execveat_common.isra.29
> search_binary_handler
> load_elf_binary
> vm_brk
> do_brk
> vma_link
> __vma_link_rb
> __rb_insert_augmented
>
> Reported-by: Milian Wolff <[email protected]>
> Link: http://lkml.kernel.org/n/[email protected]
> Signed-off-by: Jiri Olsa <[email protected]>

Tested-by: Wang Nan <[email protected]>

And I suggest you put a user space call graph to the commit message
to show the result of this patch. What you show now is not related
to this patch, right?

In addition, just for your information, I find libdw can unwind call
chain I described yesterday in [1].

Thank you.

[1] https://lkml.kernel.org/g/[email protected]

2015-11-19 12:18:51

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 4/3] perf tools: Add callchain order support for libdw DWARF unwinder



On 2015/11/19 20:10, Wangnan (F) wrote:
>
>
> On 2015/11/19 19:22, Jiri Olsa wrote:
>> On Tue, Nov 17, 2015 at 04:05:36PM +0100, Jiri Olsa wrote:
>>

[SNIP]

> In addition, just for your information, I find libdw can unwind call
> chain I described yesterday in [1].

Please forget it. The problem is still their even with libdw. I forgot
to add -fomit-frame-pointer.

Thank you.
>
> Thank you.
>
> [1] https://lkml.kernel.org/g/[email protected]

2015-11-19 13:01:27

by Jiri Olsa

[permalink] [raw]
Subject: [PATCHv2 4/3] perf tools: Add callchain order support for libdw DWARF unwinder

On Thu, Nov 19, 2015 at 08:10:45PM +0800, Wangnan (F) wrote:

SNIP

> > load_elf_binary
> > vm_brk
> > do_brk
> > vma_link
> > __vma_link_rb
> > __rb_insert_augmented
> >
> >Reported-by: Milian Wolff <[email protected]>
> >Link: http://lkml.kernel.org/n/[email protected]
> >Signed-off-by: Jiri Olsa <[email protected]>
>
> Tested-by: Wang Nan <[email protected]>
>
> And I suggest you put a user space call graph to the commit message
> to show the result of this patch. What you show now is not related
> to this patch, right?

oopsie.. sry ;-) attached v2

thanks,
jirka


---
As reported by Milian, currently for DWARF unwind (both libdw
and libunwind) we display callchain in callee order only.

Adding the support to follow callchain order setup to libdw
DWARF unwinder, so we could get following output for report:

$ perf record --call-graph dwarf ls
...

$ perf report --no-children --stdio

21.12% ls libc-2.21.so [.] __strcoll_l
|
---__strcoll_l
mpsort_with_tmp
mpsort_with_tmp
mpsort_with_tmp
sort_files
main
__libc_start_main
_start

$ perf report --stdio --no-children -g caller

21.12% ls libc-2.21.so [.] __strcoll_l
|
---_start
__libc_start_main
main
sort_files
mpsort_with_tmp
mpsort_with_tmp
mpsort_with_tmp
__strcoll_l

Tested-by: Wang Nan <[email protected]>
Reported-by: Milian Wolff <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Jiri Olsa <[email protected]>
---
tools/perf/util/unwind-libdw.c | 53 ++++++++++++++++++++++++++++++------------
tools/perf/util/unwind-libdw.h | 2 ++
2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 2dcfe9a7c8d0..db8142ba7cb9 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include "event.h"
#include "perf_regs.h"
+#include "callchain.h"

static char *debuginfo_path;

@@ -52,25 +53,28 @@ static int report_module(u64 ip, struct unwind_info *ui)
return __report_module(&al, ip, ui);
}

+/*
+ * Store all entries within entries array,
+ * we will process it after we finish unwind.
+ */
static int entry(u64 ip, struct unwind_info *ui)

{
- struct unwind_entry e;
+ struct unwind_entry *e = &ui->entries[ui->idx++];
struct addr_location al;

if (__report_module(&al, ip, ui))
return -1;

- e.ip = ip;
- e.map = al.map;
- e.sym = al.sym;
+ e->ip = ip;
+ e->map = al.map;
+ e->sym = al.sym;

pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
al.sym ? al.sym->name : "''",
ip,
al.map ? al.map->map_ip(al.map, ip) : (u64) 0);
-
- return ui->cb(&e, ui->arg);
+ return 0;
}

static pid_t next_thread(Dwfl *dwfl, void *arg, void **thread_argp)
@@ -168,7 +172,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
struct perf_sample *data,
int max_stack)
{
- struct unwind_info ui = {
+ struct unwind_info *ui, ui_buf = {
.sample = data,
.thread = thread,
.machine = thread->mg->machine,
@@ -177,35 +181,54 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
.max_stack = max_stack,
};
Dwarf_Word ip;
- int err = -EINVAL;
+ int err = -EINVAL, i;

if (!data->user_regs.regs)
return -EINVAL;

- ui.dwfl = dwfl_begin(&offline_callbacks);
- if (!ui.dwfl)
+ ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack);
+ if (!ui)
+ return -ENOMEM;
+
+ *ui = ui_buf;
+
+ ui->dwfl = dwfl_begin(&offline_callbacks);
+ if (!ui->dwfl)
goto out;

err = perf_reg_value(&ip, &data->user_regs, PERF_REG_IP);
if (err)
goto out;

- err = report_module(ip, &ui);
+ err = report_module(ip, ui);
if (err)
goto out;

- if (!dwfl_attach_state(ui.dwfl, EM_NONE, thread->tid, &callbacks, &ui))
+ if (!dwfl_attach_state(ui->dwfl, EM_NONE, thread->tid, &callbacks, ui))
goto out;

- err = dwfl_getthread_frames(ui.dwfl, thread->tid, frame_callback, &ui);
+ err = dwfl_getthread_frames(ui->dwfl, thread->tid, frame_callback, ui);

- if (err && !ui.max_stack)
+ if (err && !ui->max_stack)
err = 0;

+ /*
+ * Display what we got based on the order setup.
+ */
+ for (i = 0; i < ui->idx && !err; i++) {
+ int j = i;
+
+ if (callchain_param.order == ORDER_CALLER)
+ j = ui->idx - i - 1;
+
+ err = ui->entries[j].ip ? ui->cb(&ui->entries[j], ui->arg) : 0;
+ }
+
out:
if (err)
pr_debug("unwind: failed with '%s'\n", dwfl_errmsg(-1));

- dwfl_end(ui.dwfl);
+ dwfl_end(ui->dwfl);
+ free(ui);
return 0;
}
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 417a1426f3ad..58328669ed16 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -16,6 +16,8 @@ struct unwind_info {
unwind_entry_cb_t cb;
void *arg;
int max_stack;
+ int idx;
+ struct unwind_entry entries[];
};

#endif /* __PERF_UNWIND_LIBDW_H */
--
2.4.3

2015-11-19 19:10:45

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 0/3] perf tools DWARF libunwind: Add callchain order support

Em Tue, Nov 17, 2015 at 04:05:36PM +0100, Jiri Olsa escreveu:
> hi,
> as reported by Milian, currently for DWARF unwind (both libdw
> and libunwind) we display callchain in callee order only.

Applied the latest versions, please check my perf/core branch to check
that all is there as you expect.

- Arnaldo

2015-11-20 08:38:12

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH 0/3] perf tools DWARF libunwind: Add callchain order support

On Thu, Nov 19, 2015 at 04:10:37PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, Nov 17, 2015 at 04:05:36PM +0100, Jiri Olsa escreveu:
> > hi,
> > as reported by Milian, currently for DWARF unwind (both libdw
> > and libunwind) we display callchain in callee order only.
>
> Applied the latest versions, please check my perf/core branch to check
> that all is there as you expect.

looks ok, thanks

jirka

2015-11-22 15:27:46

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf tools: Add callchain order support for libunwind DWARF unwinder

Hello,

Sorry for late reply, I missed this in the mailbox..

On Wed, Nov 18, 2015 at 03:26:54PM +0800, Wangnan (F) wrote:
> On 2015/11/18 13:41, Namhyung Kim wrote:
> >On Wed, Nov 18, 2015 at 12:13:08PM +0800, Wangnan (F) wrote:
> >>
> >>On 2015/11/17 23:05, Jiri Olsa wrote:
> >>>From: Jiri Olsa <[email protected]>
> >>>
> >>>As reported by Milian, currently for DWARF unwind (both libdw
> >>>and libunwind) we display callchain in callee order only.
> >>>
> >>>Adding the support to follow callchain order setup to libunwind
> >>>DWARF unwinder, so we could get following output for report:
> >>>
> >>> $ perf record --call-graph dwarf ls
> >>> ...
> >>> $ perf report --no-children --stdio
> >>>
> >>> 39.26% ls libc-2.21.so [.] __strcoll_l
> >>> |
> >>> ---__strcoll_l
> >>> mpsort_with_tmp
> >>> mpsort_with_tmp
> >>> sort_files
> >>> main
> >>> __libc_start_main
> >>> _start
> >>> 0
> >>>
> >>> $ perf report -g caller --no-children --stdio
> >>> ...
> >>> 39.26% ls libc-2.21.so [.] __strcoll_l
> >>> |
> >>> ---0
> >>> _start
> >>> __libc_start_main
> >>> main
> >>> sort_files
> >>> mpsort_with_tmp
> >>> mpsort_with_tmp
> >>> __strcoll_l
> >>>
> >>>Reported-by: Milian Wolff <[email protected]>
> >>>Based-on-patch-by: Milian Wolff <[email protected]>
> >>>Link: http://lkml.kernel.org/n/[email protected]
> >>>Signed-off-by: Jiri Olsa <[email protected]>
> >>>---
> >>> tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
> >>> 1 file changed, 30 insertions(+), 17 deletions(-)
> >>>
> >>>diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
> >>>index 0ae8844fe7a6..705e1c19f1ea 100644
> >>>--- a/tools/perf/util/unwind-libunwind.c
> >>>+++ b/tools/perf/util/unwind-libunwind.c
> >>[SNIP]
> >>
> >>>- unw_get_reg(&c, UNW_REG_IP, &ip);
> >>>- ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
> >>In original code if ip == 0 entry() won't be called.
> >>
> >>>+ if (callchain_param.order == ORDER_CALLER)
> >>>+ j = max_stack - i - 1;
> >>>+ ret = entry(ips[j], ui->thread, cb, arg);
> >>But in new code event if ips[j] == 0 an entry will be built, which causes
> >>a behavior changes user noticable:
> >>
> >>Before this patch:
> >>
> >>
> >># perf report --no-children --stdio --call-graph=callee
> >>...
> >> 3.38% a.out a.out [.] funcc
> >> |
> >> ---funcc
> >> |
> >> --2.70%-- funcb
> >> funca
> >> main
> >> __libc_start_main
> >> _start
> >>
> >>After this patch:
> >>
> >># perf report --no-children --stdio --call-graph=callee
> >>...
> >> 3.38% a.out a.out [.] funcc
> >> |
> >> ---funcc
> >> |
> >> |--2.70%-- funcb
> >> | funca
> >> | main
> >> | __libc_start_main
> >> | _start
> >> |
> >> --0.68%-- 0
> >>
> >>
> >>I'm not sure whether we can regard this behavior changing as a bugfix? I
> >>think
> >>there may be some reason the original code explicitly avoid creating an '0'
> >>entry.
> >I think callchain value being 0 is an error or marker for the end of
> >callchain. So it'd be better avoiding 0 entry.
> >
> >But unfortunately, we have many 0 entries (and broken callchain after
> >them) with fp recording on optimized binaries. I think we should omit
> >those callchains.
> >
> >Maybe something like this?
> >
> >
> >diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> >index 5ef90be2a249..22642c5719ab 100644
> >--- a/tools/perf/util/machine.c
> >+++ b/tools/perf/util/machine.c
> >@@ -1850,6 +1850,15 @@ static int thread__resolve_callchain_sample(struct thread *thread,
> > #endif
> > ip = chain->ips[j];
> >+ /* callchain value inside zero page means it's broken, stop */
> >+ if (ip < 4096) {
> >+ if (callchain_param.order == ORDER_CALLER) {
> >+ callchain_cursor_reset(&callchain_cursor);
> >+ continue;
> >+ } else
> >+ break;
> >+ }
> >+
> > err = add_callchain_ip(thread, parent, root_al, &cpumode, ip);
> > if (err)
>
> Then we totally get rid of 0 entries, but how can we explain
> the sum of overhead of different branches?
>
> Is it possible to explicitly tell user the place where perf
> failed to unwind call stack? For example:
>
> 3.38% a.out a.out [.] funcc
> |
> ---funcc
> |
> |--2.70%-- funcb
> | funca
> | main
> | __libc_start_main
> | _start
> |
> --0.68%-- (unwind failure)

Hmm.. we have something similar in the fractal callchain mode, but it
doesn't say about the failure.

I think it's hard for perf to know whether a given callchain is broken
or not. It seems that correct callchains end with 0 entry, but broken
chains all can have 0 entries in the middle.

Thanks,
Namhyung

2015-11-22 19:19:11

by Milian Wolff

[permalink] [raw]
Subject: Re: [PATCH 0/3] perf tools DWARF libunwind: Add callchain order support

On Donnerstag, 19. November 2015 16:10:37 CET Arnaldo Carvalho de Melo wrote:
> Em Tue, Nov 17, 2015 at 04:05:36PM +0100, Jiri Olsa escreveu:
> > hi,
> > as reported by Milian, currently for DWARF unwind (both libdw
> > and libunwind) we display callchain in callee order only.
>
> Applied the latest versions, please check my perf/core branch to check
> that all is there as you expect.

Sorry for the late reply. I've just tested the current code in perf/core -
works like a charm! Many thanks to Jiri for improving my initial take on this
problem and also getting it fixed for libdw. Much appreciated!

Cheers

--
Milian Wolff | [email protected] | Software Engineer
KDAB (Deutschland) GmbH&Co KG, a KDAB Group company
Tel: +49-30-521325470
KDAB - The Qt Experts


Attachments:
smime.p7s (5.76 kB)
Subject: [tip:perf/core] perf callchain: Move initial entry call into get_entries function

Commit-ID: b26b218a1e9c5815cb8964e180b7fba3cd9bd509
Gitweb: http://git.kernel.org/tip/b26b218a1e9c5815cb8964e180b7fba3cd9bd509
Author: Jiri Olsa <[email protected]>
AuthorDate: Tue, 17 Nov 2015 16:05:37 +0100
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 23 Nov 2015 18:30:10 -0300

perf callchain: Move initial entry call into get_entries function

Moving initial entry call into get_entries function so all entries
processing is on one place. It will be useful for next change that adds
ordering logic.

Signed-off-by: Jiri Olsa <[email protected]>
Tested-by: Milian Wolff <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Wang Nan <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/util/unwind-libunwind.c | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index c83832b..0ae8844 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -614,10 +614,22 @@ void unwind__finish_access(struct thread *thread)
static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
void *arg, int max_stack)
{
+ u64 val;
unw_addr_space_t addr_space;
unw_cursor_t c;
int ret;

+ ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP);
+ if (ret)
+ return ret;
+
+ ret = entry(val, ui->thread, cb, arg);
+ if (ret)
+ return -ENOMEM;
+
+ if (--max_stack == 0)
+ return 0;
+
addr_space = thread__priv(ui->thread);
if (addr_space == NULL)
return -1;
@@ -640,24 +652,17 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
struct thread *thread,
struct perf_sample *data, int max_stack)
{
- u64 ip;
struct unwind_info ui = {
.sample = data,
.thread = thread,
.machine = thread->mg->machine,
};
- int ret;

if (!data->user_regs.regs)
return -EINVAL;

- ret = perf_reg_value(&ip, &data->user_regs, PERF_REG_IP);
- if (ret)
- return ret;
-
- ret = entry(ip, thread, cb, arg);
- if (ret)
- return -ENOMEM;
+ if (max_stack <= 0)
+ return -EINVAL;

- return --max_stack > 0 ? get_entries(&ui, cb, arg, max_stack) : 0;
+ return get_entries(&ui, cb, arg, max_stack);
}

Subject: [tip:perf/core] perf callchain: Add order support for libunwind DWARF unwinder

Commit-ID: cb1dc22dce6e54dbd1eac213c9216e1aa57084da
Gitweb: http://git.kernel.org/tip/cb1dc22dce6e54dbd1eac213c9216e1aa57084da
Author: Jiri Olsa <[email protected]>
AuthorDate: Wed, 18 Nov 2015 08:52:47 +0100
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 23 Nov 2015 18:30:41 -0300

perf callchain: Add order support for libunwind DWARF unwinder

As reported by Milian, currently for DWARF unwind (both libdw and
libunwind) we display callchain in callee order only.

Adding the support to follow callchain order setup to libunwind DWARF
unwinder, so we could get following output for report:

$ perf record --call-graph dwarf ls
...
$ perf report --no-children --stdio

39.26% ls libc-2.21.so [.] __strcoll_l
|
---__strcoll_l
mpsort_with_tmp
mpsort_with_tmp
sort_files
main
__libc_start_main
_start
0

$ perf report -g caller --no-children --stdio
...
39.26% ls libc-2.21.so [.] __strcoll_l
|
---0
_start
__libc_start_main
main
sort_files
mpsort_with_tmp
mpsort_with_tmp
__strcoll_l

Based-on-patch-by: Milian Wolff <[email protected]>
Reported-and-Tested-by: Milian Wolff <[email protected]>
Signed-off-by: Jiri Olsa <[email protected]>
Tested-by: Arnaldo Carvalho de Melo <[email protected]>
Tested-by: Wang Nan <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Wang Nan <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++--------------
1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 0ae8844..3c258a0 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -615,34 +615,47 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
void *arg, int max_stack)
{
u64 val;
+ unw_word_t ips[max_stack];
unw_addr_space_t addr_space;
unw_cursor_t c;
- int ret;
+ int ret, i = 0;

ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP);
if (ret)
return ret;

- ret = entry(val, ui->thread, cb, arg);
- if (ret)
- return -ENOMEM;
+ ips[i++] = (unw_word_t) val;

- if (--max_stack == 0)
- return 0;
-
- addr_space = thread__priv(ui->thread);
- if (addr_space == NULL)
- return -1;
+ /*
+ * If we need more than one entry, do the DWARF
+ * unwind itself.
+ */
+ if (max_stack - 1 > 0) {
+ addr_space = thread__priv(ui->thread);
+ if (addr_space == NULL)
+ return -1;
+
+ ret = unw_init_remote(&c, addr_space, ui);
+ if (ret)
+ display_error(ret);
+
+ while (!ret && (unw_step(&c) > 0) && i < max_stack) {
+ unw_get_reg(&c, UNW_REG_IP, &ips[i]);
+ ++i;
+ }

- ret = unw_init_remote(&c, addr_space, ui);
- if (ret)
- display_error(ret);
+ max_stack = i;
+ }

- while (!ret && (unw_step(&c) > 0) && max_stack--) {
- unw_word_t ip;
+ /*
+ * Display what we got based on the order setup.
+ */
+ for (i = 0; i < max_stack && !ret; i++) {
+ int j = i;

- unw_get_reg(&c, UNW_REG_IP, &ip);
- ret = ip ? entry(ip, ui->thread, cb, arg) : 0;
+ if (callchain_param.order == ORDER_CALLER)
+ j = max_stack - i - 1;
+ ret = ips[j] ? entry(ips[j], ui->thread, cb, arg) : 0;
}

return ret;

Subject: [tip:perf/core] perf test: Add callchain order setup for DWARF unwinder test

Commit-ID: 8dc0564d809e3903834950e2d12f6d1d2fcff708
Gitweb: http://git.kernel.org/tip/8dc0564d809e3903834950e2d12f6d1d2fcff708
Author: Jiri Olsa <[email protected]>
AuthorDate: Tue, 17 Nov 2015 16:05:39 +0100
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 23 Nov 2015 18:31:01 -0300

perf test: Add callchain order setup for DWARF unwinder test

Adding callchain order setup for DWARF unwinder test. The test now runs
unwinder for both callee and caller orders.

Signed-off-by: Jiri Olsa <[email protected]>
Tested-by: Milian Wolff <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Wang Nan <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/tests/dwarf-unwind.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c
index 01f0b61..b2357e8 100644
--- a/tools/perf/tests/dwarf-unwind.c
+++ b/tools/perf/tests/dwarf-unwind.c
@@ -51,6 +51,12 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
"krava_1",
"test__dwarf_unwind"
};
+ /*
+ * The funcs[MAX_STACK] array index, based on the
+ * callchain order setup.
+ */
+ int idx = callchain_param.order == ORDER_CALLER ?
+ MAX_STACK - *cnt - 1 : *cnt;

if (*cnt >= MAX_STACK) {
pr_debug("failed: crossed the max stack value %d\n", MAX_STACK);
@@ -63,8 +69,10 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
return -1;
}

- pr_debug("got: %s 0x%" PRIx64 "\n", symbol, entry->ip);
- return strcmp((const char *) symbol, funcs[(*cnt)++]);
+ (*cnt)++;
+ pr_debug("got: %s 0x%" PRIx64 ", expecting %s\n",
+ symbol, entry->ip, funcs[idx]);
+ return strcmp((const char *) symbol, funcs[idx]);
}

__attribute__ ((noinline))
@@ -105,8 +113,16 @@ static int compare(void *p1, void *p2)
/* Any possible value should be 'thread' */
struct thread *thread = *(struct thread **)p1;

- if (global_unwind_retval == -INT_MAX)
+ if (global_unwind_retval == -INT_MAX) {
+ /* Call unwinder twice for both callchain orders. */
+ callchain_param.order = ORDER_CALLER;
+
global_unwind_retval = unwind_thread(thread);
+ if (!global_unwind_retval) {
+ callchain_param.order = ORDER_CALLEE;
+ global_unwind_retval = unwind_thread(thread);
+ }
+ }

return p1 - p2;
}

Subject: [tip:perf/core] perf callchain: Add order support for libdw DWARF unwinder

Commit-ID: 8bd508b001629a5d836987d9a0702a6bfc4fc705
Gitweb: http://git.kernel.org/tip/8bd508b001629a5d836987d9a0702a6bfc4fc705
Author: Jiri Olsa <[email protected]>
AuthorDate: Thu, 19 Nov 2015 14:01:19 +0100
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 23 Nov 2015 18:31:13 -0300

perf callchain: Add order support for libdw DWARF unwinder

As reported by Milian, currently for DWARF unwind (both libdw and
libunwind) we display callchain in callee order only.

Adding the support to follow callchain order setup to libdw DWARF
unwinder, so we could get following output for report:

$ perf record --call-graph dwarf ls
...

$ perf report --no-children --stdio

21.12% ls libc-2.21.so [.] __strcoll_l
|
---__strcoll_l
mpsort_with_tmp
mpsort_with_tmp
mpsort_with_tmp
sort_files
main
__libc_start_main
_start

$ perf report --stdio --no-children -g caller

21.12% ls libc-2.21.so [.] __strcoll_l
|
---_start
__libc_start_main
main
sort_files
mpsort_with_tmp
mpsort_with_tmp
mpsort_with_tmp
__strcoll_l

Reported-and-Tested-by: Milian Wolff <[email protected]>
Signed-off-by: Jiri Olsa <[email protected]>
Tested-by: Wang Nan <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Jan Kratochvil <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/util/unwind-libdw.c | 53 ++++++++++++++++++++++++++++++------------
tools/perf/util/unwind-libdw.h | 2 ++
2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 2dcfe9a..db8142b 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include "event.h"
#include "perf_regs.h"
+#include "callchain.h"

static char *debuginfo_path;

@@ -52,25 +53,28 @@ static int report_module(u64 ip, struct unwind_info *ui)
return __report_module(&al, ip, ui);
}

+/*
+ * Store all entries within entries array,
+ * we will process it after we finish unwind.
+ */
static int entry(u64 ip, struct unwind_info *ui)

{
- struct unwind_entry e;
+ struct unwind_entry *e = &ui->entries[ui->idx++];
struct addr_location al;

if (__report_module(&al, ip, ui))
return -1;

- e.ip = ip;
- e.map = al.map;
- e.sym = al.sym;
+ e->ip = ip;
+ e->map = al.map;
+ e->sym = al.sym;

pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
al.sym ? al.sym->name : "''",
ip,
al.map ? al.map->map_ip(al.map, ip) : (u64) 0);
-
- return ui->cb(&e, ui->arg);
+ return 0;
}

static pid_t next_thread(Dwfl *dwfl, void *arg, void **thread_argp)
@@ -168,7 +172,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
struct perf_sample *data,
int max_stack)
{
- struct unwind_info ui = {
+ struct unwind_info *ui, ui_buf = {
.sample = data,
.thread = thread,
.machine = thread->mg->machine,
@@ -177,35 +181,54 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
.max_stack = max_stack,
};
Dwarf_Word ip;
- int err = -EINVAL;
+ int err = -EINVAL, i;

if (!data->user_regs.regs)
return -EINVAL;

- ui.dwfl = dwfl_begin(&offline_callbacks);
- if (!ui.dwfl)
+ ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack);
+ if (!ui)
+ return -ENOMEM;
+
+ *ui = ui_buf;
+
+ ui->dwfl = dwfl_begin(&offline_callbacks);
+ if (!ui->dwfl)
goto out;

err = perf_reg_value(&ip, &data->user_regs, PERF_REG_IP);
if (err)
goto out;

- err = report_module(ip, &ui);
+ err = report_module(ip, ui);
if (err)
goto out;

- if (!dwfl_attach_state(ui.dwfl, EM_NONE, thread->tid, &callbacks, &ui))
+ if (!dwfl_attach_state(ui->dwfl, EM_NONE, thread->tid, &callbacks, ui))
goto out;

- err = dwfl_getthread_frames(ui.dwfl, thread->tid, frame_callback, &ui);
+ err = dwfl_getthread_frames(ui->dwfl, thread->tid, frame_callback, ui);

- if (err && !ui.max_stack)
+ if (err && !ui->max_stack)
err = 0;

+ /*
+ * Display what we got based on the order setup.
+ */
+ for (i = 0; i < ui->idx && !err; i++) {
+ int j = i;
+
+ if (callchain_param.order == ORDER_CALLER)
+ j = ui->idx - i - 1;
+
+ err = ui->entries[j].ip ? ui->cb(&ui->entries[j], ui->arg) : 0;
+ }
+
out:
if (err)
pr_debug("unwind: failed with '%s'\n", dwfl_errmsg(-1));

- dwfl_end(ui.dwfl);
+ dwfl_end(ui->dwfl);
+ free(ui);
return 0;
}
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 417a142..5832866 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -16,6 +16,8 @@ struct unwind_info {
unwind_entry_cb_t cb;
void *arg;
int max_stack;
+ int idx;
+ struct unwind_entry entries[];
};

#endif /* __PERF_UNWIND_LIBDW_H */