When executing perf stat with armv8_pmu events and a workload, it
reports a segfault:
(gdb) bt
#0 0x0000000000603fc8 in perf_evsel__close_fd_cpu (evsel=<optimized out>,
cpu=<optimized out>) at evsel.c:122
#1 perf_evsel__close_cpu (evsel=evsel@entry=0x716e950, cpu=7) at evsel.c:156
#2 0x00000000004d4718 in evlist__close (evlist=0x70a7cb0) at util/evlist.c:1242
#3 0x0000000000453404 in __run_perf_stat (argc=3, argc@entry=1, argv=0x30,
argv@entry=0xfffffaea2f90, run_idx=119, run_idx@entry=1701998435)
at builtin-stat.c:929
#4 0x0000000000455058 in run_perf_stat (run_idx=1701998435, argv=0xfffffaea2f90,
argc=1) at builtin-stat.c:947
#5 cmd_stat (argc=1, argv=0xfffffaea2f90) at builtin-stat.c:2357
#6 0x00000000004bb888 in run_builtin (p=p@entry=0x9764b8 <commands+288>,
argc=argc@entry=4, argv=argv@entry=0xfffffaea2f90) at perf.c:312
#7 0x00000000004bbb54 in handle_internal_command (argc=argc@entry=4,
argv=argv@entry=0xfffffaea2f90) at perf.c:364
#8 0x0000000000435378 in run_argv (argcp=<synthetic pointer>,
argv=<synthetic pointer>) at perf.c:408
#9 main (argc=4, argv=0xfffffaea2f90) at perf.c:538
After debugging, I found the root cause: the xyarray fd is created by
evsel__open_per_thread(), which ignores the cpu passed in from
create_perf_stat_counter(), while the evsel's cpumap is assigned the
corresponding PMU's cpumap in __add_event(). Thus, the xyarray fd is
created with the ncpus of the dummy cpumap, and an out-of-bounds 'cpu'
index is later used in perf_evsel__close_fd_cpu().
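Schematically, the mismatch looks like this (a minimal, self-contained
sketch; fd_array/alloc_fds are stand-ins for the real libperf
xyarray/cpumap code, and the 8-cpu PMU cpumap matches the cpu=7 index
seen in the backtrace):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the xyarray that backs evsel->core.fd. */
struct fd_array {
	int max_x;		/* "cpus" dimension */
	int max_y;		/* "threads" dimension */
	int *contents;
};

/* evsel__open_per_thread() sizes the fd array for a dummy cpumap: 1 cpu. */
static struct fd_array *alloc_fds(int ncpus, int nthreads)
{
	struct fd_array *fds = calloc(1, sizeof(*fds));

	fds->max_x = ncpus;
	fds->max_y = nthreads;
	fds->contents = calloc(ncpus * nthreads, sizeof(int));
	return fds;
}

int main(void)
{
	struct fd_array *fds = alloc_fds(1 /* dummy cpumap */, 1);
	int pmu_ncpus = 8;	/* evsel->cpus inherited from the PMU's cpumap */

	/* evlist__close() walks the PMU cpumap and closes per cpu index,
	 * so index 7 falls outside the 1-column fd array. */
	for (int cpu = 0; cpu < pmu_ncpus; cpu++)
		printf("close fd[%d][0]%s\n", cpu,
		       cpu >= fds->max_x ? "  <-- out of bounds" : "");

	free(fds->contents);
	free(fds);
	return 0;
}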
To address this, add a flag to mark this situation and avoid using the
affinity technique when closing/enabling/disabling events.
Fixes: 7736627b865d ("perf stat: Use affinity for closing file descriptors")
Fixes: 704e2f5b700d ("perf stat: Use affinity for enabling/disabling events")
Signed-off-by: Wei Li <[email protected]>
---
tools/lib/perf/include/internal/evlist.h | 1 +
tools/perf/builtin-stat.c | 3 +++
tools/perf/util/evlist.c | 23 ++++++++++++++++++++++-
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/tools/lib/perf/include/internal/evlist.h b/tools/lib/perf/include/internal/evlist.h
index 2d0fa02b036f..c02d7e583846 100644
--- a/tools/lib/perf/include/internal/evlist.h
+++ b/tools/lib/perf/include/internal/evlist.h
@@ -17,6 +17,7 @@ struct perf_evlist {
struct list_head entries;
int nr_entries;
bool has_user_cpus;
+ bool open_per_thread;
struct perf_cpu_map *cpus;
struct perf_cpu_map *all_cpus;
struct perf_thread_map *threads;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index fddc97cac984..6e6ceacce634 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -725,6 +725,9 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
if (group)
perf_evlist__set_leader(evsel_list);
+ if (!(target__has_cpu(&target) && !target__has_per_thread(&target)))
+ evsel_list->core.open_per_thread = true;
+
if (affinity__setup(&affinity) < 0)
return -1;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index e3fa3bf7498a..bf8a3ccc599f 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -383,6 +383,15 @@ void evlist__disable(struct evlist *evlist)
int cpu, i, imm = 0;
bool has_imm = false;
+ if (evlist->core.open_per_thread) {
+ evlist__for_each_entry(evlist, pos) {
+ if (pos->disabled || !evsel__is_group_leader(pos) || !pos->core.fd)
+ continue;
+ evsel__disable(pos);
+ }
+ goto out;
+ }
+
if (affinity__setup(&affinity) < 0)
return;
@@ -414,6 +423,7 @@ void evlist__disable(struct evlist *evlist)
pos->disabled = true;
}
+out:
evlist->enabled = false;
}
@@ -423,6 +433,15 @@ void evlist__enable(struct evlist *evlist)
struct affinity affinity;
int cpu, i;
+ if (evlist->core.open_per_thread) {
+ evlist__for_each_entry(evlist, pos) {
+ if (!evsel__is_group_leader(pos) || !pos->core.fd)
+ continue;
+ evsel__enable(pos);
+ }
+ goto out;
+ }
+
if (affinity__setup(&affinity) < 0)
return;
@@ -444,6 +463,7 @@ void evlist__enable(struct evlist *evlist)
pos->disabled = false;
}
+out:
evlist->enabled = true;
}
@@ -1223,9 +1243,10 @@ void evlist__close(struct evlist *evlist)
/*
* With perf record core.cpus is usually NULL.
+ * Or perf stat may open events per-thread.
* Use the old method to handle this for now.
*/
- if (!evlist->core.cpus) {
+ if (evlist->core.open_per_thread || !evlist->core.cpus) {
evlist__for_each_entry_reverse(evlist, evsel)
evsel__close(evsel);
return;
--
2.17.1
> After debugging, i found the root reason is that the xyarray fd is created
> by evsel__open_per_thread() ignoring the cpu passed in
> create_perf_stat_counter(), while the evsel' cpumap is assigned as the
> corresponding PMU's cpumap in __add_event(). Thus, the xyarray fd is created
> with ncpus of dummy cpumap and an out of bounds 'cpu' index will be used in
> perf_evsel__close_fd_cpu().
>
> To address this, add a flag to mark this situation and avoid using the
> affinity technique when closing/enabling/disabling events.
The flag seems like a hack. How about figuring out the correct number of
CPUs and using that?
-Andi
On Tue, Sep 22, 2020 at 12:23:21PM -0700, Andi Kleen wrote:
> > After debugging, i found the root reason is that the xyarray fd is created
> > by evsel__open_per_thread() ignoring the cpu passed in
> > create_perf_stat_counter(), while the evsel' cpumap is assigned as the
> > corresponding PMU's cpumap in __add_event(). Thus, the xyarray fd is created
> > with ncpus of dummy cpumap and an out of bounds 'cpu' index will be used in
> > perf_evsel__close_fd_cpu().
> >
> > To address this, add a flag to mark this situation and avoid using the
> > affinity technique when closing/enabling/disabling events.
>
> The flag seems like a hack. How about figuring out the correct number of
> CPUs and using that?
Also, I would like to understand what's different on ARM64 compared to other
architectures. Or could this happen on x86 too?
-Andi
On Tue, Sep 22, 2020 at 11:13:45AM +0800, Wei Li wrote:
> When executing perf stat with armv8_pmu events with a workload, it will
> report a segfault as result.
please share the perf stat command line you see that segfault with
thanks,
jirka
On Wed, Sep 23, 2020 at 2:44 PM Jiri Olsa <[email protected]> wrote:
>
> On Tue, Sep 22, 2020 at 11:13:45AM +0800, Wei Li wrote:
> > When executing perf stat with armv8_pmu events with a workload, it will
> > report a segfault as result.
>
> please share the perf stat command line you see that segfault for
It seems the description in patch 0/2 already has it:
[root@localhost hulk]# tools/perf/perf stat -e armv8_pmuv3_0/ll_cache_rd/,armv8_pmuv3_0/ll_cache_miss_rd/ ls > /dev/null
Segmentation fault
Thanks
Namhyung
On Wed, Sep 23, 2020 at 10:49:52PM +0900, Namhyung Kim wrote:
> On Wed, Sep 23, 2020 at 2:44 PM Jiri Olsa <[email protected]> wrote:
> >
> > On Tue, Sep 22, 2020 at 11:13:45AM +0800, Wei Li wrote:
> > > When executing perf stat with armv8_pmu events with a workload, it will
> > > report a segfault as result.
> >
> > please share the perf stat command line you see that segfault for
>
> It seems the description in the patch 0/2 already has it:
>
> [root@localhost hulk]# tools/perf/perf stat -e
> armv8_pmuv3_0/ll_cache_rd/,armv8_pmuv3_0/ll_cache_miss_rd/ ls >
> /dev/null
> Segmentation fault
yea I found it, but can't reproduce it.. I see the issue from
patch 2, but I'm not sure what the problem is so far
jirka
On Wed, Sep 23, 2020 at 11:08 PM Jiri Olsa <[email protected]> wrote:
>
> On Wed, Sep 23, 2020 at 10:49:52PM +0900, Namhyung Kim wrote:
> > On Wed, Sep 23, 2020 at 2:44 PM Jiri Olsa <[email protected]> wrote:
> > >
> > > On Tue, Sep 22, 2020 at 11:13:45AM +0800, Wei Li wrote:
> > > > When executing perf stat with armv8_pmu events with a workload, it will
> > > > report a segfault as result.
> > >
> > > please share the perf stat command line you see that segfault for
> >
> > It seems the description in the patch 0/2 already has it:
> >
> > [root@localhost hulk]# tools/perf/perf stat -e
> > armv8_pmuv3_0/ll_cache_rd/,armv8_pmuv3_0/ll_cache_miss_rd/ ls >
> > /dev/null
> > Segmentation fault
>
> yea I found it, but can't reproduce it.. I see the issue from
> patch 2, but not sure what's the problem so far
I think the problem is that armv8_pmu has a cpumask,
and the user requested per-task events.
The code tried to open the event with a dummy cpu map
since it's not a cpu event, but the pmu has a cpu map and
it's passed to the evsel. So there's confusion somewhere about
whether it should use evsel->cpus or a dummy map.
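Roughly, the propagation rule behind that confusion looks like this
(a simplified sketch; cpu_map, sketch_evsel and propagate_maps are
stand-in names, not the exact __perf_evlist__propagate_maps() code,
and refcounting is omitted):

#include <stdbool.h>

struct cpu_map;	/* stand-in for struct perf_cpu_map */

struct sketch_evsel {
	struct cpu_map *cpus;		/* what open/close/enable will iterate */
	struct cpu_map *own_cpus;	/* the PMU's own cpumask, e.g. 0-3 */
};

static void propagate_maps(struct sketch_evsel *evsel,
			   struct cpu_map *evlist_cpus, bool has_user_cpus)
{
	if (!evsel->own_cpus || has_user_cpus) {
		/* No PMU cpumask, or the user gave an explicit cpu list:
		 * follow the evlist map (the dummy map for a per-task run). */
		evsel->cpus = evlist_cpus;
	} else {
		/* The PMU provided a cpumask, so evsel->cpus ends up being
		 * the PMU map even though perf stat opened the events per
		 * thread - the evsel->cpus vs. dummy map mismatch above. */
		evsel->cpus = evsel->own_cpus;
	}
}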
Thanks
Namhyung
On Wed, Sep 23, 2020 at 11:15:06PM +0900, Namhyung Kim wrote:
> On Wed, Sep 23, 2020 at 11:08 PM Jiri Olsa <[email protected]> wrote:
> >
> > On Wed, Sep 23, 2020 at 10:49:52PM +0900, Namhyung Kim wrote:
> > > On Wed, Sep 23, 2020 at 2:44 PM Jiri Olsa <[email protected]> wrote:
> > > >
> > > > On Tue, Sep 22, 2020 at 11:13:45AM +0800, Wei Li wrote:
> > > > > When executing perf stat with armv8_pmu events with a workload, it will
> > > > > report a segfault as result.
> > > >
> > > > please share the perf stat command line you see that segfault for
> > >
> > > It seems the description in the patch 0/2 already has it:
> > >
> > > [root@localhost hulk]# tools/perf/perf stat -e
> > > armv8_pmuv3_0/ll_cache_rd/,armv8_pmuv3_0/ll_cache_miss_rd/ ls >
> > > /dev/null
> > > Segmentation fault
> >
> > yea I found it, but can't reproduce it.. I see the issue from
> > patch 2, but not sure what's the problem so far
>
> I think the problem is that armv8_pmu has a cpumask,
> and the user requested per-task events.
>
> The code tried to open the event with a dummy cpu map
> since it's not a cpu event, but the pmu has cpu map and
> it's passed to evsel. So there's confusion somewhere
> whether it should use evsel->cpus or a dummy map.
you're right, I have the following cpus file in the pmu:
# cat /sys/devices/armv8_pmuv3_0/cpus
0-3
covering all the cpus.. and once you have a cpumask/cpus file,
you're system wide by default in the current code, but we should
not crash ;-)
I tried to cover this case in the patch below, and I probably broke
some other use cases, but perhaps we could allow opening counters
per cpu for a given workload
I'll try to look at this more tomorrow
jirka
---
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7f8d756d9408..0c7f16a673c2 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -379,12 +379,7 @@ static int read_affinity_counters(struct timespec *rs)
if (affinity__setup(&affinity) < 0)
return -1;
- ncpus = perf_cpu_map__nr(evsel_list->core.all_cpus);
- if (!target__has_cpu(&target) || target__has_per_thread(&target))
- ncpus = 1;
evlist__for_each_cpu(evsel_list, i, cpu) {
- if (i >= ncpus)
- break;
affinity__set(&affinity, cpu);
evlist__for_each_entry(evsel_list, counter) {
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index fd865002cbbd..ef525eb2f619 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1861,6 +1861,16 @@ void evsel__close(struct evsel *evsel)
perf_evsel__free_id(&evsel->core);
}
+int evsel__open_threads_per_cpu(struct evsel *evsel, struct perf_thread_map *threads,
+ struct perf_cpu_map *cpus, int cpu)
+{
+ if (cpu == -1)
+ return evsel__open_cpu(evsel, cpus, threads, 0,
+ cpus ? cpus->nr : 1);
+
+ return evsel__open_cpu(evsel, cpus, threads, cpu, cpu + 1);
+}
+
int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu)
{
if (cpu == -1)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 35e3f6d66085..1d055699bd1f 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -231,6 +231,8 @@ int evsel__enable(struct evsel *evsel);
int evsel__disable(struct evsel *evsel);
int evsel__disable_cpu(struct evsel *evsel, int cpu);
+int evsel__open_threads_per_cpu(struct evsel *evsel, struct perf_thread_map *threads,
+ struct perf_cpu_map *cpus, int cpu);
int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu);
int evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads);
int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index cdb154381a87..2b17f1315cfb 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -560,6 +560,11 @@ int create_perf_stat_counter(struct evsel *evsel,
attr->enable_on_exec = 1;
}
+ if (evsel->core.own_cpus && evsel->core.threads) {
+ return evsel__open_threads_per_cpu(evsel, evsel->core.threads,
+ evsel__cpus(evsel), cpu);
+ }
+
if (target__has_cpu(target) && !target__has_per_thread(target))
return evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu);
Hi Andi,
On 2020/9/23 3:50, Andi Kleen wrote:
> On Tue, Sep 22, 2020 at 12:23:21PM -0700, Andi Kleen wrote:
>>> After debugging, i found the root reason is that the xyarray fd is created
>>> by evsel__open_per_thread() ignoring the cpu passed in
>>> create_perf_stat_counter(), while the evsel' cpumap is assigned as the
>>> corresponding PMU's cpumap in __add_event(). Thus, the xyarray fd is created
>>> with ncpus of dummy cpumap and an out of bounds 'cpu' index will be used in
>>> perf_evsel__close_fd_cpu().
>>>
>>> To address this, add a flag to mark this situation and avoid using the
>>> affinity technique when closing/enabling/disabling events.
>>
>> The flag seems like a hack. How about figuring out the correct number of
>> CPUs and using that?
>
> Also would like to understand what's different on ARM64 than other architectures.
> Or could this happen on x86 too?
>
The problem is that when the user requests per-task events, the cpumask is
expected to be NULL (dummy), while the armv8_pmu does have a cpumask, which
is inherited by the evsel.
The armv8_pmu's cpumask was added for heterogeneous systems, so this issue
cannot happen on x86.
In fact, the cpumask itself is correct, but it shouldn't be used when we are
requesting per-task events. As these events should be installed on all cores,
I doubt how much we can benefit from the affinity technique, so I chose to
add a flag.
I also did a test on a HiSilicon arm64 D06 board with 2 sockets and 128 cores,
running the following command 3 times, with and without the affinity technique:
time tools/perf/perf stat -ddd -C 0-127 --per-core --timeout=5000 2> /dev/null
* (HEAD detached at 7074674e7338) perf cpumap: Maintain cpumaps ordered and without dups
real 0m8.039s
user 0m0.402s
sys 0m2.582s
real 0m7.939s
user 0m0.360s
sys 0m2.560s
real 0m7.997s
user 0m0.358s
sys 0m2.586s
* (HEAD detached at 704e2f5b700d) perf stat: Use affinity for enabling/disabling events
real 0m7.954s
user 0m0.308s
sys 0m2.590s
real 0m12.959s
user 0m0.332s
sys 0m2.582s
real 0m18.009s
user 0m0.346s
sys 0m2.562s
The off-CPU time is much longer when using affinity; I think that's the
migration cost. Could you please share your test case?
Thanks,
Wei
On Wed, Sep 23, 2020 at 10:19:00PM +0200, Jiri Olsa wrote:
> On Wed, Sep 23, 2020 at 11:15:06PM +0900, Namhyung Kim wrote:
> > I think the problem is that armv8_pmu has a cpumask,
> > and the user requested per-task events.
> >
> > The code tried to open the event with a dummy cpu map
> > since it's not a cpu event, but the pmu has cpu map and
> > it's passed to evsel. So there's confusion somewhere
> > whether it should use evsel->cpus or a dummy map.
>
> you're right, I have following cpus file in pmu:
>
> # cat /sys/devices/armv8_pmuv3_0/cpus
> 0-3
>
> covering all the cpus.. and once you have cpumask/cpus file,
> you're system wide by default in current code, but we should
> not crash ;-)
>
> I tried to cover this case in patch below and I probably broke
> some other use cases, but perhaps we could allow to open counters
> per cpus for given workload
>
> I'll try to look at this more tomorrow
I'm thinking about a different approach, we can ignore cpu map
for the ARM cpu PMU and use the dummy, not tested ;-)
Thanks
Namhyung
diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index 2208444ecb44..cfcdbd7be066 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -45,6 +45,9 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
if (!evsel->own_cpus || evlist->has_user_cpus) {
perf_cpu_map__put(evsel->cpus);
evsel->cpus = perf_cpu_map__get(evlist->cpus);
+ } else if (!evsel->system_wide && perf_cpu_map__empty(evlist->cpus)) {
+ perf_cpu_map__put(evsel->cpus);
+ evsel->cpus = perf_cpu_map__get(evlist->cpus);
} else if (evsel->cpus != evsel->own_cpus) {
perf_cpu_map__put(evsel->cpus);
evsel->cpus = perf_cpu_map__get(evsel->own_cpus);
On Thu, Sep 24, 2020 at 11:36:23PM +0900, Namhyung Kim wrote:
>
> I'm thinking about a different approach, we can ignore cpu map
> for the ARM cpu PMU and use the dummy, not tested ;-)
>
> Thanks
> Namhyung
>
>
> diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
> index 2208444ecb44..cfcdbd7be066 100644
> --- a/tools/lib/perf/evlist.c
> +++ b/tools/lib/perf/evlist.c
> @@ -45,6 +45,9 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
> if (!evsel->own_cpus || evlist->has_user_cpus) {
> perf_cpu_map__put(evsel->cpus);
> evsel->cpus = perf_cpu_map__get(evlist->cpus);
> + } else if (!evsel->system_wide && perf_cpu_map__empty(evlist->cpus)) {
> + perf_cpu_map__put(evsel->cpus);
> + evsel->cpus = perf_cpu_map__get(evlist->cpus);
but I like this fix, because mine messes up scaling ;-)
> } else if (evsel->cpus != evsel->own_cpus) {
> perf_cpu_map__put(evsel->cpus);
> evsel->cpus = perf_cpu_map__get(evsel->own_cpus);
it looks like that cpus (/sys/devices/armv8_pmuv3_0/cpus) file
can have some cpus switched off.. from a brief scan of
drivers/perf/arm_pmu_platform.c (search for supported_cpus)
it looks like it's just a check for the interrupt, so counting
might still work..?
thanks,
jirka
On Thu, Sep 24, 2020 at 11:36:23PM +0900, Namhyung Kim wrote:
>
> I'm thinking about a different approach, we can ignore cpu map
> for the ARM cpu PMU and use the dummy, not tested ;-)
>
> Thanks
> Namhyung
>
>
> diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
> index 2208444ecb44..cfcdbd7be066 100644
> --- a/tools/lib/perf/evlist.c
> +++ b/tools/lib/perf/evlist.c
> @@ -45,6 +45,9 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
> if (!evsel->own_cpus || evlist->has_user_cpus) {
> perf_cpu_map__put(evsel->cpus);
> evsel->cpus = perf_cpu_map__get(evlist->cpus);
> + } else if (!evsel->system_wide && perf_cpu_map__empty(evlist->cpus)) {
> + perf_cpu_map__put(evsel->cpus);
> + evsel->cpus = perf_cpu_map__get(evlist->cpus);
> } else if (evsel->cpus != evsel->own_cpus) {
> perf_cpu_map__put(evsel->cpus);
> evsel->cpus = perf_cpu_map__get(evsel->own_cpus);
>
Wei Li,
is this fixing your problem?
thanks,
jirka
> -----Original Message-----
> From: Jiri Olsa [mailto:[email protected]]
> Sent: Friday, October 2, 2020 10:00 PM
> Subject: Re: [PATCH 1/2] perf stat: Fix segfault when counting armv8_pmu
> events
>
> On Thu, Sep 24, 2020 at 11:36:23PM +0900, Namhyung Kim wrote:
> > I'm thinking about a different approach, we can ignore cpu map
> > for the ARM cpu PMU and use the dummy, not tested ;-)
> >
> > Thanks
> > Namhyung
> >
> >
> > diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
> > index 2208444ecb44..cfcdbd7be066 100644
> > --- a/tools/lib/perf/evlist.c
> > +++ b/tools/lib/perf/evlist.c
> > @@ -45,6 +45,9 @@ static void __perf_evlist__propagate_maps(struct
> perf_evlist *evlist,
> > if (!evsel->own_cpus || evlist->has_user_cpus) {
> > perf_cpu_map__put(evsel->cpus);
> > evsel->cpus = perf_cpu_map__get(evlist->cpus);
> > + } else if (!evsel->system_wide &&
> perf_cpu_map__empty(evlist->cpus)) {
> > + perf_cpu_map__put(evsel->cpus);
> > + evsel->cpus = perf_cpu_map__get(evlist->cpus);
> > } else if (evsel->cpus != evsel->own_cpus) {
> > perf_cpu_map__put(evsel->cpus);
> > evsel->cpus = perf_cpu_map__get(evsel->own_cpus);
> >
>
> Wei Li,
> is this fixing your problem?
I have seen the same crash and suggested another patch:
[PATCH] perf evlist: fix memory corruption for Kernel PMU event
https://lore.kernel.org/lkml/[email protected]/
Also, I have tested Namhyung Kim's patch on ARM64. It does fix the crash for me. So:
Tested-by: Barry Song <[email protected]>
Please put the below Fixes tag in the commit log:
Fixes: 7736627b865d ("perf stat: Use affinity for closing file descriptors")
Thanks
Barry