From: Kan Liang <[email protected]>
perf top reads all threads' /proc/xxx/maps. If there is any thread
which keeps generating a huge, ever-growing /proc/xxx/maps, perf will
loop infinitely in perf_event__synthesize_mmap_events.
This patch fixes the issue by adding a timeout to forcibly stop this
kind of endless mmap processing.
Reported-by: Huang, Ying <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/util/event.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 793b150..0df7f2a9 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -213,6 +213,8 @@ static int perf_event__synthesize_fork(struct perf_tool *tool,
return 0;
}
+#define MMAP_TIMEOUT (50 * 1000000ULL)
+
int perf_event__synthesize_mmap_events(struct perf_tool *tool,
union perf_event *event,
pid_t pid, pid_t tgid,
@@ -222,6 +224,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
{
char filename[PATH_MAX];
FILE *fp;
+ unsigned long long t;
int rc = 0;
if (machine__is_default_guest(machine))
@@ -240,6 +243,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
}
event->header.type = PERF_RECORD_MMAP2;
+ t = rdclock();
while (1) {
char bf[BUFSIZ];
@@ -250,7 +254,8 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
size_t size;
ssize_t n;
- if (fgets(bf, sizeof(bf), fp) == NULL)
+ if ((fgets(bf, sizeof(bf), fp) == NULL) ||
+ ((rdclock() - t) > MMAP_TIMEOUT))
break;
/* ensure null termination since stack will be reused. */
--
1.8.3.1
Em Wed, Jun 10, 2015 at 03:46:04AM -0400, [email protected] escreveu:
> perf top reads all threads' /proc/xxx/maps. If there is any threads
> which generating a keeping growing huge /proc/xxx/maps, perf will do
> infinite loop in perf_event__synthesize_mmap_events.
> This patch fixes this issue by adding a time out to force stop this kind
> of endless mmap processing.
>
> Reported-by: Huang, Ying <[email protected]>
> Signed-off-by: Kan Liang <[email protected]>
So we will silently stop processing those events?
We will make progress, no doubt, but I think the user needs to be warned
about this situation, so that later on when/if samples for those maps
appear and don't get resolved at least we will know that this is the
reason.
- Arnaldo
> ---
> tools/perf/util/event.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
> index 793b150..0df7f2a9 100644
> --- a/tools/perf/util/event.c
> +++ b/tools/perf/util/event.c
> @@ -213,6 +213,8 @@ static int perf_event__synthesize_fork(struct perf_tool *tool,
> return 0;
> }
>
> +#define MMAP_TIMEOUT (50 * 1000000ULL)
> +
> int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> union perf_event *event,
> pid_t pid, pid_t tgid,
> @@ -222,6 +224,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> {
> char filename[PATH_MAX];
> FILE *fp;
> + unsigned long long t;
> int rc = 0;
>
> if (machine__is_default_guest(machine))
> @@ -240,6 +243,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> }
>
> event->header.type = PERF_RECORD_MMAP2;
> + t = rdclock();
>
> while (1) {
> char bf[BUFSIZ];
> @@ -250,7 +254,8 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> size_t size;
> ssize_t n;
>
> - if (fgets(bf, sizeof(bf), fp) == NULL)
> + if ((fgets(bf, sizeof(bf), fp) == NULL) ||
> + ((rdclock() - t) > MMAP_TIMEOUT))
> break;
>
> /* ensure null termination since stack will be reused. */
> --
> 1.8.3.1
>
> Em Wed, Jun 10, 2015 at 03:46:04AM -0400, [email protected] escreveu:
> > perf top reads all threads' /proc/xxx/maps. If there is any threads
> > which generating a keeping growing huge /proc/xxx/maps, perf will do
> > infinite loop in perf_event__synthesize_mmap_events.
> > This patch fixes this issue by adding a time out to force stop this
> > kind of endless mmap processing.
> >
> > Reported-by: Huang, Ying <[email protected]>
> > Signed-off-by: Kan Liang <[email protected]>
>
> So we will silently stop processing those events?
>
> We will make progress, no doubt, but I think the user needs to be warned
> about this situation, so that later on when/if samples for those maps
> appear and don't get resolved at least we will know that this is the reason.
>
I will add a warning message for this kind of timeout.
Thanks,
Kan
> - Arnaldo
>
> > ---
> > tools/perf/util/event.c | 7 ++++++-
> > 1 file changed, 6 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index
> > 793b150..0df7f2a9 100644
> > --- a/tools/perf/util/event.c
> > +++ b/tools/perf/util/event.c
> > @@ -213,6 +213,8 @@ static int perf_event__synthesize_fork(struct
> perf_tool *tool,
> > return 0;
> > }
> >
> > +#define MMAP_TIMEOUT (50 * 1000000ULL)
> > +
> > int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> > union perf_event *event,
> > pid_t pid, pid_t tgid,
> > @@ -222,6 +224,7 @@ int
> perf_event__synthesize_mmap_events(struct
> > perf_tool *tool, {
> > char filename[PATH_MAX];
> > FILE *fp;
> > + unsigned long long t;
> > int rc = 0;
> >
> > if (machine__is_default_guest(machine))
> > @@ -240,6 +243,7 @@ int
> perf_event__synthesize_mmap_events(struct perf_tool *tool,
> > }
> >
> > event->header.type = PERF_RECORD_MMAP2;
> > + t = rdclock();
> >
> > while (1) {
> > char bf[BUFSIZ];
> > @@ -250,7 +254,8 @@ int
> perf_event__synthesize_mmap_events(struct perf_tool *tool,
> > size_t size;
> > ssize_t n;
> >
> > - if (fgets(bf, sizeof(bf), fp) == NULL)
> > + if ((fgets(bf, sizeof(bf), fp) == NULL) ||
> > + ((rdclock() - t) > MMAP_TIMEOUT))
> > break;
> >
> > /* ensure null termination since stack will be reused. */
> > --
> > 1.8.3.1
On 6/11/15 8:06 AM, Arnaldo Carvalho de Melo wrote:
> Em Wed, Jun 10, 2015 at 03:46:04AM -0400, [email protected] escreveu:
>> perf top reads all threads' /proc/xxx/maps. If there is any threads
>> which generating a keeping growing huge /proc/xxx/maps, perf will do
>> infinite loop in perf_event__synthesize_mmap_events.
>> This patch fixes this issue by adding a time out to force stop this kind
>> of endless mmap processing.
>>
>> Reported-by: Huang, Ying <[email protected]>
>> Signed-off-by: Kan Liang <[email protected]>
>
> So we will silently stop processing those events?
>
> We will make progress, no doubt, but I think the user needs to be warned
> about this situation, so that later on when/if samples for those maps
> appear and don't get resolved at least we will know that this is the
> reason.
Can you elaborate on an example? I don't see how this can happen reading
a maps file. And it does not read maps for all threads only thread group
leaders.
Em Thu, Jun 11, 2015 at 02:27:40PM +0000, Liang, Kan escreveu:
> > Em Wed, Jun 10, 2015 at 03:46:04AM -0400, [email protected] escreveu:
> > > perf top reads all threads' /proc/xxx/maps. If there is any threads
> > > which generating a keeping growing huge /proc/xxx/maps, perf will do
> > > infinite loop in perf_event__synthesize_mmap_events.
> > > This patch fixes this issue by adding a time out to force stop this
> > > kind of endless mmap processing.
> > >
> > > Reported-by: Huang, Ying <[email protected]>
> > > Signed-off-by: Kan Liang <[email protected]>
> >
> > So we will silently stop processing those events?
> >
> > We will make progress, no doubt, but I think the user needs to be warned
> > about this situation, so that later on when/if samples for those maps
> > appear and don't get resolved at least we will know that this is the reason.
> I will add warning message for this kind of time out.
Thanks!
- Arnaldo
> Can you elaborate on an example? I don't see how this can happen
> reading a maps file. And it does not read maps for all threads only
> thread group leaders.
This is with a stress test case that generates lots of small mappings
at very high speed and frees them again. So the maps file keeps changing
faster than the proc reader can keep up with it, and it can end up in
a livelock.
-Andi
--
[email protected] -- Speaking for myself only.
On 6/11/15 12:47 PM, Andi Kleen wrote:
>> Can you elaborate on an example? I don't see how this can happen
>> reading a maps file. And it does not read maps for all threads only
>> thread group leaders.
>
> This is with a stress test case that generates lots of small mappings
> at very high speed and frees them again. So the maps file keeps changing
> faster than the proc reader can keep it and it can end up with
> a live lock.
Can you pass it along? I'd like to see how the task_diag proposal
handles it.
https://github.com/dsahern/linux/commits/task_diag-wip
>
> On 6/11/15 12:47 PM, Andi Kleen wrote:
> >> Can you elaborate on an example? I don't see how this can happen
> >> reading a maps file. And it does not read maps for all threads only
> >> thread group leaders.
> >
> > This is with a stress test case that generates lots of small mappings
> > at very high speed and frees them again. So the maps file keeps
> > changing faster than the proc reader can keep it and it can end up
> > with a live lock.
>
> Can you pass it along? I'd like to see how the task_diag proposal handles it.
>
> https://github.com/dsahern/linux/commits/task_diag-wip
Hi David,
I tried the task_diag on my platform, but it shows error message when I
run perf top. " Message handling failed: rc -1, errno 25".
And it looks perf top failed to get maps information.
20.88% [unknown] [.] 0x00000000004992c4
6.77% [unknown] [.] 0x000000000048f9e7
4.92% [unknown] [.] 0x000000000043789c
4.68% [unknown] [.] 0x0000003002f32918
4.31% [unknown] [.] 0x00000000004992d6
1.85% [kernel] [k] _raw_spin_lock
1.84% [unknown] [.] 0x00000000004d8190
1.84% [unknown] [.] 0x00000000004378ad
1.40% [kernel] [k] clear_page_c_e
1.23% [unknown] [.] 0x00000000004992d0
1.23% [unknown] [.] 0x00000000004378cf
I installed the task_diag kernel and libnl3-3.2.22-3.
Thanks,
Kan
On 6/12/15 8:42 AM, Liang, Kan wrote:
>
>>
>> On 6/11/15 12:47 PM, Andi Kleen wrote:
>>>> Can you elaborate on an example? I don't see how this can happen
>>>> reading a maps file. And it does not read maps for all threads only
>>>> thread group leaders.
>>>
>>> This is with a stress test case that generates lots of small mappings
>>> at very high speed and frees them again. So the maps file keeps
>>> changing faster than the proc reader can keep it and it can end up
>>> with a live lock.
>>
>> Can you pass it along? I'd like to see how the task_diag proposal handles it.
>>
>> https://github.com/dsahern/linux/commits/task_diag-wip
>
> Hi David,
>
> I tried the task_diag on my platform, but it shows error message when I
> run perf top. " Message handling failed: rc -1, errno 25".
> And it looks perf top failed to get maps information.
Not surprising; it's only half-baked. Can you try perf-record? So far
that is the only one I have tested.
Also, while running that kernel you can build the test programs under
tools/testing/selftests/task_diag/ and try task_diag_all. I am away from
my dev box at the moment. As I recall you will want to try
'task_diag_all o $pid' or 'task_diag_all a'
I take this to mean you don't want to share the test program? I am
curious as to how other tools handle this use case.
>
> On 6/12/15 8:42 AM, Liang, Kan wrote:
> >
> >>
> >> On 6/11/15 12:47 PM, Andi Kleen wrote:
> >>>> Can you elaborate on an example? I don't see how this can happen
> >>>> reading a maps file. And it does not read maps for all threads only
> >>>> thread group leaders.
> >>>
> >>> This is with a stress test case that generates lots of small
> >>> mappings at very high speed and frees them again. So the maps file
> >>> keeps changing faster than the proc reader can keep it and it can
> >>> end up with a live lock.
> >>
> >> Can you pass it along? I'd like to see how the task_diag proposal handles
> it.
> >>
> >> https://github.com/dsahern/linux/commits/task_diag-wip
> >
> > Hi David,
> >
> > I tried the task_diag on my platform, but it shows error message when
> > I run perf top. " Message handling failed: rc -1, errno 25".
> > And it looks perf top failed to get maps information.
>
> Not surprising; it's only half-baked. Can you try perf-record? So far that is
> the only one I have tested.
>
Perf record cannot reproduce the infinite loop that was found in perf top.
But we can observe that synthesizing threads took a very long time in perf record.
According to the test results below, current perf cost 13s to read the maps,
while task_diag cost 14s to synthesize the threads.
(Note: the time will increase the longer the test runs.)
So it looks like task_diag doesn't help with this issue.
[perf]$ sudo ./perf record -e instructions:pp --pid 14560
Reading /proc/14560/maps cost 13.12690599 s
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.108 MB perf.data (2783 samples) ]
[perf]$ sudo ./perf_task_diag record -e instructions:pp --pid 14560
synthesized threads took 14.435450 sec
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.035 MB perf.data (885 samples) ]
> Also, while running that kernel you can build the test programs under
> tools/testing/selftests/task_diag/ and try task_diag_all. I am away from my
> dev box at the moment. As I recall you will want to try 'task_diag_all o $pid'
> or 'task_diag_all a'
>
Neither options work on my platform.
[task_diag]$ sudo ./task_diag_all a
Unable to receive message: Operation not supported
[task_diag]$ sudo ./task_diag_all o 14751
Unable to receive message: Operation not supported
> I take this to mean you don't want to share the test program? I am curious
> as to how other tools handle this use case.
It's our internal test case. I'm afraid I cannot share it.
But if you want to do more tests, I'd like to do it.
Thanks,
Kan
On 6/12/15 11:05 AM, Liang, Kan wrote:
>>
>> On 6/12/15 8:42 AM, Liang, Kan wrote:
>>>
>>>>
>>>> On 6/11/15 12:47 PM, Andi Kleen wrote:
>>>>>> Can you elaborate on an example? I don't see how this can happen
>>>>>> reading a maps file. And it does not read maps for all threads only
>>>>>> thread group leaders.
>>>>>
>>>>> This is with a stress test case that generates lots of small
>>>>> mappings at very high speed and frees them again. So the maps file
>>>>> keeps changing faster than the proc reader can keep it and it can
>>>>> end up with a live lock.
>>>>
>>>> Can you pass it along? I'd like to see how the task_diag proposal handles
>> it.
>>>>
>>>> https://github.com/dsahern/linux/commits/task_diag-wip
>>>
>>> Hi David,
>>>
>>> I tried the task_diag on my platform, but it shows error message when
>>> I run perf top. " Message handling failed: rc -1, errno 25".
>>> And it looks perf top failed to get maps information.
>>
>> Not surprising; it's only half-baked. Can you try perf-record? So far that is
>> the only one I have tested.
>>
>
> Perf record cannot reproduce the infinite loop which found in perf top.
> But we can observe that synthesized threads took very long time in perf record.
>
> According to test result as below, current perf cost 13s to read the maps,
> while task_diag cost 14s to synthesized thread.
> (Note: The time will increase with the test run.)
>
> So it looks task_diag doesn't help on this issue.
>
> [perf]$ sudo ./perf record -e instructions:pp --pid 14560
> Reading /proc/14560/maps cost 13.12690599 s
> ^C[ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.108 MB perf.data (2783 samples) ]
so perf was able to read the proc file?
>
> [perf]$ sudo ./perf_task_diag record -e instructions:pp --pid 14560
> synthesized threads took 14.435450 sec
> ^C[ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.035 MB perf.data (885 samples) ]
>
>
>> Also, while running that kernel you can build the test programs under
>> tools/testing/selftests/task_diag/ and try task_diag_all. I am away from my
>> dev box at the moment. As I recall you will want to try 'task_diag_all o $pid'
>> or 'task_diag_all a'
>>
> Neither options work on my platform.
>
> [task_diag]$ sudo ./task_diag_all a
> Unable to receive message: Operation not supported
> [task_diag]$ sudo ./task_diag_all o 14751
> Unable to receive message: Operation not supported
Are you sure task_diag is enabled? There is an option under General I
believe:
config TASK_DIAG
bool "Export task/process properties through netlink"
depends on NET && TASKSTATS
default n
help
Export selected properties for tasks/processes through the
generic netlink interface. Unlike the proc file system, task_diag
returns information in a binary format, allows to specify which
information are required.
Say N if unsure.
David
>
> On 6/12/15 11:05 AM, Liang, Kan wrote:
> >>
> >> On 6/12/15 8:42 AM, Liang, Kan wrote:
> >>>
> >>>>
> >>>> On 6/11/15 12:47 PM, Andi Kleen wrote:
> >>>>>> Can you elaborate on an example? I don't see how this can
> happen
> >>>>>> reading a maps file. And it does not read maps for all threads
> >>>>>> only thread group leaders.
> >>>>>
> >>>>> This is with a stress test case that generates lots of small
> >>>>> mappings at very high speed and frees them again. So the maps file
> >>>>> keeps changing faster than the proc reader can keep it and it can
> >>>>> end up with a live lock.
> >>>>
> >>>> Can you pass it along? I'd like to see how the task_diag proposal
> >>>> handles
> >> it.
> >>>>
> >>>> https://github.com/dsahern/linux/commits/task_diag-wip
> >>>
> >>> Hi David,
> >>>
> >>> I tried the task_diag on my platform, but it shows error message
> >>> when I run perf top. " Message handling failed: rc -1, errno 25".
> >>> And it looks perf top failed to get maps information.
> >>
> >> Not surprising; it's only half-baked. Can you try perf-record? So far
> >> that is the only one I have tested.
> >>
> >
> > Perf record cannot reproduce the infinite loop which found in perf top.
> > But we can observe that synthesized threads took very long time in perf
> record.
> >
> > According to test result as below, current perf cost 13s to read the
> > maps, while task_diag cost 14s to synthesized thread.
> > (Note: The time will increase with the test run.)
> >
> > So it looks task_diag doesn't help on this issue.
> >
> > [perf]$ sudo ./perf record -e instructions:pp --pid 14560 Reading
> > /proc/14560/maps cost 13.12690599 s ^C[ perf record: Woken up 1 times
> > to write data ] [ perf record: Captured and wrote 0.108 MB perf.data
> > (2783 samples) ]
>
> so perf was able to read the proc file?
Yes, perf can always read the proc file. The problem is that the proc
file is huge and keeps growing faster than the proc reader can read it.
So perf top loops in perf_event__synthesize_mmap_events until the
test case exits.
>
> >
> > [perf]$ sudo ./perf_task_diag record -e instructions:pp --pid 14560
> > synthesized threads took 14.435450 sec ^C[ perf record: Woken up 1
> > times to write data ] [ perf record: Captured and wrote 0.035 MB
> > perf.data (885 samples) ]
> >
> >
> >> Also, while running that kernel you can build the test programs under
> >> tools/testing/selftests/task_diag/ and try task_diag_all. I am away
> >> from my dev box at the moment. As I recall you will want to try
> 'task_diag_all o $pid'
> >> or 'task_diag_all a'
> >>
> > Neither options work on my platform.
> >
> > [task_diag]$ sudo ./task_diag_all a
> > Unable to receive message: Operation not supported [task_diag]$ sudo
> > ./task_diag_all o 14751 Unable to receive message: Operation not
> > supported
>
> Are you sure task_diag is enabled? There is an option under General I
> believe:
> config TASK_DIAG
> bool "Export task/process properties through netlink"
> depends on NET && TASKSTATS
> default n
> help
> Export selected properties for tasks/processes through the
> generic netlink interface. Unlike the proc file system, task_diag
> returns information in a binary format, allows to specify which
> information are required.
>
> Say N if unsure.
>
It works now.
Thanks,
Kan
> David
On 6/12/15 12:19 PM, Liang, Kan wrote:
>>> [perf]$ sudo ./perf record -e instructions:pp --pid 14560 Reading
>>> /proc/14560/maps cost 13.12690599 s ^C[ perf record: Woken up 1 times
>>> to write data ] [ perf record: Captured and wrote 0.108 MB perf.data
>>> (2783 samples) ]
>>
>> so perf was able to read the proc file?
>
> Yes, perf always can read proc file. The problem is that the proc file is
> huge and keep growing faster than proc reader.
> So perf top do loop in perf_event__synthesize_mmap_events until the
> test case exit.
I'm confused. How are you getting the above time to read /proc maps if
it never finishes?
>> Are you sure task_diag is enabled? There is an option under General I
>> believe:
>> config TASK_DIAG
>> bool "Export task/process properties through netlink"
>> depends on NET && TASKSTATS
>> default n
>> help
>> Export selected properties for tasks/processes through the
>> generic netlink interface. Unlike the proc file system, task_diag
>> returns information in a binary format, allows to specify which
>> information are required.
>>
>> Say N if unsure.
>>
> It works now.
for this test case how does perf-record compare between proc and
task_diag? You can use my command for both. It defaults to using
task_diag and then you can add --no-task_diag to have it read /proc. And
as mentioned before it is only setup for 'perf record -a' case. So
launch your test program
perf record -a -- usleep 1
perf record -a --no-task_diag -- usleep 1
> I'm confused. How are you getting the above time to read /proc maps
> if it never finishes?
Obviously he times it outside perf.
-Andi
>
> On 6/12/15 12:19 PM, Liang, Kan wrote:
> >>> [perf]$ sudo ./perf record -e instructions:pp --pid 14560 Reading
> >>> /proc/14560/maps cost 13.12690599 s ^C[ perf record: Woken up 1
> >>> times to write data ] [ perf record: Captured and wrote 0.108 MB
> >>> perf.data
> >>> (2783 samples) ]
> >>
> >> so perf was able to read the proc file?
> >
> > Yes, perf always can read proc file. The problem is that the proc file
> > is huge and keep growing faster than proc reader.
> > So perf top do loop in perf_event__synthesize_mmap_events until the
> > test case exit.
>
> I'm confused. How are you getting the above time to read /proc maps if it
> never finishes?
I just tried to simplify the issue for perf record. So you may have noticed
that I only read one thread. There are several threads in the system.
Also, I ran the perf record test right when starting the test case,
so the proc file was not that big yet.
For perf top, it will monitor the whole system. So it never finishes.
>
> for this test case how does perf-record compare between proc and
> task_diag? You can use my command for both. It defaults to using
> task_diag and then you can add --no-task_diag to have it read /proc. And
> as mentioned before it is only setup for 'perf record -a' case. So
>
> launch your test program
> perf record -a -- usleep 1
> perf record -a --no-task_diag -- usleep 1
>
Here are the test results.
Please note that I get "synthesized threads took..." only after the test case exits.
It means both ways have the same issue.
[perf]$ sudo ./perf record -a -- usleep 1
synthesized threads took 278.780762 sec
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.617 MB perf.data (7974 samples) ]
[perf]$ sudo ./perf record -a --no-task_diag -- usleep 1
synthesized threads took 315.612403 sec
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.443 MB perf.data (2754 samples) ]
Thanks,
Kan
On 6/12/15 2:39 PM, Liang, Kan wrote:
> Here are the test results.
> Please note that I get "synthesized threads took..." after the test case exit.
> It means both way have the same issue.
Got it. So what you really mean is launching perf on an already running
process perf never finishes initializing. There are several types of
problems like this. For example on a sparc system with a 1024 cpus if I
launch perf (top or record) after starting a kernel build with make -j
1024 the build finishes before perf starts collecting samples. ie., it
never finishes walking /proc until the build is complete. task_diag does
not solve that problem either and in general the procps tools can't
handle it either (ps or top for example).
For your test case what happens if you run:
perf record -- test-app
Is perf overloaded with mmap samples? does it keep up or do you have to
jack the mmap size (-m arg)?
>
> [perf]$ sudo ./perf record -a -- usleep 1
> synthesized threads took 278.780762 sec
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.617 MB perf.data (7974 samples) ]
>
> [perf]$ sudo ./perf record -a --no-task_diag -- usleep 1
> synthesized threads took 315.612403 sec
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.443 MB perf.data (2754 samples) ]
>
ok.
Thanks,
David
>
> On 6/12/15 2:39 PM, Liang, Kan wrote:
> > Here are the test results.
> > Please note that I get "synthesized threads took..." after the test case
> exit.
> > It means both way have the same issue.
>
> Got it. So what you really mean is launching perf on an already running
> process perf never finishes initializing. There are several types of problems
> like this. For example on a sparc system with a 1024 cpus if I launch perf
> (top or record) after starting a kernel build with make -j
> 1024 the build finishes before perf starts collecting samples. ie., it never
> finishes walking /proc until the build is complete. task_diag does not solve
> that problem either and in general the procps tools can't handle it either
> (ps or top for example).
>
We should not stop using system-wide perf top/record just because there
are some threads which have huge/growing maps.
The maps information is not critical for sampling.
If task_diag does not solve this problem, I think we still need a timeout
to forcibly stop endless mmap processing. It's the simplest working
solution so far.
> For your test case what happens if you run:
> perf record -- test-app
>
> Is perf overloaded with mmap samples? does it keep up or do you have to
> jack the mmap size (-m arg)?
No.
synthesize_threads will not be called unless you specify the process/threads/cpu.
(Please refer to __machine__synthesize_threads.)
So it works well.
perf record -- ./run case-small-allocs
[ perf record: Woken up 3844 times to write data ]
[ perf record: Captured and wrote 1001.897 MB perf.data (26259688 samples) ]
Thanks,
Kan
On 6/12/15 4:41 PM, Liang, Kan wrote:
>>
>> On 6/12/15 2:39 PM, Liang, Kan wrote:
>>> Here are the test results.
>>> Please note that I get "synthesized threads took..." after the test case
>> exit.
>>> It means both way have the same issue.
>>
>> Got it. So what you really mean is launching perf on an already running
>> process perf never finishes initializing. There are several types of problems
>> like this. For example on a sparc system with a 1024 cpus if I launch perf
>> (top or record) after starting a kernel build with make -j
>> 1024 the build finishes before perf starts collecting samples. ie., it never
>> finishes walking /proc until the build is complete. task_diag does not solve
>> that problem either and in general the procps tools can't handle it either
>> (ps or top for example).
>>
>
> We should not stop using system wide perf top/record just because there
> are some threads which have huge/growing maps.
I have not said anything to that effect. I am trying to understand the
fundamental points here for a test app you can't / won't distribute.
And, I am also pointing out similar problems that perf and other tools
can't handle.
> The maps information is not critical for sampling.
But is for correlating the addresses in those samples.
>
> If task_diag does not solve this problem, I think we still need a time out
> to force stop endless mmap processing. It's the simplest working
> solution so far.
I disagree with the timeout. For example an overloaded system where perf
is not getting scheduled could trigger the same.
Also, in the spirit of perf if you are going to drop information you
need to generate an event that says information was lost and have the
analysis tools show a message that information was lost. You can't
simply bail out and have "[unknown]" shown for symbols / dsos. I get
tons of user comments about perf showing callchains properly; the
proposed patch just adds to that confusion.
David
coming back to this ...
On 6/12/15 2:39 PM, Liang, Kan wrote:
>>> Yes, perf always can read proc file. The problem is that the proc file
>>> is huge and keep growing faster than proc reader.
>>> So perf top do loop in perf_event__synthesize_mmap_events until the
>>> test case exit.
>>
>> I'm confused. How are you getting the above time to read /proc maps if it
>> never finishes?
>
> I just tried to simplify the issue for perf record. So you may noticed that
> I only read one thread. There are several threads in the system.
> Also, I do the perf record test when starting the test case.
> The proc file is not that big.
> For perf top, it will monitor whole system. So it never finishes.
If the proc file is not that big for perf-record why is it a problem for
perf-top? Both should only be reading the maps file for the thread group
leader once and after it is processed getting MMAP events for changes.
Why do you say perf-top can't handle it but perf-record can?
David
>
> > The maps information is not critical for sampling.
>
> But is for correlating the addresses in those samples.
We still have the maps information for the other threads in the system.
Only part of the information for some threads is lost.
>
> >
> > If task_diag does not solve this problem, I think we still need a time
> > out to force stop endless mmap processing. It's the simplest working
> > solution so far.
>
> I disagree with the timeout. For example an overloaded system where
> perf is not getting scheduled could trigger the same.
It's not a perfect solution for all tools, but it is a working solution for perf.
Without a timeout, we get nothing from perf top/record.
With a timeout, we can do sampling and correlate the addresses in
most samples. It's better than nothing.
>
> Also, in the spirit of perf if you are going to drop information you need to
> generate an event that says information was lost and have the analysis
> tools show a message that information was lost. You can't simply bail out
> and have "[unknown]" shown for symbols / dsos. I get tons of user
> comments about perf showing callchains properly; the proposed patch just
> adds to that confusion.
Currently, it will print a warning in perf top/record. I think I can do more and
print warning in perf_session__warn_about_errors as Arnaldo suggested.
Thanks,
Kan
>
> David
>
> coming back to this ...
>
> On 6/12/15 2:39 PM, Liang, Kan wrote:
> >>> Yes, perf always can read proc file. The problem is that the proc
> >>> file is huge and keep growing faster than proc reader.
> >>> So perf top do loop in perf_event__synthesize_mmap_events until
> the
> >>> test case exit.
> >>
> >> I'm confused. How are you getting the above time to read /proc maps
> >> if it never finishes?
> >
> > I just tried to simplify the issue for perf record. So you may noticed
> > that I only read one thread. There are several threads in the system.
> > Also, I do the perf record test when starting the test case.
> > The proc file is not that big.
> > For perf top, it will monitor whole system. So it never finishes.
>
> If the proc file is not that big for perf-record why is it a problem for perf-
> top? Both should only be reading the maps file for the thread group leader
> once and after it is processed getting MMAP events for changes.
> Why do you say perf-top can't handle it but perf-record can?
I once wanted to simplify the case, so I limited perf record to one thread
and sampled the test at the very beginning, so we could see the processing time.
But that made things more complicated and confusing. :(
Anyway, both system-wide monitorings have this issue. The system-wide
monitorings include perf top and perf record -a.
Thanks,
Kan
>
> David
Em Fri, Jun 12, 2015 at 10:24:36PM -0600, David Ahern escreveu:
> coming back to this ...
> On 6/12/15 2:39 PM, Liang, Kan wrote:
> >>>Yes, perf always can read proc file. The problem is that the proc file
> >>>is huge and keep growing faster than proc reader.
> >>>So perf top do loop in perf_event__synthesize_mmap_events until the
> >>>test case exit.
> >>I'm confused. How are you getting the above time to read /proc maps if it
> >>never finishes?
> >I just tried to simplify the issue for perf record. So you may noticed that
> >I only read one thread. There are several threads in the system.
> >Also, I do the perf record test when starting the test case.
> >The proc file is not that big.
> >For perf top, it will monitor whole system. So it never finishes.
>
> If the proc file is not that big for perf-record why is it a problem for
> perf-top? Both should only be reading the maps file for the thread group
> leader once and after it is processed getting MMAP events for changes. Why
> do you say perf-top can't handle it but perf-record can?
'perf top' does more than 'perf record', so it is conceivable that in
some circumstances 'perf record' can go thru, while top struggles.
That being said this happens when synthesizing PERF_RECORD_ events for
existing threads, i.e. at tool start time, for both top and record, so,
for this specific case, there should be no difference, if the workloads
running in both cases are the same at tool start up phase.
Then, that being said, having a sane upper limit on the time for
processing those events makes the tool more robust and allows it to do
most of its work, just samples for the maps not synthesized will fail to
get resolved to symbols/DSOs.
For those cases we should, during synthesizing, do both what Kan did in
his patch, i.e. emit a log warning with the COMM/PID that we are
truncating /proc/PID/maps parsing, and increment a counter that, just
after we finish synthesizing we should report, in a similar way as we
do in perf_session__warn_about_errors() after processing events,
something like:
+--------------------------------------------------------+
| %d map information files for pre-existing threads were |
| not processed, if there are samples for addresses they |
| will not be resolved, you may find out which are these |
| threads by running with -v and redirecting the output |
| to a file. |
+--------------------------------------------------------+
Ideally, as an extra step, we could flip a flag on the 'struct thread'
where these maps got truncated and add some visual cue to the
hist_entry instances (lines in the top UI).
Perhaps we should add a per-thread-proc-map-processing timeout parameter
to the synthesizing routines instead of having that hardcoded, i.e.
allow the tool to specify what is reasonable for it, but that wouldn't
be strictly required for a first patch, emitting the dialog box above
after synthesizing, if truncation happened, is.
Agreed?
- Arnaldo
> Perhaps we should add a per-thread-proc-map-processing timeout parameter
> to the synthesizing routines instead of having that hardcoded, i.e.
> allow the tool to specify what is reasonable for it, but that wouldn't
> be strictly required for a first patch, emitting the dialog box above
> after synthesizing, if truncation happened, is.
>
> Agreed?
If you make top emit a dialog box, please make it appear only once and give it a
timeout so that it doesn't block forever.
I personally would prefer top just doing best effort and not giving
such warnings, which the user cannot do anything about anyway.
-Andi
On 6/16/15 9:11 AM, Arnaldo Carvalho de Melo wrote:
> Then, that being said, having a sane upper limit on the time for
> processing those events makes the tool more robust and allows it to do
> most of its work, just samples for the maps not synthesized will fail to
> get resolved to symbols/DSOs.
If you are going to use timeouts then you need a sane upper limit on
walking /proc altogether as well. i.e, one time limit for individual
proc files (ie, time limit per task), and one for all of /proc (i.e,
time limit for all of synthesized_threads). What is a reasonable time
limit for each? Will it be configurable or hardcoded?
If perf aborts data collection for either a case the user should get a
warning.
>
> For those cases we should, during synthesizing, do both what Kan did in
> his patch, i.e. emit a log warning with the COMM/PID that we are
> truncating /proc/PID/maps parsing, and increment a counter that, just
> after we finish synthesizing we should report, in a similar way as we
> do in perf_session__warn_about_errors() after processing events,
> something like:
>
> +--------------------------------------------------------+
> | %d map information files for pre-existing threads were |
> | not processed, if there are samples for addresses they |
> | will not be resolved, you may find out which are these |
> | threads by running with -v and redirecting the output |
> | to a file. |
> +--------------------------------------------------------+
>
> Ideally, as an extra step, we could flip a flag on the 'struct thread'
> where these maps got truncated and add some visual cue to the
> hist_entry instances (lines in the top UI).
>
> Perhaps we should add a per-thread-proc-map-processing timeout parameter
> to the synthesizing routines instead of having that hardcoded, i.e.
> allow the tool to specify what is reasonable for it, but that wouldn't
> be strictly required for a first patch, emitting the dialog box above
> after synthesizing, if truncation happened, is.
>
> Agreed?
And then report side there should be a warning as well (record can be
done by one person and analysis by another).
David
> Em Fri, Jun 12, 2015 at 10:24:36PM -0600, David Ahern escreveu:
> > coming back to this ...
>
> > On 6/12/15 2:39 PM, Liang, Kan wrote:
> > >>>Yes, perf always can read proc file. The problem is that the proc
> > >>>file is huge and keep growing faster than proc reader.
> > >>>So perf top do loop in perf_event__synthesize_mmap_events until
> the
> > >>>test case exit.
>
> > >>I'm confused. How are you getting the above time to read /proc maps
> > >>if it never finishes?
>
> > >I just tried to simplify the issue for perf record. So you may
> > >noticed that I only read one thread. There are several threads in the
> system.
> > >Also, I do the perf record test when starting the test case.
> > >The proc file is not that big.
> > >For perf top, it will monitor whole system. So it never finishes.
> >
> > If the proc file is not that big for perf-record why is it a problem
> > for perf-top? Both should only be reading the maps file for the thread
> > group leader once and after it is processed getting MMAP events for
> > changes. Why do you say perf-top can't handle it but perf-record can?
>
> 'perf top' does more than 'perf record', so it is conceivable that in some
> circumstances 'perf record' can go thru, while top struggles.
>
> That being said this happens when synthesizing PERF_RECORD_ events for
> existing threads, i.e. at tool start time, for both top and record, so, for this
> specific case, there should be no difference, if the workloads running in
> both cases are the same at tool start up phase.
>
> Then, that being said, having a sane upper limit on the time for processing
> those events makes the tool more robust and allows it to do most of its
> work, just samples for the maps not synthesized will fail to get resolved to
> symbols/DSOs.
>
> For those cases we should, during synthesizing, do both what Kan did in his
> patch, i.e. emit a log warning with the COMM/PID that we are truncating
> /proc/PID/maps parsing, and increment a counter that, just after we finish
> synthesizing we should report, in a similar way as we do in
> perf_session__warn_about_errors() after processing events, something
> like:
>
> +--------------------------------------------------------+
> | %d map information files for pre-existing threads were |
> | not processed, if there are samples for addresses they |
> | will not be resolved, you may find out which are these |
> | threads by running with -v and redirecting the output |
> | to a file. |
> +--------------------------------------------------------+
>
> Ideally, as an extra step, we could flip a flag on the 'struct thread'
> where these maps got truncated and add some visual cue to the hist_entry
> instances (lines in the top UI).
>
> Perhaps we should add a per-thread-proc-map-processing timeout
> parameter to the synthesizing routines instead of having that hardcoded,
> i.e.
> allow the tool to specify what is reasonable for it, but that wouldn't be
> strictly required for a first patch, emitting the dialog box above after
> synthesizing, if truncation happened, is.
>
> Agreed?
Yes, we can print the warning in perf_session__warn_about_errors,
so the user will get a warning from both perf record and perf report.
perf top will not call perf_session__warn_about_errors, so I think I
will still use pr_warning in the V2 patch to notify the user, because if we
use ui__warning, the user has to press a key to close the dialog box.
In my test, I have 48 threads with huge maps; it feels awful to press
space 48 times to close the warning boxes. Furthermore, as Andi said, the user
cannot do anything about this warning, so pr_warning should be good enough.
Regarding the timeout value, I will add a per-thread-proc-map-processing
timeout parameter in the next version. The default value will be 50ms.
We still need a way to notify perf report that some map is incomplete.
I plan to add a bit, PERF_RECORD_MISC_MMAP_TIME_OUT (1 << 12), for
event->header.misc.
When a timeout is detected, it generates an MMAP2 record as below.
The perf tool will check the bit to know which mmap is incomplete and
update the evlist status for perf_session__warn_about_errors.
@@ -253,6 +258,11 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
if (fgets(bf, sizeof(bf), fp) == NULL)
break;
+ if ((rdclock() - t) > MMAP_TIMEOUT) {
+ timeout = true;
+ goto out;
+ }
+
/* ensure null termination since stack will be reused. */
strcpy(execname, "");
@@ -301,6 +311,10 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
event->header.misc |= PERF_RECORD_MISC_MMAP_DATA;
}
+out:
+ if (timeout)
+ event->header.misc |= PERF_RECORD_MISC_MMAP_TIME_OUT;
+
if (!strcmp(execname, ""))
strcpy(execname, anonstr);
@@ -319,6 +333,9 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
rc = -1;
break;
}
+
+ if (timeout)
+ break;
}
fclose(fp);
Thanks,
Kan
>
> - Arnaldo
Em Tue, Jun 16, 2015 at 04:42:49PM +0000, Liang, Kan escreveu:
> > Em Fri, Jun 12, 2015 at 10:24:36PM -0600, David Ahern escreveu:
> > > coming back to this ...
> >
> > > On 6/12/15 2:39 PM, Liang, Kan wrote:
> > > >>>Yes, perf always can read proc file. The problem is that the proc
> > > >>>file is huge and keep growing faster than proc reader.
> > > >>>So perf top do loop in perf_event__synthesize_mmap_events until
> > the
> > > >>>test case exit.
> >
> > > >>I'm confused. How are you getting the above time to read /proc maps
> > > >>if it never finishes?
> >
> > > >I just tried to simplify the issue for perf record. So you may
> > > >noticed that I only read one thread. There are several threads in the
> > system.
> > > >Also, I do the perf record test when starting the test case.
> > > >The proc file is not that big.
> > > >For perf top, it will monitor whole system. So it never finishes.
> > >
> > > If the proc file is not that big for perf-record why is it a problem
> > > for perf-top? Both should only be reading the maps file for the thread
> > > group leader once and after it is processed getting MMAP events for
> > > changes. Why do you say perf-top can't handle it but perf-record can?
> >
> > 'perf top' does more than 'perf record', so it is conceivable that in some
> > circumstances 'perf record' can go thru, while top struggles.
> >
> > That being said this happens when synthesizing PERF_RECORD_ events for
> > existing threads, i.e. at tool start time, for both top and record, so, for this
> > specific case, there should be no difference, if the workloads running in
> > both cases are the same at tool start up phase.
> >
> > Then, that being said, having a sane upper limit on the time for processing
> > those events makes the tool more robust and allows it to do most of its
> > work, just samples for the maps not synthesized will fail to get resolved to
> > symbols/DSOs.
> >
> > For those cases we should, during synthesizing, do both what Kan did in his
> > patch, i.e. emit a log warning with the COMM/PID that we are truncating
> > /proc/PID/maps parsing, and increment a counter that, just after we finish
> > synthesizing we should report, in a similar way as we do in
> > perf_session__warn_about_errors() after processing events, something
> > like:
> >
> > +--------------------------------------------------------+
> > | %d map information files for pre-existing threads were |
> > | not processed, if there are samples for addresses they |
> > | will not be resolved, you may find out which are these |
> > | threads by running with -v and redirecting the output |
> > | to a file. |
> > +--------------------------------------------------------+
> >
> > Ideally, as an extra step, we could flip a flag on the 'struct thread'
> > where these maps got truncated and add some visual cue to the hist_entry
> > instances (lines in the top UI).
> >
> > Perhaps we should add a per-thread-proc-map-processing timeout
> > parameter to the synthesizing routines instead of having that hardcoded,
> > i.e.
> > allow the tool to specify what is reasonable for it, but that wouldn't be
> > strictly required for a first patch, emitting the dialog box above after
> > synthesizing, if truncation happened, is.
> >
> > Agreed?
>
> Yes, we can print the warning in perf_session__warn_about_errors,
> so the user will get warning from both perf record and perf report.
>
> perf top will not call perf_session__warn_about_errors. So I think I
> will still use pr_warning in V2 patch to notify user. Because if we use
> ui__warning, the user has to press any key to close the dialog box.
> In my test, I have 48 threads with huge maps. It feels awful to press
> 48 space to close warning box. Furthermore, as Andi said the user cannot
Sure, that would be overkill, way annoying and completely unnecessary...
> do anything about this warning. So pr_warning should be good enough.
I think it is the right interface, its only problem when used with the
TUI is that it doesn't go to a file that could be accessible via a
hotkey in a TUI pager, searchable, etc, like less on stdio.
> Regarding to timeout value, I will add a per-thread-proc-map-processing
> timeout parameter in next version. The default value will be 50ms.
Ok.
> We still need a way to notify the perf report that some map is incomplete.
> I plan to add a bit PERF_RECORD_MISC_MMAP_TIME_OUT (1 << 12) for
> event->header.misc.
> When timeout detect, it generates a MMAP2 record as below:
> The perf tool will check the bit to know which mmap is incomplete and
> update evlist-status for perf_session__warn_about_errors
Right, looks ok if we want to do this without passing an extra
"synthesize_stats" parameter that we would look at the end of the
synthesizing on a perf_event__warn_about_synth_errors() that would
receive this synthesize_stats struct.
> @@ -253,6 +258,11 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> if (fgets(bf, sizeof(bf), fp) == NULL)
> break;
>
> + if ((rdclock() - t) > MMAP_TIMEOUT) {
> + timeout = true;
> + goto out;
> + }
> +
> /* ensure null termination since stack will be reused. */
> strcpy(execname, "");
>
> @@ -301,6 +311,10 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> event->header.misc |= PERF_RECORD_MISC_MMAP_DATA;
> }
>
> +out:
> + if (timeout)
> + event->header.misc |= PERF_RECORD_MISC_MMAP_TIME_OUT;
> +
> if (!strcmp(execname, ""))
> strcpy(execname, anonstr);
>
> @@ -319,6 +333,9 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
> rc = -1;
> break;
> }
> +
> + if (timeout)
> + break;
> }
>
> fclose(fp);
>
>
> Thanks,
> Kan
>
> >
> > - Arnaldo