Hello,
This is to optimize the event synthesis during perf record.
The first patch is to reduce memory usage when many threads are used.
The second is to avoid unnecessary syscalls for kernel threads. And
the last one is to reduce the number of threads to iterate when new
threads are being created at the same time.
Unfortunately there's no dramatic improvement here but I can see ~5%
gain in the 'perf bench internals synthesize' on a big machine.
(The numbers are not stable though)
Before:
# perf bench internals synthesize --mt -M1 -I 100
# Running 'internals/synthesize' benchmark:
Computing performance of multi threaded perf event synthesis by
synthesizing events on CPU 0:
Number of synthesis threads: 1
Average synthesis took: 68831.480 usec (+- 101.450 usec)
Average num. events: 9982.000 (+- 0.000)
Average time per event 6.896 usec
After:
# perf bench internals synthesize --mt -M1 -I 100
# Running 'internals/synthesize' benchmark:
Computing performance of multi threaded perf event synthesis by
synthesizing events on CPU 0:
Number of synthesis threads: 1
Average synthesis took: 65036.370 usec (+- 158.121 usec)
Average num. events: 9982.000 (+- 0.000)
Average time per event 6.515 usec
Thanks,
Namhyung
Namhyung Kim (3):
perf tools: Use /proc/<PID>/task/<TID>/status for synthesis
perf tools: Skip MMAP record synthesis for kernel threads
perf tools: Use scandir() to iterate threads
tools/perf/util/synthetic-events.c | 88 ++++++++++++++++++++----------
1 file changed, 58 insertions(+), 30 deletions(-)
--
2.29.2.684.gfbc64c5ab5-goog
To synthesize information to resolve sample IPs, it needs to scan task
and mmap info from the /proc filesystem. For each process, it
opens (and reads) the status and maps files respectively. But since kernel
threads don't have memory maps, we can skip the maps file.
To find kernel threads, check "VmPeak:" line in /proc/<PID>/status
file. It's about the peak virtual memory usage so only user-level
tasks have that. Also check the "Threads:" line (which comes after the
VmPeak line, whether or not that line exists) to be sure it has read
enough data - just in case deeply nested pid namespaces or a large
number of supplementary groups are involved.
This is for user process:
$ head -40 /proc/1/status
Name: systemd
Umask: 0000
State: S (sleeping)
Tgid: 1
Ngid: 0
Pid: 1
PPid: 0
TracerPid: 0
Uid: 0 0 0 0
Gid: 0 0 0 0
FDSize: 256
Groups:
NStgid: 1
NSpid: 1
NSpgid: 1
NSsid: 1
VmPeak: 234192 kB <-- here
VmSize: 169964 kB
VmLck: 0 kB
VmPin: 0 kB
VmHWM: 29528 kB
VmRSS: 6104 kB
RssAnon: 2756 kB
RssFile: 3348 kB
RssShmem: 0 kB
VmData: 19776 kB
VmStk: 1036 kB
VmExe: 784 kB
VmLib: 9532 kB
VmPTE: 116 kB
VmSwap: 2400 kB
HugetlbPages: 0 kB
CoreDumping: 0
THP_enabled: 1
Threads: 1 <-- and here
SigQ: 1/62808
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 7be3c0fe28014a03
SigIgn: 0000000000001000
And this is for kernel thread:
$ head -20 /proc/2/status
Name: kthreadd
Umask: 0000
State: S (sleeping)
Tgid: 2
Ngid: 0
Pid: 2
PPid: 0
TracerPid: 0
Uid: 0 0 0 0
Gid: 0 0 0 0
FDSize: 64
Groups:
NStgid: 2
NSpid: 2
NSpgid: 0
NSsid: 0
Threads: 1 <-- here
SigQ: 1/62808
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/synthetic-events.c | 32 +++++++++++++++++++++---------
1 file changed, 23 insertions(+), 9 deletions(-)
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 515d145a4303..153a822f411a 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -70,13 +70,13 @@ int perf_tool__process_synth_event(struct perf_tool *tool,
* the comm, tgid and ppid.
*/
static int perf_event__get_comm_ids(pid_t pid, pid_t tid, char *comm, size_t len,
- pid_t *tgid, pid_t *ppid)
+ pid_t *tgid, pid_t *ppid, bool *kernel)
{
char bf[4096];
int fd;
size_t size = 0;
ssize_t n;
- char *name, *tgids, *ppids;
+ char *name, *tgids, *ppids, *vmpeak, *threads;
*tgid = -1;
*ppid = -1;
@@ -102,8 +102,14 @@ static int perf_event__get_comm_ids(pid_t pid, pid_t tid, char *comm, size_t len
bf[n] = '\0';
name = strstr(bf, "Name:");
- tgids = strstr(bf, "Tgid:");
- ppids = strstr(bf, "PPid:");
+ tgids = strstr(name ?: bf, "Tgid:");
+ ppids = strstr(tgids ?: bf, "PPid:");
+ vmpeak = strstr(ppids ?: bf, "VmPeak:");
+
+ if (vmpeak)
+ threads = NULL;
+ else
+ threads = strstr(ppids ?: bf, "Threads:");
if (name) {
char *nl;
@@ -141,12 +147,17 @@ static int perf_event__get_comm_ids(pid_t pid, pid_t tid, char *comm, size_t len
pr_debug("PPid: string not found for pid %d\n", tid);
}
+ if (!vmpeak && threads)
+ *kernel = true;
+ else
+ *kernel = false;
+
return 0;
}
static int perf_event__prepare_comm(union perf_event *event, pid_t pid, pid_t tid,
struct machine *machine,
- pid_t *tgid, pid_t *ppid)
+ pid_t *tgid, pid_t *ppid, bool *kernel)
{
size_t size;
@@ -157,7 +168,7 @@ static int perf_event__prepare_comm(union perf_event *event, pid_t pid, pid_t ti
if (machine__is_host(machine)) {
if (perf_event__get_comm_ids(pid, tid, event->comm.comm,
sizeof(event->comm.comm),
- tgid, ppid) != 0) {
+ tgid, ppid, kernel) != 0) {
return -1;
}
} else {
@@ -187,8 +198,10 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
struct machine *machine)
{
pid_t tgid, ppid;
+ bool kernel_thread;
- if (perf_event__prepare_comm(event, 0, pid, machine, &tgid, &ppid) != 0)
+ if (perf_event__prepare_comm(event, 0, pid, machine, &tgid, &ppid,
+ &kernel_thread) != 0)
return -1;
if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
@@ -703,6 +716,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
while ((dirent = readdir(tasks)) != NULL) {
char *end;
pid_t _pid;
+ bool kernel_thread;
_pid = strtol(dirent->d_name, &end, 10);
if (*end)
@@ -710,7 +724,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
rc = -1;
if (perf_event__prepare_comm(comm_event, pid, _pid, machine,
- &tgid, &ppid) != 0)
+ &tgid, &ppid, &kernel_thread) != 0)
break;
if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
@@ -728,7 +742,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
break;
rc = 0;
- if (_pid == pid) {
+ if (_pid == pid && !kernel_thread) {
/* process the parent's maps too */
rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
process, machine, mmap_data);
--
2.29.2.684.gfbc64c5ab5-goog
Like in __event__synthesize_thread(), I think it's better to use
scandir() instead of the readdir() loop. In case some malicious task
continues to create new threads, the readdir() loop will run over and
over to collect tids. The scandir() also has the problem but the
window is much smaller since it doesn't do much work during the
iteration.
Also add a filter_task() function as we only care about the tasks.
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/synthetic-events.c | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 153a822f411a..36cda93318a4 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -664,6 +664,11 @@ int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t
return rc;
}
+static int filter_task(const struct dirent *dirent)
+{
+ return isdigit(dirent->d_name[0]);
+}
+
static int __event__synthesize_thread(union perf_event *comm_event,
union perf_event *mmap_event,
union perf_event *fork_event,
@@ -672,10 +677,10 @@ static int __event__synthesize_thread(union perf_event *comm_event,
struct perf_tool *tool, struct machine *machine, bool mmap_data)
{
char filename[PATH_MAX];
- DIR *tasks;
- struct dirent *dirent;
+ struct dirent **dirent;
pid_t tgid, ppid;
int rc = 0;
+ int i, n;
/* special case: only send one comm event using passed in pid */
if (!full) {
@@ -707,18 +712,16 @@ static int __event__synthesize_thread(union perf_event *comm_event,
snprintf(filename, sizeof(filename), "%s/proc/%d/task",
machine->root_dir, pid);
- tasks = opendir(filename);
- if (tasks == NULL) {
- pr_debug("couldn't open %s\n", filename);
- return 0;
- }
+ n = scandir(filename, &dirent, filter_task, alphasort);
+ if (n < 0)
+ return n;
- while ((dirent = readdir(tasks)) != NULL) {
+ for (i = 0; i < n; i++) {
char *end;
pid_t _pid;
bool kernel_thread;
- _pid = strtol(dirent->d_name, &end, 10);
+ _pid = strtol(dirent[i]->d_name, &end, 10);
if (*end)
continue;
@@ -751,7 +754,10 @@ static int __event__synthesize_thread(union perf_event *comm_event,
}
}
- closedir(tasks);
+ for (i = 0; i < n; i++)
+ zfree(&dirent[i]);
+ free(dirent);
+
return rc;
}
@@ -936,7 +942,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
return 0;
snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
- n = scandir(proc_path, &dirent, 0, alphasort);
+ n = scandir(proc_path, &dirent, filter_task, alphasort);
if (n < 0)
return err;
--
2.29.2.684.gfbc64c5ab5-goog
On Mon, Dec 21, 2020 at 04:00:28PM +0900, Namhyung Kim wrote:
> To synthesize information to resolve sample IPs, it needs to scan task
> and mmap info from the /proc filesystem. For each process, it
> opens (and reads) status and maps file respectively. But as kernel
> threads don't have memory maps so we can skip the maps file.
>
> To find kernel threads, check "VmPeak:" line in /proc/<PID>/status
> file. It's about the peak virtual memory usage so only user-level
> tasks have that. Also check "Threads:" line (which follows the VmPeak
> line whether or not it exists) to be sure it's read enough data - just
> in case of deeply nested pid namespaces or large number of
> supplementary groups are involved.
how much does this save? for kernel threads the maps file is empty
and we just open the file and close it
also perhaps just stat(".....maps") and check the size would be easier?
jirka
>
> This is for user process:
>
> $ head -40 /proc/1/status
> Name: systemd
> Umask: 0000
> State: S (sleeping)
> Tgid: 1
> Ngid: 0
> Pid: 1
> PPid: 0
> TracerPid: 0
> Uid: 0 0 0 0
> Gid: 0 0 0 0
> FDSize: 256
> Groups:
> NStgid: 1
> NSpid: 1
> NSpgid: 1
> NSsid: 1
> VmPeak: 234192 kB <-- here
> VmSize: 169964 kB
> VmLck: 0 kB
> VmPin: 0 kB
> VmHWM: 29528 kB
> VmRSS: 6104 kB
> RssAnon: 2756 kB
> RssFile: 3348 kB
> RssShmem: 0 kB
> VmData: 19776 kB
> VmStk: 1036 kB
> VmExe: 784 kB
> VmLib: 9532 kB
> VmPTE: 116 kB
> VmSwap: 2400 kB
> HugetlbPages: 0 kB
> CoreDumping: 0
> THP_enabled: 1
> Threads: 1 <-- and here
> SigQ: 1/62808
> SigPnd: 0000000000000000
> ShdPnd: 0000000000000000
> SigBlk: 7be3c0fe28014a03
> SigIgn: 0000000000001000
>
> And this is for kernel thread:
>
> $ head -20 /proc/2/status
> Name: kthreadd
> Umask: 0000
> State: S (sleeping)
> Tgid: 2
> Ngid: 0
> Pid: 2
> PPid: 0
> TracerPid: 0
> Uid: 0 0 0 0
> Gid: 0 0 0 0
> FDSize: 64
> Groups:
> NStgid: 2
> NSpid: 2
> NSpgid: 0
> NSsid: 0
> Threads: 1 <-- here
> SigQ: 1/62808
> SigPnd: 0000000000000000
> ShdPnd: 0000000000000000
>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> tools/perf/util/synthetic-events.c | 32 +++++++++++++++++++++---------
> 1 file changed, 23 insertions(+), 9 deletions(-)
>
> diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
> index 515d145a4303..153a822f411a 100644
> --- a/tools/perf/util/synthetic-events.c
> +++ b/tools/perf/util/synthetic-events.c
> @@ -70,13 +70,13 @@ int perf_tool__process_synth_event(struct perf_tool *tool,
> * the comm, tgid and ppid.
> */
> static int perf_event__get_comm_ids(pid_t pid, pid_t tid, char *comm, size_t len,
> - pid_t *tgid, pid_t *ppid)
> + pid_t *tgid, pid_t *ppid, bool *kernel)
> {
> char bf[4096];
> int fd;
> size_t size = 0;
> ssize_t n;
> - char *name, *tgids, *ppids;
> + char *name, *tgids, *ppids, *vmpeak, *threads;
>
> *tgid = -1;
> *ppid = -1;
> @@ -102,8 +102,14 @@ static int perf_event__get_comm_ids(pid_t pid, pid_t tid, char *comm, size_t len
> bf[n] = '\0';
>
> name = strstr(bf, "Name:");
> - tgids = strstr(bf, "Tgid:");
> - ppids = strstr(bf, "PPid:");
> + tgids = strstr(name ?: bf, "Tgid:");
> + ppids = strstr(tgids ?: bf, "PPid:");
> + vmpeak = strstr(ppids ?: bf, "VmPeak:");
> +
> + if (vmpeak)
> + threads = NULL;
> + else
> + threads = strstr(ppids ?: bf, "Threads:");
>
> if (name) {
> char *nl;
> @@ -141,12 +147,17 @@ static int perf_event__get_comm_ids(pid_t pid, pid_t tid, char *comm, size_t len
> pr_debug("PPid: string not found for pid %d\n", tid);
> }
>
> + if (!vmpeak && threads)
> + *kernel = true;
> + else
> + *kernel = false;
> +
> return 0;
> }
>
> static int perf_event__prepare_comm(union perf_event *event, pid_t pid, pid_t tid,
> struct machine *machine,
> - pid_t *tgid, pid_t *ppid)
> + pid_t *tgid, pid_t *ppid, bool *kernel)
> {
> size_t size;
>
> @@ -157,7 +168,7 @@ static int perf_event__prepare_comm(union perf_event *event, pid_t pid, pid_t ti
> if (machine__is_host(machine)) {
> if (perf_event__get_comm_ids(pid, tid, event->comm.comm,
> sizeof(event->comm.comm),
> - tgid, ppid) != 0) {
> + tgid, ppid, kernel) != 0) {
> return -1;
> }
> } else {
> @@ -187,8 +198,10 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
> struct machine *machine)
> {
> pid_t tgid, ppid;
> + bool kernel_thread;
>
> - if (perf_event__prepare_comm(event, 0, pid, machine, &tgid, &ppid) != 0)
> + if (perf_event__prepare_comm(event, 0, pid, machine, &tgid, &ppid,
> + &kernel_thread) != 0)
> return -1;
>
> if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
> @@ -703,6 +716,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
> while ((dirent = readdir(tasks)) != NULL) {
> char *end;
> pid_t _pid;
> + bool kernel_thread;
>
> _pid = strtol(dirent->d_name, &end, 10);
> if (*end)
> @@ -710,7 +724,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
>
> rc = -1;
> if (perf_event__prepare_comm(comm_event, pid, _pid, machine,
> - &tgid, &ppid) != 0)
> + &tgid, &ppid, &kernel_thread) != 0)
> break;
>
> if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
> @@ -728,7 +742,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
> break;
>
> rc = 0;
> - if (_pid == pid) {
> + if (_pid == pid && !kernel_thread) {
> /* process the parent's maps too */
> rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
> process, machine, mmap_data);
> --
> 2.29.2.684.gfbc64c5ab5-goog
>
On Mon, Dec 28, 2020 at 8:50 PM Jiri Olsa <[email protected]> wrote:
>
> On Mon, Dec 21, 2020 at 04:00:28PM +0900, Namhyung Kim wrote:
> > To synthesize information to resolve sample IPs, it needs to scan task
> > and mmap info from the /proc filesystem. For each process, it
> > opens (and reads) status and maps file respectively. But as kernel
> > threads don't have memory maps so we can skip the maps file.
> >
> > To find kernel threads, check "VmPeak:" line in /proc/<PID>/status
> > file. It's about the peak virtual memory usage so only user-level
> > tasks have that. Also check "Threads:" line (which follows the VmPeak
> > line whether or not it exists) to be sure it's read enough data - just
> > in case of deeply nested pid namespaces or large number of
> > supplementary groups are involved.
>
> how much does this save? for kernel threads the maps file is empty
> and we just open the file and close
>
> also perhaps just stat(".....maps") and check the size would be easier?
The numbers are in the cover letter and it's around 5% on an idle
machine which has mostly kernel threads. I think most of the win
came from this change.
It's just to avoid those syscalls, so I wanted to use the available
info in the status file.
Thanks,
Namhyung
On Mon, Dec 21, 2020 at 04:00:26PM +0900, Namhyung Kim wrote:
> Hello,
>
> This is to optimize the event synthesis during perf record.
>
> The first patch is to reduce memory usage when many threads are used.
> The second is to avoid unnecessary syscalls for kernel threads. And
> the last one is to reduce the number of threads to iterate when new
> threads are being created at the same time.
>
> Unfortunately there's no dramatic improvement here but I can see ~5%
> gain in the 'perf bench internals synthesize' on a big machine.
> (The numbers are not stable though)
Looks all good to me. The VmPeak assumption might be slightly
fragile, but I guess there's nothing better currently.
Reviewed-by: Andi Kleen <[email protected]>
-Andi