2024-02-21 17:05:28

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: [PATCH 1/1] perf trace: Collect sys_nanosleep first argument

That is a 'struct timespec' passed from userspace to the kernel as we
can see with a system wide syscall tracing:

root@number:~# perf trace -e nanosleep
0.000 (10.102 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
38.924 (10.077 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
100.177 (10.107 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
139.171 (10.063 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
200.603 (10.105 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
239.399 (10.064 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
300.994 (10.096 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
339.584 (10.067 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
401.335 (10.057 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
439.758 (10.166 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
501.814 (10.110 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
539.983 (10.227 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
602.284 (10.199 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
640.208 (10.105 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
702.662 (10.163 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
740.440 (10.107 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
802.993 (10.159 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
^Croot@number:~# strace -p 9150 -e nanosleep

If we then use the ptrace method to look at that podman process:

root@number:~# strace -p 9150 -e nanosleep
strace: Process 9150 attached
nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
^Cstrace: Process 9150 detached
root@number:~#

With some changes we can get something closer to the strace output,
still in system wide mode:

root@number:~# perf config trace.show_arg_names=false
root@number:~# perf config trace.show_duration=false
root@number:~# perf config trace.show_timestamp=false
root@number:~# perf config trace.show_zeros=true
root@number:~# perf config trace.args_alignment=0
root@number:~# perf trace -e nanosleep --max-events=10
podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
root@number:~#
root@number:~# perf config
trace.show_arg_names=false
trace.show_duration=false
trace.show_timestamp=false
trace.show_zeros=true
trace.args_alignment=0
root@number:~# cat ~/.perfconfig
# this file is auto-generated.
[trace]
show_arg_names = false
show_duration = false
show_timestamp = false
show_zeros = true
args_alignment = 0
root@number:~#

This will not get reused by any other syscall as nanosleep is the only
one to have as its first argument a 'struct timespec" pointer argument
passed from userspace to the kernel:

root@number:~# grep timespec /sys/kernel/tracing/events/syscalls/sys_enter_*/format | grep offset:16
/sys/kernel/tracing/events/syscalls/sys_enter_nanosleep/format: field:struct __kernel_timespec * rqtp; offset:16; size:8; signed:0;
root@number:~#

BTF based pretty printing will simplify all this, but then lets just get
the low hanging fruits first.

Cc: Adrian Hunter <[email protected]>
Cc: Ian Rogers <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/builtin-trace.c | 2 ++
.../bpf_skel/augmented_raw_syscalls.bpf.c | 21 +++++++++++++++++++
2 files changed, 23 insertions(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 109b8e64fe69ae32..6abe280dc38f1921 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1048,6 +1048,8 @@ static const struct syscall_fmt syscall_fmts[] = {
.arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
{ .name = "name_to_handle_at",
.arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+ { .name = "nanosleep",
+ .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, },
{ .name = "newfstatat",
.arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
{ .name = "open",
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 2872f9bc07850bd7..0acbd74e8c760956 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -341,6 +341,27 @@ int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
return 1; /* Failure: don't filter */
}

+SEC("tp/syscalls/sys_enter_nanosleep")
+int sys_enter_nanosleep(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *req_arg = (const void *)args->args[0];
+ unsigned int len = sizeof(augmented_args->args);
+ __u32 size = sizeof(struct timespec64);
+
+ if (augmented_args == NULL)
+ goto failure;
+
+ if (size > sizeof(augmented_args->__data))
+ goto failure;
+
+ bpf_probe_read_user(&augmented_args->__data, size, req_arg);
+
+ return augmented__output(args, augmented_args, len + size);
+failure:
+ return 1; /* Failure: don't filter */
+}
+
static pid_t getpid(void)
{
return bpf_get_current_pid_tgid();
--
2.43.0



2024-02-21 17:15:55

by Ian Rogers

[permalink] [raw]
Subject: Re: [PATCH 1/1] perf trace: Collect sys_nanosleep first argument

On Wed, Feb 21, 2024 at 9:04 AM Arnaldo Carvalho de Melo
<[email protected]> wrote:
>
> That is a 'struct timespec' passed from userspace to the kernel as we
> can see with a system wide syscall tracing:
>
> root@number:~# perf trace -e nanosleep
> 0.000 (10.102 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 38.924 (10.077 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 100.177 (10.107 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 139.171 (10.063 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 200.603 (10.105 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 239.399 (10.064 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 300.994 (10.096 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 339.584 (10.067 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 401.335 (10.057 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 439.758 (10.166 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 501.814 (10.110 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 539.983 (10.227 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 602.284 (10.199 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 640.208 (10.105 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 702.662 (10.163 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 740.440 (10.107 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> 802.993 (10.159 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> ^Croot@number:~# strace -p 9150 -e nanosleep
>
> If we then use the ptrace method to look at that podman process:
>
> root@number:~# strace -p 9150 -e nanosleep
> strace: Process 9150 attached
> nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> ^Cstrace: Process 9150 detached
> root@number:~#
>
> With some changes we can get something closer to the strace output,
> still in system wide mode:
>
> root@number:~# perf config trace.show_arg_names=false
> root@number:~# perf config trace.show_duration=false
> root@number:~# perf config trace.show_timestamp=false
> root@number:~# perf config trace.show_zeros=true
> root@number:~# perf config trace.args_alignment=0
> root@number:~# perf trace -e nanosleep --max-events=10
> podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> root@number:~#
> root@number:~# perf config
> trace.show_arg_names=false
> trace.show_duration=false
> trace.show_timestamp=false
> trace.show_zeros=true
> trace.args_alignment=0
> root@number:~# cat ~/.perfconfig
> # this file is auto-generated.
> [trace]
> show_arg_names = false
> show_duration = false
> show_timestamp = false
> show_zeros = true
> args_alignment = 0
> root@number:~#
>
> This will not get reused by any other syscall as nanosleep is the only
> one to have as its first argument a 'struct timespec" pointer argument
> passed from userspace to the kernel:
>
> root@number:~# grep timespec /sys/kernel/tracing/events/syscalls/sys_enter_*/format | grep offset:16
> /sys/kernel/tracing/events/syscalls/sys_enter_nanosleep/format: field:struct __kernel_timespec * rqtp; offset:16; size:8; signed:0;
> root@number:~#
>
> BTF based pretty printing will simplify all this, but then lets just get
> the low hanging fruits first.
>
> Cc: Adrian Hunter <[email protected]>
> Cc: Ian Rogers <[email protected]>
> Cc: Jiri Olsa <[email protected]>
> Cc: Namhyung Kim <[email protected]>
> Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>

v2? https://lore.kernel.org/lkml/[email protected]/

Thanks,
Ian

2024-02-21 17:21:54

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/1] perf trace: Collect sys_nanosleep first argument

On Wed, Feb 21, 2024 at 09:15:01AM -0800, Ian Rogers wrote:
> On Wed, Feb 21, 2024 at 9:04 AM Arnaldo Carvalho de Melo
> <[email protected]> wrote:
> >
> > That is a 'struct timespec' passed from userspace to the kernel as we
> > can see with a system wide syscall tracing:
> >
> > root@number:~# perf trace -e nanosleep
> > 0.000 (10.102 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 38.924 (10.077 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 100.177 (10.107 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 139.171 (10.063 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 200.603 (10.105 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 239.399 (10.064 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 300.994 (10.096 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 339.584 (10.067 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 401.335 (10.057 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 439.758 (10.166 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 501.814 (10.110 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 539.983 (10.227 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 602.284 (10.199 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 640.208 (10.105 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 702.662 (10.163 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 740.440 (10.107 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > 802.993 (10.159 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0
> > ^Croot@number:~# strace -p 9150 -e nanosleep
> >
> > If we then use the ptrace method to look at that podman process:
> >
> > root@number:~# strace -p 9150 -e nanosleep
> > strace: Process 9150 attached
> > nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> > nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> > nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> > nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> > nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> > nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> > nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
> > ^Cstrace: Process 9150 detached
> > root@number:~#
> >
> > With some changes we can get something closer to the strace output,
> > still in system wide mode:
> >
> > root@number:~# perf config trace.show_arg_names=false
> > root@number:~# perf config trace.show_duration=false
> > root@number:~# perf config trace.show_timestamp=false
> > root@number:~# perf config trace.show_zeros=true
> > root@number:~# perf config trace.args_alignment=0
> > root@number:~# perf trace -e nanosleep --max-events=10
> > podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0
> > root@number:~#
> > root@number:~# perf config
> > trace.show_arg_names=false
> > trace.show_duration=false
> > trace.show_timestamp=false
> > trace.show_zeros=true
> > trace.args_alignment=0
> > root@number:~# cat ~/.perfconfig
> > # this file is auto-generated.
> > [trace]
> > show_arg_names = false
> > show_duration = false
> > show_timestamp = false
> > show_zeros = true
> > args_alignment = 0
> > root@number:~#
> >
> > This will not get reused by any other syscall as nanosleep is the only
> > one to have as its first argument a 'struct timespec" pointer argument
> > passed from userspace to the kernel:
> >
> > root@number:~# grep timespec /sys/kernel/tracing/events/syscalls/sys_enter_*/format | grep offset:16
> > /sys/kernel/tracing/events/syscalls/sys_enter_nanosleep/format: field:struct __kernel_timespec * rqtp; offset:16; size:8; signed:0;
> > root@number:~#
> >
> > BTF based pretty printing will simplify all this, but then lets just get
> > the low hanging fruits first.
> >
> > Cc: Adrian Hunter <[email protected]>
> > Cc: Ian Rogers <[email protected]>
> > Cc: Jiri Olsa <[email protected]>
> > Cc: Namhyung Kim <[email protected]>
> > Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
>
> v2? https://lore.kernel.org/lkml/[email protected]/

Nope, I thought I hadn't submitted and when doing a

git rebase perf-tools-next/perf-tools-next


It was rebased, so I thought I really hadn't submitted it and sent it.

- Arnaldo