The -G/--cgroups option is to put sender and receiver in different
cgroups in order to measure cgroup context switch overheads.
Users need to make sure the cgroups exist and are accessible.
# perf stat -e context-switches,cgroup-switches \
> taskset -c 0 perf bench sched pipe -l 10000 > /dev/null
Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000':
20,001 context-switches
2 cgroup-switches
0.053449651 seconds time elapsed
0.011286000 seconds user
0.041869000 seconds sys
# perf stat -e context-switches,cgroup-switches \
> taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB > /dev/null
Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB':
20,001 context-switches
20,001 cgroup-switches
0.052768627 seconds time elapsed
0.006284000 seconds user
0.046266000 seconds sys
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/Documentation/perf-bench.txt | 19 ++++
tools/perf/bench/sched-pipe.c | 118 +++++++++++++++++++++++-
2 files changed, 134 insertions(+), 3 deletions(-)
diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
index ca5789625cd2..8331bd28b10e 100644
--- a/tools/perf/Documentation/perf-bench.txt
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -124,6 +124,14 @@ Options of *pipe*
--loop=::
Specify number of loops.
+-G::
+--cgroups=::
+Names of cgroups for sender and receiver, separated by a comma.
+This is useful to check cgroup context switching overhead.
+Note that perf doesn't create nor delete the cgroups, so users should
+make sure that the cgroups exist and are accessible before use.
+
+
Example of *pipe*
^^^^^^^^^^^^^^^^^
@@ -141,6 +149,17 @@ Example of *pipe*
Total time:0.016 sec
16.948000 usecs/op
59004 ops/sec
+
+% perf bench sched pipe -G AAA,BBB
+(executing 1000000 pipe operations between cgroups)
+# Running 'sched/pipe' benchmark:
+# Executed 1000000 pipe operations between two processes
+
+ Total time: 6.886 [sec]
+
+ 6.886208 usecs/op
+ 145217 ops/sec
+
---------------------
SUITES FOR 'syscall'
diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index a960e7a93aec..88d20a34adb2 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -10,7 +10,9 @@
* Ported to perf by Hitoshi Mitake <[email protected]>
*/
#include <subcmd/parse-options.h>
+#include <api/fs/fs.h>
#include "bench.h"
+#include "util/cgroup.h"
#include <unistd.h>
#include <stdio.h>
@@ -19,6 +21,7 @@
#include <sys/wait.h>
#include <string.h>
#include <errno.h>
+#include <fcntl.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/types.h>
@@ -31,6 +34,7 @@ struct thread_data {
int nr;
int pipe_read;
int pipe_write;
+ bool cgroup_failed;
pthread_t pthread;
};
@@ -40,9 +44,55 @@ static int loops = LOOPS_DEFAULT;
/* Use processes by default: */
static bool threaded;
+static struct cgroup *cgrp_send = NULL;
+static struct cgroup *cgrp_recv = NULL;
+
+static int parse_two_cgroups(const struct option *opt __maybe_unused,
+ const char *str, int unset __maybe_unused)
+{
+ char *p = strdup(str);
+ char *q;
+ int ret = -1;
+
+ if (p == NULL) {
+ fprintf(stderr, "memory allocation failure");
+ return -1;
+ }
+
+ q = strchr(p, ',');
+ if (q == NULL) {
+ fprintf(stderr, "it should have two cgroup names: %s", p);
+ goto out;
+ }
+ *q = '\0';
+
+ cgrp_send = cgroup__new(p, /*do_open=*/true);
+ if (cgrp_send == NULL) {
+ fprintf(stderr, "cannot open sender cgroup: %s", p);
+ goto out;
+ }
+
+ /* skip ',' */
+ q++;
+
+ cgrp_recv = cgroup__new(q, /*do_open=*/true);
+ if (cgrp_recv == NULL) {
+ fprintf(stderr, "cannot open receiver cgroup: %s", q);
+ goto out;
+ }
+ ret = 0;
+
+out:
+ free(p);
+ return ret;
+}
+
static const struct option options[] = {
OPT_INTEGER('l', "loop", &loops, "Specify number of loops"),
OPT_BOOLEAN('T', "threaded", &threaded, "Specify threads/process based task setup"),
+ OPT_CALLBACK('G', "cgroups", NULL, "SEND,RECV",
+ "Put sender and receivers in given cgroups",
+ parse_two_cgroups),
OPT_END()
};
@@ -51,12 +101,71 @@ static const char * const bench_sched_pipe_usage[] = {
NULL
};
+static int enter_cgroup(struct cgroup *cgrp)
+{
+ char buf[32];
+ int fd, len, ret;
+ pid_t pid;
+
+ if (cgrp == NULL)
+ return 0;
+
+ if (threaded)
+ pid = syscall(__NR_gettid);
+ else
+ pid = getpid();
+
+ snprintf(buf, sizeof(buf), "%d\n", pid);
+ len = strlen(buf);
+
+ /* try cgroup v2 interface first */
+ if (threaded)
+ fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
+ else
+ fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
+
+ /* try cgroup v1 if failed */
+ if (fd < 0)
+ fd = openat(cgrp->fd, "tasks", O_WRONLY);
+
+ if (fd < 0) {
+ char mnt[PATH_MAX];
+
+ printf("Failed to open cgroup file in %s\n", cgrp->name);
+
+ if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
+ printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
+ mnt, cgrp->name);
+ return -1;
+ }
+
+ ret = write(fd, buf, len);
+ close(fd);
+
+ if (ret != len) {
+ printf("Cannot enter to cgroup: %s\n", cgrp->name);
+ return -1;
+ }
+ return 0;
+}
+
static void *worker_thread(void *__tdata)
{
struct thread_data *td = __tdata;
int m = 0, i;
int ret;
+ if (td->nr)
+ ret = enter_cgroup(cgrp_send);
+ else
+ ret = enter_cgroup(cgrp_recv);
+
+ if (ret < 0) {
+ td->cgroup_failed = true;
+ return NULL;
+ }
+ td->cgroup_failed = false;
+
for (i = 0; i < loops; i++) {
if (!td->nr) {
ret = read(td->pipe_read, &m, sizeof(int));
@@ -112,9 +221,7 @@ int bench_sched_pipe(int argc, const char **argv)
}
}
-
if (threaded) {
-
for (t = 0; t < nr_threads; t++) {
td = threads + t;
@@ -128,7 +235,6 @@ int bench_sched_pipe(int argc, const char **argv)
ret = pthread_join(td->pthread, NULL);
BUG_ON(ret);
}
-
} else {
pid = fork();
assert(pid >= 0);
@@ -147,6 +253,12 @@ int bench_sched_pipe(int argc, const char **argv)
gettimeofday(&stop, NULL);
timersub(&stop, &start, &diff);
+ cgroup__put(cgrp_send);
+ cgroup__put(cgrp_recv);
+
+ if (threads[0].cgroup_failed || threads[1].cgroup_failed)
+ return 0;
+
switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
printf("# Executed %d pipe operations between two %s\n\n",
--
2.42.0.655.g421f12c284-goog
* Namhyung Kim <[email protected]> wrote:
> + /* try cgroup v2 interface first */
> + if (threaded)
> + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
> + else
> + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
> +
> + /* try cgroup v1 if failed */
> + if (fd < 0)
> + fd = openat(cgrp->fd, "tasks", O_WRONLY);
> +
> + if (fd < 0) {
> + char mnt[PATH_MAX];
> +
> + printf("Failed to open cgroup file in %s\n", cgrp->name);
> +
> + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
> + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
> + mnt, cgrp->name);
Ok, this works too I suppose.
Acked-by: Ingo Molnar <[email protected]>
Thanks,
Ingo
Em Mon, Oct 16, 2023 at 11:35:35AM +0200, Ingo Molnar escreveu:
>
>
> * Namhyung Kim <[email protected]> wrote:
>
> > + /* try cgroup v2 interface first */
> > + if (threaded)
> > + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
> > + else
> > + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
> > +
> > + /* try cgroup v1 if failed */
> > + if (fd < 0)
> > + fd = openat(cgrp->fd, "tasks", O_WRONLY);
> > +
> > + if (fd < 0) {
> > + char mnt[PATH_MAX];
> > +
> > + printf("Failed to open cgroup file in %s\n", cgrp->name);
> > +
> > + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
> > + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
> > + mnt, cgrp->name);
>
> Ok, this works too I suppose.
>
> Acked-by: Ingo Molnar <[email protected]>
I'm not getting that:
[root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
no access to cgroup /sys/fs/cgroup/AAA
cannot open sender cgroup: AAA
Usage: perf bench sched pipe <options>
-G, --cgroups <SEND,RECV>
Put sender and receivers in given cgroups
[root@five ~]#
It's better now as it bails out, but it is not emitting any message that
helps with running the test, well, there is that /sys/fs/cgroup/AAA
path, lemme try doing a mkdir:
[root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
no access to cgroup /sys/fs/cgroup/BBB
cannot open receiver cgroup: BBB
Usage: perf bench sched pipe <options>
-G, --cgroups <SEND,RECV>
Put sender and receivers in given cgroups
[root@five ~]#
[root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
[root@five ~]#
It seems to be bailing out but doesn't run the test or emit any
warning.
I'm using v3. I'll try to debug it a bit.
- Arnaldo
Em Mon, Oct 16, 2023 at 12:38:12PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Oct 16, 2023 at 11:35:35AM +0200, Ingo Molnar escreveu:
> >
> >
> > * Namhyung Kim <[email protected]> wrote:
> >
> > > + /* try cgroup v2 interface first */
> > > + if (threaded)
> > > + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
> > > + else
> > > + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
> > > +
> > > + /* try cgroup v1 if failed */
> > > + if (fd < 0)
> > > + fd = openat(cgrp->fd, "tasks", O_WRONLY);
> > > +
> > > + if (fd < 0) {
> > > + char mnt[PATH_MAX];
> > > +
> > > + printf("Failed to open cgroup file in %s\n", cgrp->name);
> > > +
> > > + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
> > > + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
> > > + mnt, cgrp->name);
> >
> > Ok, this works too I suppose.
> >
> > Acked-by: Ingo Molnar <[email protected]>
>
> I'm not getting that:
>
> [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> # Running 'sched/pipe' benchmark:
> no access to cgroup /sys/fs/cgroup/AAA
> cannot open sender cgroup: AAA
> Usage: perf bench sched pipe <options>
>
> -G, --cgroups <SEND,RECV>
> Put sender and receivers in given cgroups
> [root@five ~]#
>
> Its better now as it bails out, but it is not emitting any message that
> helps with running the test, well, there is that /sys/fs/cgroup/AAA
> path, lemme try doing a mkdir:
>
> [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> # Running 'sched/pipe' benchmark:
> no access to cgroup /sys/fs/cgroup/BBB
> cannot open receiver cgroup: BBB
> Usage: perf bench sched pipe <options>
>
> -G, --cgroups <SEND,RECV>
> Put sender and receivers in given cgroups
> [root@five ~]#
>
> [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> # Running 'sched/pipe' benchmark:
> [root@five ~]#
>
> It seems to be bailing out but doesn't run the test nor emits any
> warning.
(gdb) run bench sched pipe -l 10000
Starting program: /root/bin/perf bench sched pipe -l 10000
# Running 'sched/pipe' benchmark:
[Detaching after fork from child process 33618]
Breakpoint 1, bench_sched_pipe (argc=0, argv=0x7fffffffe3d8) at bench/sched-pipe.c:259
259 if (threads[0].cgroup_failed || threads[1].cgroup_failed)
(gdb) p threads[0].cgroup_failed
$1 = 137
(gdb) p threads[1].cgroup_failed
$2 = false
(gdb) n
260 return 0;
(gdb)
But I'm not even using cgroups?
- Arnaldo
Em Mon, Oct 16, 2023 at 12:45:17PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Oct 16, 2023 at 12:38:12PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Mon, Oct 16, 2023 at 11:35:35AM +0200, Ingo Molnar escreveu:
> > > * Namhyung Kim <[email protected]> wrote:
> > >
> > > > + /* try cgroup v2 interface first */
> > > > + if (threaded)
> > > > + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
> > > > + else
> > > > + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
> > > > +
> > > > + /* try cgroup v1 if failed */
> > > > + if (fd < 0)
> > > > + fd = openat(cgrp->fd, "tasks", O_WRONLY);
> > > > +
> > > > + if (fd < 0) {
> > > > + char mnt[PATH_MAX];
> > > > +
> > > > + printf("Failed to open cgroup file in %s\n", cgrp->name);
> > > > +
> > > > + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
> > > > + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
> > > > + mnt, cgrp->name);
> > >
> > > Ok, this works too I suppose.
> > >
> > > Acked-by: Ingo Molnar <[email protected]>
> >
> > I'm not getting that:
> >
> > [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> > # Running 'sched/pipe' benchmark:
> > no access to cgroup /sys/fs/cgroup/AAA
> > cannot open sender cgroup: AAA
> > Usage: perf bench sched pipe <options>
> >
> > -G, --cgroups <SEND,RECV>
> > Put sender and receivers in given cgroups
> > [root@five ~]#
> >
> > Its better now as it bails out, but it is not emitting any message that
> > helps with running the test, well, there is that /sys/fs/cgroup/AAA
> > path, lemme try doing a mkdir:
> >
> > [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> > # Running 'sched/pipe' benchmark:
> > no access to cgroup /sys/fs/cgroup/BBB
> > cannot open receiver cgroup: BBB
> > Usage: perf bench sched pipe <options>
> >
> > -G, --cgroups <SEND,RECV>
> > Put sender and receivers in given cgroups
> > [root@five ~]#
> >
> > [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> > # Running 'sched/pipe' benchmark:
> > [root@five ~]#
> >
> > It seems to be bailing out but doesn't run the test nor emits any
> > warning.
>
> (gdb) run bench sched pipe -l 10000
> Starting program: /root/bin/perf bench sched pipe -l 10000
> # Running 'sched/pipe' benchmark:
> [Detaching after fork from child process 33618]
>
> Breakpoint 1, bench_sched_pipe (argc=0, argv=0x7fffffffe3d8) at bench/sched-pipe.c:259
> 259 if (threads[0].cgroup_failed || threads[1].cgroup_failed)
> (gdb) p threads[0].cgroup_failed
> $1 = 137
> (gdb) p threads[1].cgroup_failed
> $2 = false
> (gdb) n
> 260 return 0;
> (gdb)
>
> But I'm not even using cgroups?
So, with the patch below 'perf bench sched pipe -l 1000' is back working
for me:
[root@five ~]# perf bench sched pipe -l 1000
# Running 'sched/pipe' benchmark:
# Executed 1000 pipe operations between two processes
Total time: 0.007 [sec]
7.671000 usecs/op
130361 ops/sec
[root@five ~]#
Now back at testing with cgroups.
- Arnaldo
Em Mon, Oct 16, 2023 at 12:51:52PM -0300, Arnaldo Carvalho de Melo escreveu:
> Now back at testing with with cgroups.
Humm, even without the -G I get:
[root@five ~]# perf stat -e context-switches,cgroup-switches perf bench sched pipe -l 10000
# Running 'sched/pipe' benchmark:
# Executed 10000 pipe operations between two processes
Total time: 0.082 [sec]
8.246400 usecs/op
121265 ops/sec
Performance counter stats for 'perf bench sched pipe -l 10000':
20,002 context-switches
20,002 cgroup-switches
0.091228041 seconds time elapsed
0.007122000 seconds user
0.087700000 seconds sys
[root@five ~]#
- Arnaldo
> On 16-Oct-2023, at 10:12 AM, Namhyung Kim <[email protected]> wrote:
>
> The -G/--cgroups option is to put sender and receiver in different
> cgroups in order to measure cgroup context switch overheads.
>
> Users need to make sure the cgroups exist and accessible.
>
> # perf stat -e context-switches,cgroup-switches \
>> taskset -c 0 perf bench sched pipe -l 10000 > /dev/null
>
> Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000':
>
> 20,001 context-switches
> 2 cgroup-switches
>
> 0.053449651 seconds time elapsed
>
> 0.011286000 seconds user
> 0.041869000 seconds sys
>
> # perf stat -e context-switches,cgroup-switches \
>> taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB > /dev/null
>
> Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB':
>
> 20,001 context-switches
> 20,001 cgroup-switches
>
> 0.052768627 seconds time elapsed
>
> 0.006284000 seconds user
> 0.046266000 seconds sys
>
> Signed-off-by: Namhyung Kim <[email protected]>
Hi Namhyung,
I tried V3 on top of perf-tools-next
# ./perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB > /dev/null
no access to cgroup /sys/fs/cgroup/perf_event/AAA
cannot open sender cgroup: AAA
Usage: perf bench sched pipe <options>
-G, --cgroups <SEND,RECV>
Put sender and receivers in given cgroups
Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB':
2 context-switches
0 cgroup-switches
0.007291460 seconds time elapsed
0.007438000 seconds user
0.000000000 seconds sys
I created AAA and BBB
mkdir /sys/fs/cgroup/perf_event/AAA
mkdir /sys/fs/cgroup/perf_event/BBB
Got the results below:
./perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB > /dev/null
Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB':
20002 context-switches
19999 cgroup-switches
0.120063986 seconds time elapsed
0.001716000 seconds user
0.065995000 seconds sys
Thanks
Athira
> ---
> tools/perf/Documentation/perf-bench.txt | 19 ++++
> tools/perf/bench/sched-pipe.c | 118 +++++++++++++++++++++++-
> 2 files changed, 134 insertions(+), 3 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
> index ca5789625cd2..8331bd28b10e 100644
> --- a/tools/perf/Documentation/perf-bench.txt
> +++ b/tools/perf/Documentation/perf-bench.txt
> @@ -124,6 +124,14 @@ Options of *pipe*
> --loop=::
> Specify number of loops.
>
> +-G::
> +--cgroups=::
> +Names of cgroups for sender and receiver, separated by a comma.
> +This is useful to check cgroup context switching overhead.
> +Note that perf doesn't create nor delete the cgroups, so users should
> +make sure that the cgroups exist and are accessible before use.
> +
> +
> Example of *pipe*
> ^^^^^^^^^^^^^^^^^
>
> @@ -141,6 +149,17 @@ Example of *pipe*
> Total time:0.016 sec
> 16.948000 usecs/op
> 59004 ops/sec
> +
> +% perf bench sched pipe -G AAA,BBB
> +(executing 1000000 pipe operations between cgroups)
> +# Running 'sched/pipe' benchmark:
> +# Executed 1000000 pipe operations between two processes
> +
> + Total time: 6.886 [sec]
> +
> + 6.886208 usecs/op
> + 145217 ops/sec
> +
> ---------------------
>
> SUITES FOR 'syscall'
> diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
> index a960e7a93aec..88d20a34adb2 100644
> --- a/tools/perf/bench/sched-pipe.c
> +++ b/tools/perf/bench/sched-pipe.c
> @@ -10,7 +10,9 @@
> * Ported to perf by Hitoshi Mitake <[email protected]>
> */
> #include <subcmd/parse-options.h>
> +#include <api/fs/fs.h>
> #include "bench.h"
> +#include "util/cgroup.h"
>
> #include <unistd.h>
> #include <stdio.h>
> @@ -19,6 +21,7 @@
> #include <sys/wait.h>
> #include <string.h>
> #include <errno.h>
> +#include <fcntl.h>
> #include <assert.h>
> #include <sys/time.h>
> #include <sys/types.h>
> @@ -31,6 +34,7 @@ struct thread_data {
> int nr;
> int pipe_read;
> int pipe_write;
> + bool cgroup_failed;
> pthread_t pthread;
> };
>
> @@ -40,9 +44,55 @@ static int loops = LOOPS_DEFAULT;
> /* Use processes by default: */
> static bool threaded;
>
> +static struct cgroup *cgrp_send = NULL;
> +static struct cgroup *cgrp_recv = NULL;
> +
> +static int parse_two_cgroups(const struct option *opt __maybe_unused,
> + const char *str, int unset __maybe_unused)
> +{
> + char *p = strdup(str);
> + char *q;
> + int ret = -1;
> +
> + if (p == NULL) {
> + fprintf(stderr, "memory allocation failure");
> + return -1;
> + }
> +
> + q = strchr(p, ',');
> + if (q == NULL) {
> + fprintf(stderr, "it should have two cgroup names: %s", p);
> + goto out;
> + }
> + *q = '\0';
> +
> + cgrp_send = cgroup__new(p, /*do_open=*/true);
> + if (cgrp_send == NULL) {
> + fprintf(stderr, "cannot open sender cgroup: %s", p);
> + goto out;
> + }
> +
> + /* skip ',' */
> + q++;
> +
> + cgrp_recv = cgroup__new(q, /*do_open=*/true);
> + if (cgrp_recv == NULL) {
> + fprintf(stderr, "cannot open receiver cgroup: %s", q);
> + goto out;
> + }
> + ret = 0;
> +
> +out:
> + free(p);
> + return ret;
> +}
> +
> static const struct option options[] = {
> OPT_INTEGER('l', "loop", &loops, "Specify number of loops"),
> OPT_BOOLEAN('T', "threaded", &threaded, "Specify threads/process based task setup"),
> + OPT_CALLBACK('G', "cgroups", NULL, "SEND,RECV",
> + "Put sender and receivers in given cgroups",
> + parse_two_cgroups),
> OPT_END()
> };
>
> @@ -51,12 +101,71 @@ static const char * const bench_sched_pipe_usage[] = {
> NULL
> };
>
> +static int enter_cgroup(struct cgroup *cgrp)
> +{
> + char buf[32];
> + int fd, len, ret;
> + pid_t pid;
> +
> + if (cgrp == NULL)
> + return 0;
> +
> + if (threaded)
> + pid = syscall(__NR_gettid);
> + else
> + pid = getpid();
> +
> + snprintf(buf, sizeof(buf), "%d\n", pid);
> + len = strlen(buf);
> +
> + /* try cgroup v2 interface first */
> + if (threaded)
> + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
> + else
> + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
> +
> + /* try cgroup v1 if failed */
> + if (fd < 0)
> + fd = openat(cgrp->fd, "tasks", O_WRONLY);
> +
> + if (fd < 0) {
> + char mnt[PATH_MAX];
> +
> + printf("Failed to open cgroup file in %s\n", cgrp->name);
> +
> + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
> + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
> + mnt, cgrp->name);
> + return -1;
> + }
> +
> + ret = write(fd, buf, len);
> + close(fd);
> +
> + if (ret != len) {
> + printf("Cannot enter to cgroup: %s\n", cgrp->name);
> + return -1;
> + }
> + return 0;
> +}
> +
> static void *worker_thread(void *__tdata)
> {
> struct thread_data *td = __tdata;
> int m = 0, i;
> int ret;
>
> + if (td->nr)
> + ret = enter_cgroup(cgrp_send);
> + else
> + ret = enter_cgroup(cgrp_recv);
> +
> + if (ret < 0) {
> + td->cgroup_failed = true;
> + return NULL;
> + }
> + td->cgroup_failed = false;
> +
> for (i = 0; i < loops; i++) {
> if (!td->nr) {
> ret = read(td->pipe_read, &m, sizeof(int));
> @@ -112,9 +221,7 @@ int bench_sched_pipe(int argc, const char **argv)
> }
> }
>
> -
> if (threaded) {
> -
> for (t = 0; t < nr_threads; t++) {
> td = threads + t;
>
> @@ -128,7 +235,6 @@ int bench_sched_pipe(int argc, const char **argv)
> ret = pthread_join(td->pthread, NULL);
> BUG_ON(ret);
> }
> -
> } else {
> pid = fork();
> assert(pid >= 0);
> @@ -147,6 +253,12 @@ int bench_sched_pipe(int argc, const char **argv)
> gettimeofday(&stop, NULL);
> timersub(&stop, &start, &diff);
>
> + cgroup__put(cgrp_send);
> + cgroup__put(cgrp_recv);
> +
> + if (threads[0].cgroup_failed || threads[1].cgroup_failed)
> + return 0;
> +
> switch (bench_format) {
> case BENCH_FORMAT_DEFAULT:
> printf("# Executed %d pipe operations between two %s\n\n",
> --
> 2.42.0.655.g421f12c284-goog
>
Em Mon, Oct 16, 2023 at 12:55:33PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Oct 16, 2023 at 12:51:52PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Now back at testing with with cgroups.
> Humm, even without the -G I get:
> [root@five ~]# perf stat -e context-switches,cgroup-switches perf bench sched pipe -l 10000
> # Running 'sched/pipe' benchmark:
> # Executed 10000 pipe operations between two processes
> Total time: 0.082 [sec]
> 8.246400 usecs/op
> 121265 ops/sec
> Performance counter stats for 'perf bench sched pipe -l 10000':
> 20,002 context-switches
> 20,002 cgroup-switches
Same number, but then I forgot to add the 'taskset -c 0' part of the
command line, if I have it:
[root@five ~]# perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000
# Running 'sched/pipe' benchmark:
# Executed 10000 pipe operations between two processes
Total time: 0.072 [sec]
7.231500 usecs/op
138283 ops/sec
Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000':
20,002 context-switches
3 cgroup-switches
0.082855113 seconds time elapsed
0.007765000 seconds user
0.074020000 seconds sys
[root@five ~]# perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
# Executed 10000 pipe operations between two processes
Total time: 0.093 [sec]
9.341800 usecs/op
107045 ops/sec
Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB':
20,004 context-switches
20,003 cgroup-switches
0.103436330 seconds time elapsed
0.018109000 seconds user
0.063058000 seconds sys
[root@five ~]#
I.e. it works as in your results, but can you please spell out why that
'taskset -c 0' is needed to get these results?
I wasn't expecting the same number of cgroup-switches when not using
'taskset -c 0' :-\
- Arnaldo
On Mon, Oct 16, 2023 at 1:35 PM Arnaldo Carvalho de Melo
<[email protected]> wrote:
>
> Em Mon, Oct 16, 2023 at 12:55:33PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Mon, Oct 16, 2023 at 12:51:52PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Now back at testing with with cgroups.
>
> > Humm, even without the -G I get:
>
> > [root@five ~]# perf stat -e context-switches,cgroup-switches perf bench sched pipe -l 10000
> > # Running 'sched/pipe' benchmark:
> > # Executed 10000 pipe operations between two processes
>
> > Total time: 0.082 [sec]
>
> > 8.246400 usecs/op
> > 121265 ops/sec
>
> > Performance counter stats for 'perf bench sched pipe -l 10000':
>
> > 20,002 context-switches
> > 20,002 cgroup-switches
>
> Same number, but then I forgot to add the 'taskset -c 0' part of the
> command line, if I have it:
>
> [root@five ~]# perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000
> # Running 'sched/pipe' benchmark:
> # Executed 10000 pipe operations between two processes
>
> Total time: 0.072 [sec]
>
> 7.231500 usecs/op
> 138283 ops/sec
>
> Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000':
>
> 20,002 context-switches
> 3 cgroup-switches
>
> 0.082855113 seconds time elapsed
>
> 0.007765000 seconds user
> 0.074020000 seconds sys
>
>
> [root@five ~]# perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB
> # Running 'sched/pipe' benchmark:
> # Executed 10000 pipe operations between two processes
>
> Total time: 0.093 [sec]
>
> 9.341800 usecs/op
> 107045 ops/sec
>
> Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB':
>
> 20,004 context-switches
> 20,003 cgroup-switches
>
> 0.103436330 seconds time elapsed
>
> 0.018109000 seconds user
> 0.063058000 seconds sys
>
>
> [root@five ~]#
>
> I.e. it works as in your results, but can you please spell out why that
> 'taskset -c 0' is needed to get these results?
>
> I wasn't expecting the same number of cgroup-switches when not using
> 'taskset -c 0' :-\
Without taskset, each task is likely to run on different CPUs
and other tasks (including idle) on that CPU would be in
different cgroup so it'll create cgroup switches every time
if they run in AAA or BBB.
With taskset, both sender and receiver would run on the
same CPU. So it'd see the impact of cgroup switches
with this option.
Thanks,
Namhyung
> On 16-Oct-2023, at 9:08 PM, Arnaldo Carvalho de Melo <[email protected]> wrote:
>
> Em Mon, Oct 16, 2023 at 11:35:35AM +0200, Ingo Molnar escreveu:
>>
>>
>> * Namhyung Kim <[email protected]> wrote:
>>
>>> + /* try cgroup v2 interface first */
>>> + if (threaded)
>>> + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
>>> + else
>>> + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
>>> +
>>> + /* try cgroup v1 if failed */
>>> + if (fd < 0)
>>> + fd = openat(cgrp->fd, "tasks", O_WRONLY);
>>> +
>>> + if (fd < 0) {
>>> + char mnt[PATH_MAX];
>>> +
>>> + printf("Failed to open cgroup file in %s\n", cgrp->name);
>>> +
>>> + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
>>> + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
>>> + mnt, cgrp->name);
>>
>> Ok, this works too I suppose.
>>
>> Acked-by: Ingo Molnar <[email protected]>
>
> I'm not getting that:
>
> [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> # Running 'sched/pipe' benchmark:
> no access to cgroup /sys/fs/cgroup/AAA
> cannot open sender cgroup: AAA
> Usage: perf bench sched pipe <options>
>
> -G, --cgroups <SEND,RECV>
> Put sender and receivers in given cgroups
> [root@five ~]#
>
> Its better now as it bails out, but it is not emitting any message that
> helps with running the test, well, there is that /sys/fs/cgroup/AAA
> path, lemme try doing a mkdir:
>
> [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> # Running 'sched/pipe' benchmark:
> no access to cgroup /sys/fs/cgroup/BBB
> cannot open receiver cgroup: BBB
> Usage: perf bench sched pipe <options>
>
> -G, --cgroups <SEND,RECV>
> Put sender and receivers in given cgroups
> [root@five ~]#
>
> [root@five ~]# perf bench sched pipe -l 10000 -G AAA,BBB
> # Running 'sched/pipe' benchmark:
> [root@five ~]#
>
> It seems to be bailing out but doesn't run the test nor emits any
> warning.
In the “parse_two_cgroups” function itself it checks for :
cgrp_send = cgroup__new(p, /*do_open=*/true);
if (cgrp_send == NULL) {
fprintf(stderr, "cannot open sender cgroup: %s", p);
goto out;
}
And we fail here since the cgroup is not created. Maybe we can add the hint or warning here?
Thanks
Athira
>
> I'm using v3. I'll try to debug it a bit.
>
> - Arnaldo
* Arnaldo Carvalho de Melo <[email protected]> wrote:
> Em Mon, Oct 16, 2023 at 12:55:33PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Mon, Oct 16, 2023 at 12:51:52PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Now back at testing with with cgroups.
>
> > Humm, even without the -G I get:
>
> > [root@five ~]# perf stat -e context-switches,cgroup-switches perf bench sched pipe -l 10000
> > # Running 'sched/pipe' benchmark:
> > # Executed 10000 pipe operations between two processes
>
> > Total time: 0.082 [sec]
>
> > 8.246400 usecs/op
> > 121265 ops/sec
>
> > Performance counter stats for 'perf bench sched pipe -l 10000':
>
> > 20,002 context-switches
> > 20,002 cgroup-switches
>
> Same number, but then I forgot to add the 'taskset -c 0' part of the
> command line, if I have it:
Side note: it might make sense to add a sane cpumask/affinity setting
option to perf stat itself:
perf stat --cpumask
... or so?
We do have -C:
-C, --cpu <cpu> list of cpus to monitor in system-wide
... but that's limited to --all-cpus, right?
Perhaps we could extend --cpu to non-system-wide runs too?
Thanks,
Ingo
Em Mon, Oct 16, 2023 at 02:44:23PM -0700, Namhyung Kim escreveu:
> On Mon, Oct 16, 2023 at 1:35 PM Arnaldo Carvalho de Melo <[email protected]> wrote:
> > Em Mon, Oct 16, 2023 at 12:55:33PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Em Mon, Oct 16, 2023 at 12:51:52PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > > Now back at testing with with cgroups.
> >
> > > Humm, even without the -G I get:
> >
> > > [root@five ~]# perf stat -e context-switches,cgroup-switches perf bench sched pipe -l 10000
> > > # Running 'sched/pipe' benchmark:
> > > # Executed 10000 pipe operations between two processes
> >
> > > Total time: 0.082 [sec]
> >
> > > 8.246400 usecs/op
> > > 121265 ops/sec
> >
> > > Performance counter stats for 'perf bench sched pipe -l 10000':
> >
> > > 20,002 context-switches
> > > 20,002 cgroup-switches
> >
> > Same number, but then I forgot to add the 'taskset -c 0' part of the
> > command line, if I have it:
> >
> > [root@five ~]# perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000
> > # Running 'sched/pipe' benchmark:
> > # Executed 10000 pipe operations between two processes
> >
> > Total time: 0.072 [sec]
> >
> > 7.231500 usecs/op
> > 138283 ops/sec
> >
> > Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000':
> >
> > 20,002 context-switches
> > 3 cgroup-switches
> >
> > 0.082855113 seconds time elapsed
> >
> > 0.007765000 seconds user
> > 0.074020000 seconds sys
> >
> >
> > [root@five ~]# perf stat -e context-switches,cgroup-switches taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB
> > # Running 'sched/pipe' benchmark:
> > # Executed 10000 pipe operations between two processes
> >
> > Total time: 0.093 [sec]
> >
> > 9.341800 usecs/op
> > 107045 ops/sec
> >
> > Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000 -G AAA,BBB':
> >
> > 20,004 context-switches
> > 20,003 cgroup-switches
> >
> > 0.103436330 seconds time elapsed
> >
> > 0.018109000 seconds user
> > 0.063058000 seconds sys
> >
> >
> > [root@five ~]#
> >
> > I.e. it works as in your results, but can you please spell out why that
> > 'taskset -c 0' is needed to get these results?
> >
> > I wasn't expecting the same number of cgroup-switches when not using
> > 'taskset -c 0' :-\
>
> Without taskset, each task is likely to run on different CPUs,
> and the other tasks (including idle) on those CPUs would be in
> a different cgroup, so it'll create a cgroup switch every time
> if they run in AAA or BBB.
Sure, and the 'perf stat' is counting _just_ that workload and its
children (no -a).
Can you please add this to the cset commit log message? I.e. describe
the test setup thoroughly to help in reviewing and for us to quickly
understand what is being tested, the purpose of the test and how the
results match our expectations.
This will be specially helpful in the future, when we do bisects, try to
understand why changes were made, etc.
Even in the man page this information would be useful in helping users
to understand the purpose of the 'perf bench' -G option. So I think it's
better to have it there instead of in the cset commit log message.
> With taskset, both sender and receiver would run on the
> same CPU. So it'd see the impact of cgroup switches
> with this option.
Thanks!
- Arnaldo
Em Tue, Oct 17, 2023 at 01:40:07PM +0200, Ingo Molnar escreveu:
> Side note: it might make sense to add a sane cpumask/affinity setting
> option to perf stat itself:
>
> perf stat --cpumask
>
> ... or so?
>
> We do have -C:
>
> -C, --cpu <cpu> list of cpus to monitor in system-wide
>
> ... but that's limited to --all-cpus, right?
>
> Perhaps we could extend --cpu to non-system-wide runs too?
Maybe I misunderstood your question, but it's a list of cpus to limit the
counting:
On a mostly idle system (some browsers, etc):
[root@five ~]# perf stat -C 0,2 -e cycles -I 1000
# time counts unit events
1.001012960 207,999,675 cycles
2.002152464 157,058,633 cycles
3.002985969 174,590,102 cycles
4.003411871 216,250,416 cycles
5.004392310 180,537,857 cycles
6.005387846 171,036,571 cycles
7.006386564 156,461,753 cycles
8.007532366 158,010,466 cycles
9.008682339 164,971,366 cycles
^C 9.377946210 77,242,809 cycles
[root@five ~]#
Then:
[root@five ~]# perf stat -C 0 -e cycles -I 1000
# time counts unit events
1.001019469 69,833,637 cycles
2.002133490 111,297,731 cycles
3.003225211 90,416,063 cycles
4.003663853 34,189,884 cycles
5.004689751 34,583,822 cycles
6.005659918 33,284,110 cycles
7.006660396 62,080,246 cycles
^C 7.229236075 23,250,207 cycles
[root@five ~]#
But:
[root@five ~]# taskset -c 0 stress-ng --cpu 32 &
[1] 9859
[root@five ~]# stress-ng: info: [9859] defaulting to a 1 day, 0 secs run per stressor
stress-ng: info: [9859] dispatching hogs: 32 cpu
[root@five ~]#
[root@five ~]# perf stat -C 0,2 -e cycles -I 1000
# time counts unit events
1.001024379 4,838,680,041 cycles
2.008891551 4,849,936,963 cycles
3.017168975 4,835,710,170 cycles
4.025437789 4,847,294,589 cycles
5.033239780 4,825,463,385 cycles
6.039332959 4,834,989,373 cycles
^C 6.067478756 125,338,359 cycles
[root@five ~]# perf stat -C 2 -e cycles -I 1000
# time counts unit events
1.000215845 21,244,609 cycles
2.001216573 51,337,887 cycles
3.002278103 49,421,924 cycles
4.003339432 33,270,235 cycles
^C 4.338990744 14,178,759 cycles
[root@five ~]# perf stat -C 0 -e cycles -I 1000
# time counts unit events
1.000801562 4,767,090,700 cycles
2.001800540 4,761,384,154 cycles
3.002801468 4,768,816,073 cycles
^C 3.313349213 1,479,254,494 cycles
[root@five ~]#
If we try to specify a pid and cpu:
[root@five ~]# taskset -c 0 sleep 100m &
[2] 9964
[root@five ~]#
[root@five ~]# perf stat -C 0 -p 9964 -e cycles -I 1000
PID/TID switch overriding CPU
# time counts unit events
1.000929383 <not counted> cycles
2.001933839 <not counted> cycles
3.002927605 <not counted> cycles
4.003983793 <not counted> cycles
5.005051180 <not counted> cycles
6.006123168 <not counted> cycles
7.007182796 <not counted> cycles
8.008261274 <not counted> cycles
9.009324991 <not counted> cycles
^C 9.454324736 <not counted> cycles
[root@five ~]#
[root@five ~]# pidof stress-ng
9891 9890 9889 9888 9887 9886 9885 9884 9883 9882 9881 9880 9879 9878 9877 9876 9875 9874 9873 9872 9871 9870 9869 9868 9867 9866 9865 9864 9863 9862 9861 9860 9859
[root@five ~]# perf stat -C 0 -p 9860 -e cycles -I 1000
PID/TID switch overriding CPU
# time counts unit events
1.001045336 144,691,886 cycles
2.002170624 134,088,343 cycles
3.003257911 149,148,823 cycles
^C 3.301585761 40,468,152 cycles
[root@five ~]#
Do you want to profile some specific PID only when it runs on some
specific CPU?
That should work, as per man perf_event_open:
pid == 0 and cpu >= 0
This measures the calling process/thread only when running on the specified CPU.
But, as we saw above, tooling is preventing us from doing that :-\
- Arnaldo
* Arnaldo Carvalho de Melo <[email protected]> wrote:
> Em Tue, Oct 17, 2023 at 01:40:07PM +0200, Ingo Molnar escreveu:
> > Side note: it might make sense to add a sane cpumask/affinity setting
> > option to perf stat itself:
> >
> > perf stat --cpumask
> >
> > ... or so?
> >
> > We do have -C:
> >
> > -C, --cpu <cpu> list of cpus to monitor in system-wide
> >
> > ... but that's limited to --all-cpus, right?
> >
> > Perhaps we could extend --cpu to non-system-wide runs too?
>
> Maybe I misunderstood your question, but its a list of cpus to limit the
> counting:
Ok.
So I thought that "--cpumask mask/list/etc" should simply do what 'taskset'
is doing: using the sched_setaffinity() syscall to bind the current
workload and all its children to the given CPUs.
There's impact on perf stat itself: it could just call sched_setaffinity()
early on, and not bother about it?
Having it built-in into perf would simply make it easier to not forget
running 'taskset'. :-)
Thanks,
Ingo
Em Tue, Oct 17, 2023 at 02:43:45PM +0200, Ingo Molnar escreveu:
> * Arnaldo Carvalho de Melo <[email protected]> wrote:
> > Em Tue, Oct 17, 2023 at 01:40:07PM +0200, Ingo Molnar escreveu:
> > > Side note: it might make sense to add a sane cpumask/affinity setting
> > > option to perf stat itself:
> > > perf stat --cpumask
> > > ... or so?
> > > We do have -C:
> > > -C, --cpu <cpu> list of cpus to monitor in system-wide
> > > ... but that's limited to --all-cpus, right?
> > > Perhaps we could extend --cpu to non-system-wide runs too?
> > Maybe I misunderstood your question, but its a list of cpus to limit the
> > counting:
> Ok.
> So I thought that "--cpumask mask/list/etc" should simply do what 'taskset'
> is doing: using the sched_setaffinity() syscall to make the current
> workload and all its children.
> There's impact on perf stat itself: it could just call sched_setaffinity()
> early on, and not bother about it?
> Having it built-in into perf would simply make it easier to not forget
> running 'taskset'. :-)
Would that be the only advantage?
I think using taskset isn't that much of a burden and keeps with the
Unix tradition, no? :-\
See, using 'perf record -C', i.e. sampling, will use sched_setaffinity,
and in that case there is a clear advantage... wait, this train of
thought made me remember something, but its just about counter setup,
not about the workload:
[acme@five perf-tools-next]$ grep affinity__set tools/perf/*.c
tools/perf/builtin-stat.c: else if (affinity__setup(&saved_affinity) < 0)
tools/perf/builtin-stat.c: if (affinity__setup(&saved_affinity) < 0)
[acme@five perf-tools-next]$
/*
* perf_event_open does an IPI internally to the target CPU.
* It is more efficient to change perf's affinity to the target
* CPU and then set up all events on that CPU, so we amortize
* CPU communication.
*/
void affinity__set(struct affinity *a, int cpu)
[root@five ~]# perf trace --summary -e sched_setaffinity perf stat -e cycles -a sleep 1
Performance counter stats for 'system wide':
6,319,186,681 cycles
1.002665795 seconds time elapsed
Summary of events:
perf (24307), 396 events, 87.4%
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
sched_setaffinity 198 0 4.544 0.006 0.023 0.042 2.30%
[root@five ~]#
[root@five ~]# perf trace --summary -e sched_setaffinity perf stat -C 1 -e cycles -a sleep 1
Performance counter stats for 'system wide':
105,311,506 cycles
1.001203282 seconds time elapsed
Summary of events:
perf (24633), 24 events, 29.6%
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
sched_setaffinity 12 0 0.105 0.005 0.009 0.039 32.07%
[root@five ~]# perf trace --summary -e sched_setaffinity perf stat -C 1,2 -e cycles -a sleep 1
Performance counter stats for 'system wide':
131,474,375 cycles
1.001324346 seconds time elapsed
Summary of events:
perf (24636), 36 events, 38.7%
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
sched_setaffinity 18 0 0.442 0.000 0.025 0.093 24.75%
[root@five ~]# perf trace --summary -e sched_setaffinity perf stat -C 1,2,30 -e cycles -a sleep 1
Performance counter stats for 'system wide':
191,674,889 cycles
1.001280015 seconds time elapsed
Summary of events:
perf (24639), 48 events, 45.7%
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
sched_setaffinity 24 0 0.835 0.000 0.035 0.144 24.40%
[root@five ~]#
Too much affinity setting :-)
- Arnaldo
On Tue, Oct 17, 2023 at 11:31 AM Arnaldo Carvalho de Melo
<[email protected]> wrote:
>
> Em Tue, Oct 17, 2023 at 02:43:45PM +0200, Ingo Molnar escreveu:
> > * Arnaldo Carvalho de Melo <[email protected]> wrote:
> > > Em Tue, Oct 17, 2023 at 01:40:07PM +0200, Ingo Molnar escreveu:
> > > > Side note: it might make sense to add a sane cpumask/affinity setting
> > > > option to perf stat itself:
>
> > > > perf stat --cpumask
>
> > > > ... or so?
>
> > > > We do have -C:
>
> > > > -C, --cpu <cpu> list of cpus to monitor in system-wide
>
> > > > ... but that's limited to --all-cpus, right?
>
> > > > Perhaps we could extend --cpu to non-system-wide runs too?
>
> > > Maybe I misunderstood your question, but its a list of cpus to limit the
> > > counting:
>
> > Ok.
>
> > So I thought that "--cpumask mask/list/etc" should simply do what 'taskset'
> > is doing: using the sched_setaffinity() syscall to make the current
> > workload and all its children.
>
> > There's impact on perf stat itself: it could just call sched_setaffinity()
> > early on, and not bother about it?
>
> > Having it built-in into perf would simply make it easier to not forget
> > running 'taskset'. :-)
>
> Would that be the only advantage?
>
> I think using taskset isn't that much of a burden and keeps with the
> Unix tradition, no? :-\
Agreed. Maybe there's a use case that wants to profile a specific
CPU while the target processes are running on all available CPUs.
Thanks,
Namhyung
* Arnaldo Carvalho de Melo <[email protected]> wrote:
> Em Tue, Oct 17, 2023 at 02:43:45PM +0200, Ingo Molnar escreveu:
> > * Arnaldo Carvalho de Melo <[email protected]> wrote:
> > > Em Tue, Oct 17, 2023 at 01:40:07PM +0200, Ingo Molnar escreveu:
> > > > Side note: it might make sense to add a sane cpumask/affinity setting
> > > > option to perf stat itself:
>
> > > > perf stat --cpumask
>
> > > > ... or so?
>
> > > > We do have -C:
>
> > > > -C, --cpu <cpu> list of cpus to monitor in system-wide
>
> > > > ... but that's limited to --all-cpus, right?
>
> > > > Perhaps we could extend --cpu to non-system-wide runs too?
>
> > > Maybe I misunderstood your question, but its a list of cpus to limit the
> > > counting:
>
> > Ok.
>
> > So I thought that "--cpumask mask/list/etc" should simply do what 'taskset'
> > is doing: using the sched_setaffinity() syscall to make the current
> > workload and all its children.
>
> > There's impact on perf stat itself: it could just call sched_setaffinity()
> > early on, and not bother about it?
>
> > Having it built-in into perf would simply make it easier to not forget
> > running 'taskset'. :-)
>
> Would that be the only advantage?
1)
Another advantage would be that perf stat could bind itself to the
inverse affinity mask.
This means the workload that is being executed is disturbed by perf as
little as possible.
That's not possible with 'taskset'.
2)
Plus taskset's syntax is arguably silly: why does it need a separate -c
option for a CPU list, why doesn't it figure it out by itself when there's
a comma or dash in the mask string?
A better syntax that perf could use would be to interpret it as a CPU mask
only when presented with a '0x' or '0b' prefix for a binary mask which is
IMO much more logical if we are talking masks. For example, to run on 8
full cores, using the '0b' GCC extension to specify binary literals:
perf stat --cpus 0b101010101010101
'taskset' has other syntax idiosyncrasies, such as the weird inverted
argument order of PID and CPU list:
kepler:~/tip> taskset -p $$ 0b101010101010101
taskset: invalid PID argument: '0b101010101010101'
# .... erm, what??
# .... oh, taskset expects the PID argument last:
kepler:~/tip> taskset -p 0b101010101010101 $$
pid 195878's current affinity mask: ffffffffffffffff
pid 195878's new affinity mask: b101010101010101
# ... damn: taskset doesn't know the 0b prefix and blindly assumed it's
# hexadecimal ...
So I'd love it if perf stat grew a sane CPU-affinity option.
3)
As a bonus perf stat could later also grow matching memory-node-affinity
features that 'taskset' doesn't have ...
Anyway, that's my attempt to convince you guys that it's a good idea to
have this feature. :-)
Thanks,
Ingo