LinuxLists.cc - [PATCH v13 03/16] perf record: Introduce thread specific data array

2022-01-18 02:56:55

Subject: [PATCH v13 03/16] perf record: Introduce thread specific data array

Introduce a thread-specific data object and an array of such objects
to store and manage thread-local data. Implement functions to
allocate, initialize, finalize and release thread-specific data.

Thread-local maps and overwrite_maps arrays keep pointers to the
mmap buffer objects the thread serves, as selected by its maps mask.
The thread-local pollfd array keeps the event fds connected to those
mmap buffers, likewise selected by the maps mask.

Thread control commands are delivered via thread local comm pipes
and ctlfd_pos fd. External control commands (--control option)
are delivered via evlist ctlfd_pos fd and handled by the main
tool thread.

Acked-by: Namhyung Kim <[email protected]>
Reviewed-by: Riccardo Mancini <[email protected]>
Tested-by: Riccardo Mancini <[email protected]>
Signed-off-by: Alexey Bayduraev <[email protected]>
---
tools/perf/builtin-record.c | 247 +++++++++++++++++++++++++++++++++++-
1 file changed, 244 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 41998f2140cd..0d4a34c66274 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -58,6 +58,9 @@
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
+#ifndef HAVE_GETTID
+#include <syscall.h>
+#endif
#include <sched.h>
#include <signal.h>
#ifdef HAVE_EVENTFD_SUPPORT
@@ -92,6 +95,21 @@ struct thread_mask {
struct mmap_cpu_mask affinity;
};

+struct record_thread {
+ pid_t tid;
+ struct thread_mask *mask;
+ struct {
+ int msg[2];
+ int ack[2];
+ } pipes;
+ struct fdarray pollfd;
+ int ctlfd_pos;
+ int nr_mmaps;
+ struct mmap **maps;
+ struct mmap **overwrite_maps;
+ struct record *rec;
+};
+
struct record {
struct perf_tool tool;
struct record_opts opts;
@@ -119,6 +137,7 @@ struct record {
struct perf_debuginfod debuginfod;
int nr_threads;
struct thread_mask *thread_masks;
+ struct record_thread *thread_data;
};

static volatile int done;
@@ -131,6 +150,13 @@ static const char *affinity_tags[PERF_AFFINITY_MAX] = {
"SYS", "NODE", "CPU"
};

+#ifndef HAVE_GETTID
+static inline pid_t gettid(void)
+{
+ return (pid_t)syscall(__NR_gettid);
+}
+#endif
+
static bool switch_output_signal(struct record *rec)
{
return rec->switch_output.signal &&
@@ -848,9 +874,218 @@ static int record__kcore_copy(struct machine *machine, struct perf_data *data)
return kcore_copy(from_dir, kcore_dir);
}

+static void record__thread_data_init_pipes(struct record_thread *thread_data)
+{
+ thread_data->pipes.msg[0] = -1;
+ thread_data->pipes.msg[1] = -1;
+ thread_data->pipes.ack[0] = -1;
+ thread_data->pipes.ack[1] = -1;
+}
+
+static int record__thread_data_open_pipes(struct record_thread *thread_data)
+{
+ if (pipe(thread_data->pipes.msg))
+ return -EINVAL;
+
+ if (pipe(thread_data->pipes.ack)) {
+ close(thread_data->pipes.msg[0]);
+ thread_data->pipes.msg[0] = -1;
+ close(thread_data->pipes.msg[1]);
+ thread_data->pipes.msg[1] = -1;
+ return -EINVAL;
+ }
+
+ pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
+ thread_data->pipes.msg[0], thread_data->pipes.msg[1],
+ thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
+
+ return 0;
+}
+
+static void record__thread_data_close_pipes(struct record_thread *thread_data)
+{
+ if (thread_data->pipes.msg[0] != -1) {
+ close(thread_data->pipes.msg[0]);
+ thread_data->pipes.msg[0] = -1;
+ }
+ if (thread_data->pipes.msg[1] != -1) {
+ close(thread_data->pipes.msg[1]);
+ thread_data->pipes.msg[1] = -1;
+ }
+ if (thread_data->pipes.ack[0] != -1) {
+ close(thread_data->pipes.ack[0]);
+ thread_data->pipes.ack[0] = -1;
+ }
+ if (thread_data->pipes.ack[1] != -1) {
+ close(thread_data->pipes.ack[1]);
+ thread_data->pipes.ack[1] = -1;
+ }
+}
+
+static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
+{
+ int m, tm, nr_mmaps = evlist->core.nr_mmaps;
+ struct mmap *mmap = evlist->mmap;
+ struct mmap *overwrite_mmap = evlist->overwrite_mmap;
+ struct perf_cpu_map *cpus = evlist->core.cpus;
+
+ thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
+ thread_data->mask->maps.nbits);
+ if (mmap) {
+ thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
+ if (!thread_data->maps)
+ return -ENOMEM;
+ }
+ if (overwrite_mmap) {
+ thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
+ if (!thread_data->overwrite_maps) {
+ zfree(&thread_data->maps);
+ return -ENOMEM;
+ }
+ }
+ pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
+ thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
+
+ for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
+ if (test_bit(cpus->map[m].cpu, thread_data->mask->maps.bits)) {
+ if (thread_data->maps) {
+ thread_data->maps[tm] = &mmap[m];
+ pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
+ thread_data, cpus->map[m].cpu, tm, m);
+ }
+ if (thread_data->overwrite_maps) {
+ thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
+ pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
+ thread_data, cpus->map[m].cpu, tm, m);
+ }
+ tm++;
+ }
+ }
+
+ return 0;
+}
+
+static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
+{
+ int f, tm, pos;
+ struct mmap *map, *overwrite_map;
+
+ fdarray__init(&thread_data->pollfd, 64);
+
+ for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
+ map = thread_data->maps ? thread_data->maps[tm] : NULL;
+ overwrite_map = thread_data->overwrite_maps ?
+ thread_data->overwrite_maps[tm] : NULL;
+
+ for (f = 0; f < evlist->core.pollfd.nr; f++) {
+ void *ptr = evlist->core.pollfd.priv[f].ptr;
+
+ if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
+ pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
+ &evlist->core.pollfd);
+ if (pos < 0)
+ return pos;
+ pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
+ thread_data, pos, evlist->core.pollfd.entries[f].fd);
+ }
+ }
+ }
+
+ return 0;
+}
+
+static void record__free_thread_data(struct record *rec)
+{
+ int t;
+ struct record_thread *thread_data = rec->thread_data;
+
+ if (thread_data == NULL)
+ return;
+
+ for (t = 0; t < rec->nr_threads; t++) {
+ record__thread_data_close_pipes(&thread_data[t]);
+ zfree(&thread_data[t].maps);
+ zfree(&thread_data[t].overwrite_maps);
+ fdarray__exit(&thread_data[t].pollfd);
+ }
+
+ zfree(&rec->thread_data);
+}
+
+static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
+{
+ int t, ret;
+ struct record_thread *thread_data;
+
+ rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
+ if (!rec->thread_data) {
+ pr_err("Failed to allocate thread data\n");
+ return -ENOMEM;
+ }
+ thread_data = rec->thread_data;
+
+ for (t = 0; t < rec->nr_threads; t++)
+ record__thread_data_init_pipes(&thread_data[t]);
+
+ for (t = 0; t < rec->nr_threads; t++) {
+ thread_data[t].rec = rec;
+ thread_data[t].mask = &rec->thread_masks[t];
+ ret = record__thread_data_init_maps(&thread_data[t], evlist);
+ if (ret) {
+ pr_err("Failed to initialize thread[%d] maps\n", t);
+ goto out_free;
+ }
+ ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
+ if (ret) {
+ pr_err("Failed to initialize thread[%d] pollfd\n", t);
+ goto out_free;
+ }
+ if (t) {
+ thread_data[t].tid = -1;
+ ret = record__thread_data_open_pipes(&thread_data[t]);
+ if (ret) {
+ pr_err("Failed to open thread[%d] communication pipes\n", t);
+ goto out_free;
+ }
+ ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
+ POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
+ if (ret < 0) {
+ pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
+ goto out_free;
+ }
+ thread_data[t].ctlfd_pos = ret;
+ pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
+ thread_data, thread_data[t].ctlfd_pos,
+ thread_data[t].pipes.msg[0]);
+ } else {
+ thread_data[t].tid = gettid();
+ if (evlist->ctl_fd.pos == -1)
+ continue;
+ ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos,
+ &evlist->core.pollfd);
+ if (ret < 0) {
+ pr_err("Failed to duplicate descriptor in main thread pollfd\n");
+ goto out_free;
+ }
+ thread_data[t].ctlfd_pos = ret;
+ pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
+ thread_data, thread_data[t].ctlfd_pos,
+ evlist->core.pollfd.entries[evlist->ctl_fd.pos].fd);
+ }
+ }
+
+ return 0;
+
+out_free:
+ record__free_thread_data(rec);
+
+ return ret;
+}
+
static int record__mmap_evlist(struct record *rec,
struct evlist *evlist)
{
+ int ret;
struct record_opts *opts = &rec->opts;
bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
opts->auxtrace_sample_mode;
@@ -881,6 +1116,14 @@ static int record__mmap_evlist(struct record *rec,
return -EINVAL;
}
}
+
+ if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
+ return -1;
+
+ ret = record__alloc_thread_data(rec, evlist);
+ if (ret)
+ return ret;
+
return 0;
}

@@ -1862,9 +2105,6 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
evlist__start_workload(rec->evlist);
}

- if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
- goto out_child;
-
if (opts->initial_delay) {
pr_info(EVLIST_DISABLED_MSG);
if (opts->initial_delay > 0) {
@@ -2022,6 +2262,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
out_child:
evlist__finalize_ctlfd(rec->evlist);
record__mmap_read_all(rec, true);
+ record__free_thread_data(rec);
record__aio_mmap_read_sync(rec);

if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
--
2.19.0

2022-02-01 20:51:41

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH v13 03/16] perf record: Introduce thread specific data array

Em Mon, Jan 31, 2022 at 06:39:39PM -0300, Arnaldo Carvalho de Melo escreveu:
> Some changes to reduce patch size, I have them in my local tree, will
> publish later.

It's in perf/threaded at:

git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git

Will continue tomorrow, testing it and checking the speedups on my
5950x. I think the things I found so far can be fixed in follow-up
patches, to make progress and have this merged sooner.

I'll try and add committer notes with the test for some 'perf bench'
workload without/with parallel recording, something I missed in your
patch descriptions.

- Arnaldo

2022-02-01 20:56:59

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH v13 03/16] perf record: Introduce thread specific data array

Em Mon, Jan 17, 2022 at 09:34:23PM +0300, Alexey Bayduraev escreveu:
> Introduce thread specific data object and array of such objects
> to store and manage thread local data. Implement functions to
> allocate, initialize, finalize and release thread specific data.
>
> Thread local maps and overwrite_maps arrays keep pointers to
> mmap buffer objects to serve according to maps thread mask.
> Thread local pollfd array keeps event fds connected to mmaps
> buffers according to maps thread mask.
>
> Thread control commands are delivered via thread local comm pipes
> and ctlfd_pos fd. External control commands (--control option)
> are delivered via evlist ctlfd_pos fd and handled by the main
> tool thread.
>
> Acked-by: Namhyung Kim <[email protected]>
> Reviewed-by: Riccardo Mancini <[email protected]>
> Tested-by: Riccardo Mancini <[email protected]>
> Signed-off-by: Alexey Bayduraev <[email protected]>

Some changes to reduce patch size, I have them in my local tree, will
publish later.

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1645b40540b870aa..a8c7118a95c6a3fa 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -924,7 +924,7 @@ static void record__thread_data_close_pipes(struct record_thread *thread_data)

static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
{
- int m, tm, nr_mmaps = evlist->core.nr_mmaps;
+ int nr_mmaps = evlist->core.nr_mmaps;
struct mmap *mmap = evlist->mmap;
struct mmap *overwrite_mmap = evlist->overwrite_mmap;
struct perf_cpu_map *cpus = evlist->core.cpus;
@@ -946,7 +946,7 @@ static int record__thread_data_init_maps(struct record_thread *thread_data, stru
pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);

- for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
+ for (int m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
if (test_bit(cpus->map[m].cpu, thread_data->mask->maps.bits)) {
if (thread_data->maps) {
thread_data->maps[tm] = &mmap[m];
@@ -967,21 +967,18 @@ static int record__thread_data_init_maps(struct record_thread *thread_data, stru

static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
{
- int f, tm, pos;
- struct mmap *map, *overwrite_map;
-
fdarray__init(&thread_data->pollfd, 64);

- for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
- map = thread_data->maps ? thread_data->maps[tm] : NULL;
- overwrite_map = thread_data->overwrite_maps ?
- thread_data->overwrite_maps[tm] : NULL;
+ for (int tm = 0; tm < thread_data->nr_mmaps; tm++) {
+ struct mmap *map = thread_data->maps ? thread_data->maps[tm] : NULL,
+ *overwrite_map = thread_data->overwrite_maps ?
+ thread_data->overwrite_maps[tm] : NULL;

- for (f = 0; f < evlist->core.pollfd.nr; f++) {
+ for (int f = 0; f < evlist->core.pollfd.nr; f++) {
void *ptr = evlist->core.pollfd.priv[f].ptr;

if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
- pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
+ int pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
&evlist->core.pollfd);
if (pos < 0)
return pos;
@@ -996,13 +993,12 @@ static int record__thread_data_init_pollfd(struct record_thread *thread_data, st

static void record__free_thread_data(struct record *rec)
{
- int t;
struct record_thread *thread_data = rec->thread_data;

if (thread_data == NULL)
return;

- for (t = 0; t < rec->nr_threads; t++) {
+ for (int t = 0; t < rec->nr_threads; t++) {
record__thread_data_close_pipes(&thread_data[t]);
zfree(&thread_data[t].maps);
zfree(&thread_data[t].overwrite_maps);
@@ -1014,20 +1010,18 @@ static void record__free_thread_data(struct record *rec)

static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
{
- int t, ret;
- struct record_thread *thread_data;
+ struct record_thread *thread_data = rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
+ int ret;

- rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
if (!rec->thread_data) {
pr_err("Failed to allocate thread data\n");
return -ENOMEM;
}
- thread_data = rec->thread_data;

- for (t = 0; t < rec->nr_threads; t++)
+ for (int t = 0; t < rec->nr_threads; t++)
record__thread_data_init_pipes(&thread_data[t]);

- for (t = 0; t < rec->nr_threads; t++) {
+ for (int t = 0; t < rec->nr_threads; t++) {
thread_data[t].rec = rec;
thread_data[t].mask = &rec->thread_masks[t];
ret = record__thread_data_init_maps(&thread_data[t], evlist);

2022-02-13 02:07:35

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH v13 03/16] perf record: Introduce thread specific data array

Em Mon, Jan 31, 2022 at 07:21:11PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Jan 31, 2022 at 06:39:39PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Some changes to reduce patch size, I have them in my local tree, will
> > publish later.
>
> Its in perf/threaded at:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git
>
> Will continue tomorrow, testing it and checking the speedups on my
> 5950x, I think the things I found so far can be fixed in follow up
> patches, to make progress and have this merged sooner.
>
> I'll try and add committer notes with the test for some 'perf bench'
> workload without/with parallel recording, something I missed in your
> patch descriptions.

Didn't manage to do that, but my considerations are minor at this point
and plenty of informed people acked, reviewed, tested, so I'm not going
to be the one to prevent this from going upstream.

If we find problems (oh well), we'll fix it and progress.

Thank you, Alexei Budankov, Jiri, Namhyung and Riccardo for working on
making perf scale at the record phase for so long.

I'm pushing this to perf/core, that should get into 5.18.

Also, as a heads up, I'll change 'perf/core' to 'perf/next', to align
with the kool kids out there.

Thanks,

- Arnaldo

2022-02-14 10:32:40

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH v13 03/16] perf record: Introduce thread specific data array

Em Fri, Feb 11, 2022 at 01:51:16PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Jan 31, 2022 at 07:21:11PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Mon, Jan 31, 2022 at 06:39:39PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Some changes to reduce patch size, I have them in my local tree, will
> > > publish later.
> >
> > Its in perf/threaded at:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git
> >
> > Will continue tomorrow, testing it and checking the speedups on my
> > 5950x, I think the things I found so far can be fixed in follow up
> > patches, to make progress and have this merged sooner.
> >
> > I'll try and add committer notes with the test for some 'perf bench'
> > workload without/with parallel recording, something I missed in your
> > patch descriptions.
>
> Didn't manage to do that, but my considerations are minor at this point
> and plenty of informed people acked, reviewed, tested, so I'm not going
> to be the one to prevent this from going upstream.
>
> If we find problems (oh well), we'll fix it and progress.
>
> Thank you, Alexei Budankov, Jiri, Namhyung and Riccardo for working on
> making perf scale at the record phase for so long,
>
> I'm pushing this to perf/core, that should get into 5.18.
>
> Also, as a heads up, I'll change 'perf/core' to 'perf/next', to align
> with the kool kids out there,

Something I forgot to add: the current codebase, with this patchset,
passes 'perf test' and 'make -C tools/perf build-test' and also all the
container build tests on a myriad of distros.

- Arnaldo