2022-01-18 02:56:48

by Bayduraev, Alexey V

[permalink] [raw]
Subject: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

Introduce affinity and mmap thread masks. Thread affinity mask
defines CPUs that a thread is allowed to run on. Thread maps
mask defines mmap data buffers the thread serves to stream
profiling data from.

Acked-by: Andi Kleen <[email protected]>
Acked-by: Namhyung Kim <[email protected]>
Reviewed-by: Riccardo Mancini <[email protected]>
Tested-by: Riccardo Mancini <[email protected]>
Signed-off-by: Alexey Bayduraev <[email protected]>
---
tools/perf/builtin-record.c | 123 ++++++++++++++++++++++++++++++++++++
1 file changed, 123 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index bb716c953d02..41998f2140cd 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -87,6 +87,11 @@ struct switch_output {
int cur_file;
};

+struct thread_mask {
+ struct mmap_cpu_mask maps;
+ struct mmap_cpu_mask affinity;
+};
+
struct record {
struct perf_tool tool;
struct record_opts opts;
@@ -112,6 +117,8 @@ struct record {
struct mmap_cpu_mask affinity_mask;
unsigned long output_max_size; /* = 0: unlimited */
struct perf_debuginfod debuginfod;
+ int nr_threads;
+ struct thread_mask *thread_masks;
};

static volatile int done;
@@ -2204,6 +2211,47 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
return 0;
}

+static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
+{
+ mask->nbits = nr_bits;
+ mask->bits = bitmap_zalloc(mask->nbits);
+ if (!mask->bits)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
+{
+ bitmap_free(mask->bits);
+ mask->nbits = 0;
+}
+
+static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
+{
+ int ret;
+
+ ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
+ if (ret) {
+ mask->affinity.bits = NULL;
+ return ret;
+ }
+
+ ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
+ if (ret) {
+ record__mmap_cpu_mask_free(&mask->maps);
+ mask->maps.bits = NULL;
+ }
+
+ return ret;
+}
+
+static void record__thread_mask_free(struct thread_mask *mask)
+{
+ record__mmap_cpu_mask_free(&mask->maps);
+ record__mmap_cpu_mask_free(&mask->affinity);
+}
+
static int parse_output_max_size(const struct option *opt,
const char *str, int unset)
{
@@ -2683,6 +2731,73 @@ static struct option __record_options[] = {

struct option *record_options = __record_options;

+static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
+{
+ int c;
+
+ for (c = 0; c < cpus->nr; c++)
+ set_bit(cpus->map[c].cpu, mask->bits);
+}
+
+static void record__free_thread_masks(struct record *rec, int nr_threads)
+{
+ int t;
+
+ if (rec->thread_masks)
+ for (t = 0; t < nr_threads; t++)
+ record__thread_mask_free(&rec->thread_masks[t]);
+
+ zfree(&rec->thread_masks);
+}
+
+static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
+{
+ int t, ret;
+
+ rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
+ if (!rec->thread_masks) {
+ pr_err("Failed to allocate thread masks\n");
+ return -ENOMEM;
+ }
+
+ for (t = 0; t < nr_threads; t++) {
+ ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
+ if (ret) {
+ pr_err("Failed to allocate thread masks[%d]\n", t);
+ goto out_free;
+ }
+ }
+
+ return 0;
+
+out_free:
+ record__free_thread_masks(rec, nr_threads);
+
+ return ret;
+}
+
+static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
+{
+ int ret;
+
+ ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
+ if (ret)
+ return ret;
+
+ record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
+
+ rec->nr_threads = 1;
+
+ return 0;
+}
+
+static int record__init_thread_masks(struct record *rec)
+{
+ struct perf_cpu_map *cpus = rec->evlist->core.cpus;
+
+ return record__init_thread_default_masks(rec, cpus);
+}
+
int cmd_record(int argc, const char **argv)
{
int err;
@@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
goto out;
}

+ err = record__init_thread_masks(rec);
+ if (err) {
+ pr_err("Failed to initialize parallel data streaming masks\n");
+ goto out;
+ }
+
if (rec->opts.nr_cblocks > nr_cblocks_max)
rec->opts.nr_cblocks = nr_cblocks_max;
pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
@@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
symbol__exit();
auxtrace_record__free(rec->itr);
out_opts:
+ record__free_thread_masks(rec, rec->nr_threads);
+ rec->nr_threads = 0;
evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
return err;
}
--
2.19.0


2022-02-01 20:50:35

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

Em Mon, Jan 31, 2022 at 06:00:31PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Jan 17, 2022 at 09:34:21PM +0300, Alexey Bayduraev escreveu:
> > Introduce affinity and mmap thread masks. Thread affinity mask
> > defines CPUs that a thread is allowed to run on. Thread maps
> > mask defines mmap data buffers the thread serves to stream
> > profiling data from.
> >
> > Acked-by: Andi Kleen <[email protected]>
> > Acked-by: Namhyung Kim <[email protected]>
> > Reviewed-by: Riccardo Mancini <[email protected]>
> > Tested-by: Riccardo Mancini <[email protected]>
> > Signed-off-by: Alexey Bayduraev <[email protected]>
>
> Some simplifications I added here while reviewing this patchkit:

But then, why allocate these even without using them? I.e. the init
should be left for when we are sure that we'll actually use this, i.e.
when the user asks for parallel mode.

We already have lots of needless initializations, reading of files that
may not be needed, so we should avoid doing things till we really know
that we'll use those allocations, readings, etc.

Anyway, continuing to review, will leave what I have at a separate
branch so that we can continue from there.

- Arnaldo

> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 41998f2140cd5119..53b88c8600624237 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -2213,35 +2213,33 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
>
> static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> {
> - mask->nbits = nr_bits;
> mask->bits = bitmap_zalloc(mask->nbits);
> if (!mask->bits)
> return -ENOMEM;
>
> + mask->nbits = nr_bits;
> return 0;
> }
>
> static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> {
> bitmap_free(mask->bits);
> + mask->bits = NULL;
> mask->nbits = 0;
> }
>
> static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> {
> - int ret;
> + int ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
>
> - ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> if (ret) {
> mask->affinity.bits = NULL;
> return ret;
> }
>
> ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> - if (ret) {
> + if (ret)
> record__mmap_cpu_mask_free(&mask->maps);
> - mask->maps.bits = NULL;
> - }
>
> return ret;
> }
> @@ -2733,18 +2731,14 @@ struct option *record_options = __record_options;
>
> static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> {
> - int c;
> -
> - for (c = 0; c < cpus->nr; c++)
> + for (int c = 0; c < cpus->nr; c++)
> set_bit(cpus->map[c].cpu, mask->bits);
> }
>
> static void record__free_thread_masks(struct record *rec, int nr_threads)
> {
> - int t;
> -
> if (rec->thread_masks)
> - for (t = 0; t < nr_threads; t++)
> + for (int t = 0; t < nr_threads; t++)
> record__thread_mask_free(&rec->thread_masks[t]);
>
> zfree(&rec->thread_masks);
> @@ -2752,7 +2746,7 @@ static void record__free_thread_masks(struct record *rec, int nr_threads)
>
> static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> {
> - int t, ret;
> + int ret;
>
> rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> if (!rec->thread_masks) {
> @@ -2760,7 +2754,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
> return -ENOMEM;
> }
>
> - for (t = 0; t < nr_threads; t++) {
> + for (int t = 0; t < nr_threads; t++) {
> ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> if (ret) {
> pr_err("Failed to allocate thread masks[%d]\n", t);
> @@ -2778,9 +2772,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
>
> static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> {
> - int ret;
> -
> - ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> + int ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> if (ret)
> return ret;
>
>
>
> > ---
> > tools/perf/builtin-record.c | 123 ++++++++++++++++++++++++++++++++++++
> > 1 file changed, 123 insertions(+)
> >
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index bb716c953d02..41998f2140cd 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -87,6 +87,11 @@ struct switch_output {
> > int cur_file;
> > };
> >
> > +struct thread_mask {
> > + struct mmap_cpu_mask maps;
> > + struct mmap_cpu_mask affinity;
> > +};
> > +
> > struct record {
> > struct perf_tool tool;
> > struct record_opts opts;
> > @@ -112,6 +117,8 @@ struct record {
> > struct mmap_cpu_mask affinity_mask;
> > unsigned long output_max_size; /* = 0: unlimited */
> > struct perf_debuginfod debuginfod;
> > + int nr_threads;
> > + struct thread_mask *thread_masks;
> > };
> >
> > static volatile int done;
> > @@ -2204,6 +2211,47 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
> > return 0;
> > }
> >
> > +static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> > +{
> > + mask->nbits = nr_bits;
> > + mask->bits = bitmap_zalloc(mask->nbits);
> > + if (!mask->bits)
> > + return -ENOMEM;
> > +
> > + return 0;
> > +}
> > +
> > +static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> > +{
> > + bitmap_free(mask->bits);
> > + mask->nbits = 0;
> > +}
> > +
> > +static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> > +{
> > + int ret;
> > +
> > + ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> > + if (ret) {
> > + mask->affinity.bits = NULL;
> > + return ret;
> > + }
> > +
> > + ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> > + if (ret) {
> > + record__mmap_cpu_mask_free(&mask->maps);
> > + mask->maps.bits = NULL;
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static void record__thread_mask_free(struct thread_mask *mask)
> > +{
> > + record__mmap_cpu_mask_free(&mask->maps);
> > + record__mmap_cpu_mask_free(&mask->affinity);
> > +}
> > +
> > static int parse_output_max_size(const struct option *opt,
> > const char *str, int unset)
> > {
> > @@ -2683,6 +2731,73 @@ static struct option __record_options[] = {
> >
> > struct option *record_options = __record_options;
> >
> > +static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> > +{
> > + int c;
> > +
> > + for (c = 0; c < cpus->nr; c++)
> > + set_bit(cpus->map[c].cpu, mask->bits);
> > +}
> > +
> > +static void record__free_thread_masks(struct record *rec, int nr_threads)
> > +{
> > + int t;
> > +
> > + if (rec->thread_masks)
> > + for (t = 0; t < nr_threads; t++)
> > + record__thread_mask_free(&rec->thread_masks[t]);
> > +
> > + zfree(&rec->thread_masks);
> > +}
> > +
> > +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> > +{
> > + int t, ret;
> > +
> > + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> > + if (!rec->thread_masks) {
> > + pr_err("Failed to allocate thread masks\n");
> > + return -ENOMEM;
> > + }
> > +
> > + for (t = 0; t < nr_threads; t++) {
> > + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> > + if (ret) {
> > + pr_err("Failed to allocate thread masks[%d]\n", t);
> > + goto out_free;
> > + }
> > + }
> > +
> > + return 0;
> > +
> > +out_free:
> > + record__free_thread_masks(rec, nr_threads);
> > +
> > + return ret;
> > +}
> > +
> > +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> > +{
> > + int ret;
> > +
> > + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> > + if (ret)
> > + return ret;
> > +
> > + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
> > +
> > + rec->nr_threads = 1;
> > +
> > + return 0;
> > +}
> > +
> > +static int record__init_thread_masks(struct record *rec)
> > +{
> > + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
> > +
> > + return record__init_thread_default_masks(rec, cpus);
> > +}
> > +
> > int cmd_record(int argc, const char **argv)
> > {
> > int err;
> > @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
> > goto out;
> > }
> >
> > + err = record__init_thread_masks(rec);
> > + if (err) {
> > + pr_err("Failed to initialize parallel data streaming masks\n");
> > + goto out;
> > + }
> > +
> > if (rec->opts.nr_cblocks > nr_cblocks_max)
> > rec->opts.nr_cblocks = nr_cblocks_max;
> > pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
> > @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
> > symbol__exit();
> > auxtrace_record__free(rec->itr);
> > out_opts:
> > + record__free_thread_masks(rec, rec->nr_threads);
> > + rec->nr_threads = 0;
> > evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> > return err;
> > }
> > --
> > 2.19.0
>
> --
>
> - Arnaldo

--

- Arnaldo

2022-02-01 20:51:23

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

Em Mon, Jan 31, 2022 at 07:03:12PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Jan 31, 2022 at 06:00:31PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Mon, Jan 17, 2022 at 09:34:21PM +0300, Alexey Bayduraev escreveu:
> > > Introduce affinity and mmap thread masks. Thread affinity mask
> > > defines CPUs that a thread is allowed to run on. Thread maps
> > > mask defines mmap data buffers the thread serves to stream
> > > profiling data from.
> > >
> > > Acked-by: Andi Kleen <[email protected]>
> > > Acked-by: Namhyung Kim <[email protected]>
> > > Reviewed-by: Riccardo Mancini <[email protected]>
> > > Tested-by: Riccardo Mancini <[email protected]>
> > > Signed-off-by: Alexey Bayduraev <[email protected]>
> >
> > Some simplifications I added here while reviewing this patchkit:
> >
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index 41998f2140cd5119..53b88c8600624237 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -2213,35 +2213,33 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
> >
> > static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> > {
> > - mask->nbits = nr_bits;
> > mask->bits = bitmap_zalloc(mask->nbits);
> > if (!mask->bits)
> > return -ENOMEM;
> >
> > + mask->nbits = nr_bits;
> > return 0;
>
>
> Interesting, building it at this point in the patchkit didn't uncover
> the bug I introduced, only later when this gets used I got the compiler
> error and applied this on top:
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 53b88c8600624237..6b0e506df20c002a 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -2213,7 +2213,7 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
>
> static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> {
> - mask->bits = bitmap_zalloc(mask->nbits);
> + mask->bits = bitmap_zalloc(nbits);

Make that nr_bits, sigh :-\

> if (!mask->bits)
> return -ENOMEM;
>
>
> > }
> >
> > static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> > {
> > bitmap_free(mask->bits);
> > + mask->bits = NULL;
> > mask->nbits = 0;
> > }
> >
> > static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> > {
> > - int ret;
> > + int ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> >
> > - ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> > if (ret) {
> > mask->affinity.bits = NULL;
> > return ret;
> > }
> >
> > ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> > - if (ret) {
> > + if (ret)
> > record__mmap_cpu_mask_free(&mask->maps);
> > - mask->maps.bits = NULL;
> > - }
> >
> > return ret;
> > }
> > @@ -2733,18 +2731,14 @@ struct option *record_options = __record_options;
> >
> > static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> > {
> > - int c;
> > -
> > - for (c = 0; c < cpus->nr; c++)
> > + for (int c = 0; c < cpus->nr; c++)
> > set_bit(cpus->map[c].cpu, mask->bits);
> > }
> >
> > static void record__free_thread_masks(struct record *rec, int nr_threads)
> > {
> > - int t;
> > -
> > if (rec->thread_masks)
> > - for (t = 0; t < nr_threads; t++)
> > + for (int t = 0; t < nr_threads; t++)
> > record__thread_mask_free(&rec->thread_masks[t]);
> >
> > zfree(&rec->thread_masks);
> > @@ -2752,7 +2746,7 @@ static void record__free_thread_masks(struct record *rec, int nr_threads)
> >
> > static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> > {
> > - int t, ret;
> > + int ret;
> >
> > rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> > if (!rec->thread_masks) {
> > @@ -2760,7 +2754,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
> > return -ENOMEM;
> > }
> >
> > - for (t = 0; t < nr_threads; t++) {
> > + for (int t = 0; t < nr_threads; t++) {
> > ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> > if (ret) {
> > pr_err("Failed to allocate thread masks[%d]\n", t);
> > @@ -2778,9 +2772,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
> >
> > static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> > {
> > - int ret;
> > -
> > - ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> > + int ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> > if (ret)
> > return ret;
> >
> >
> >
> > > ---
> > > tools/perf/builtin-record.c | 123 ++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 123 insertions(+)
> > >
> > > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > > index bb716c953d02..41998f2140cd 100644
> > > --- a/tools/perf/builtin-record.c
> > > +++ b/tools/perf/builtin-record.c
> > > @@ -87,6 +87,11 @@ struct switch_output {
> > > int cur_file;
> > > };
> > >
> > > +struct thread_mask {
> > > + struct mmap_cpu_mask maps;
> > > + struct mmap_cpu_mask affinity;
> > > +};
> > > +
> > > struct record {
> > > struct perf_tool tool;
> > > struct record_opts opts;
> > > @@ -112,6 +117,8 @@ struct record {
> > > struct mmap_cpu_mask affinity_mask;
> > > unsigned long output_max_size; /* = 0: unlimited */
> > > struct perf_debuginfod debuginfod;
> > > + int nr_threads;
> > > + struct thread_mask *thread_masks;
> > > };
> > >
> > > static volatile int done;
> > > @@ -2204,6 +2211,47 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
> > > return 0;
> > > }
> > >
> > > +static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> > > +{
> > > + mask->nbits = nr_bits;
> > > + mask->bits = bitmap_zalloc(mask->nbits);
> > > + if (!mask->bits)
> > > + return -ENOMEM;
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> > > +{
> > > + bitmap_free(mask->bits);
> > > + mask->nbits = 0;
> > > +}
> > > +
> > > +static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> > > +{
> > > + int ret;
> > > +
> > > + ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> > > + if (ret) {
> > > + mask->affinity.bits = NULL;
> > > + return ret;
> > > + }
> > > +
> > > + ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> > > + if (ret) {
> > > + record__mmap_cpu_mask_free(&mask->maps);
> > > + mask->maps.bits = NULL;
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static void record__thread_mask_free(struct thread_mask *mask)
> > > +{
> > > + record__mmap_cpu_mask_free(&mask->maps);
> > > + record__mmap_cpu_mask_free(&mask->affinity);
> > > +}
> > > +
> > > static int parse_output_max_size(const struct option *opt,
> > > const char *str, int unset)
> > > {
> > > @@ -2683,6 +2731,73 @@ static struct option __record_options[] = {
> > >
> > > struct option *record_options = __record_options;
> > >
> > > +static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> > > +{
> > > + int c;
> > > +
> > > + for (c = 0; c < cpus->nr; c++)
> > > + set_bit(cpus->map[c].cpu, mask->bits);
> > > +}
> > > +
> > > +static void record__free_thread_masks(struct record *rec, int nr_threads)
> > > +{
> > > + int t;
> > > +
> > > + if (rec->thread_masks)
> > > + for (t = 0; t < nr_threads; t++)
> > > + record__thread_mask_free(&rec->thread_masks[t]);
> > > +
> > > + zfree(&rec->thread_masks);
> > > +}
> > > +
> > > +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> > > +{
> > > + int t, ret;
> > > +
> > > + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> > > + if (!rec->thread_masks) {
> > > + pr_err("Failed to allocate thread masks\n");
> > > + return -ENOMEM;
> > > + }
> > > +
> > > + for (t = 0; t < nr_threads; t++) {
> > > + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> > > + if (ret) {
> > > + pr_err("Failed to allocate thread masks[%d]\n", t);
> > > + goto out_free;
> > > + }
> > > + }
> > > +
> > > + return 0;
> > > +
> > > +out_free:
> > > + record__free_thread_masks(rec, nr_threads);
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> > > +{
> > > + int ret;
> > > +
> > > + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
> > > +
> > > + rec->nr_threads = 1;
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int record__init_thread_masks(struct record *rec)
> > > +{
> > > + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
> > > +
> > > + return record__init_thread_default_masks(rec, cpus);
> > > +}
> > > +
> > > int cmd_record(int argc, const char **argv)
> > > {
> > > int err;
> > > @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
> > > goto out;
> > > }
> > >
> > > + err = record__init_thread_masks(rec);
> > > + if (err) {
> > > + pr_err("Failed to initialize parallel data streaming masks\n");
> > > + goto out;
> > > + }
> > > +
> > > if (rec->opts.nr_cblocks > nr_cblocks_max)
> > > rec->opts.nr_cblocks = nr_cblocks_max;
> > > pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
> > > @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
> > > symbol__exit();
> > > auxtrace_record__free(rec->itr);
> > > out_opts:
> > > + record__free_thread_masks(rec, rec->nr_threads);
> > > + rec->nr_threads = 0;
> > > evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> > > return err;
> > > }
> > > --
> > > 2.19.0
> >
> > --
> >
> > - Arnaldo
>
> --
>
> - Arnaldo

--

- Arnaldo

2022-02-01 20:51:59

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

Em Mon, Jan 31, 2022 at 06:00:31PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Jan 17, 2022 at 09:34:21PM +0300, Alexey Bayduraev escreveu:
> > Introduce affinity and mmap thread masks. Thread affinity mask
> > defines CPUs that a thread is allowed to run on. Thread maps
> > mask defines mmap data buffers the thread serves to stream
> > profiling data from.
> >
> > Acked-by: Andi Kleen <[email protected]>
> > Acked-by: Namhyung Kim <[email protected]>
> > Reviewed-by: Riccardo Mancini <[email protected]>
> > Tested-by: Riccardo Mancini <[email protected]>
> > Signed-off-by: Alexey Bayduraev <[email protected]>
>
> Some simplifications I added here while reviewing this patchkit:
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 41998f2140cd5119..53b88c8600624237 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -2213,35 +2213,33 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
>
> static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> {
> - mask->nbits = nr_bits;
> mask->bits = bitmap_zalloc(mask->nbits);
> if (!mask->bits)
> return -ENOMEM;
>
> + mask->nbits = nr_bits;
> return 0;


Interesting, building it at this point in the patchkit didn't uncover
the bug I introduced, only later when this gets used I got the compiler
error and applied this on top:

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 53b88c8600624237..6b0e506df20c002a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -2213,7 +2213,7 @@ static int record__parse_affinity(const struct option *opt, const char *str, int

static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
{
- mask->bits = bitmap_zalloc(mask->nbits);
+ mask->bits = bitmap_zalloc(nbits);
if (!mask->bits)
return -ENOMEM;


> }
>
> static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> {
> bitmap_free(mask->bits);
> + mask->bits = NULL;
> mask->nbits = 0;
> }
>
> static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> {
> - int ret;
> + int ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
>
> - ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> if (ret) {
> mask->affinity.bits = NULL;
> return ret;
> }
>
> ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> - if (ret) {
> + if (ret)
> record__mmap_cpu_mask_free(&mask->maps);
> - mask->maps.bits = NULL;
> - }
>
> return ret;
> }
> @@ -2733,18 +2731,14 @@ struct option *record_options = __record_options;
>
> static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> {
> - int c;
> -
> - for (c = 0; c < cpus->nr; c++)
> + for (int c = 0; c < cpus->nr; c++)
> set_bit(cpus->map[c].cpu, mask->bits);
> }
>
> static void record__free_thread_masks(struct record *rec, int nr_threads)
> {
> - int t;
> -
> if (rec->thread_masks)
> - for (t = 0; t < nr_threads; t++)
> + for (int t = 0; t < nr_threads; t++)
> record__thread_mask_free(&rec->thread_masks[t]);
>
> zfree(&rec->thread_masks);
> @@ -2752,7 +2746,7 @@ static void record__free_thread_masks(struct record *rec, int nr_threads)
>
> static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> {
> - int t, ret;
> + int ret;
>
> rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> if (!rec->thread_masks) {
> @@ -2760,7 +2754,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
> return -ENOMEM;
> }
>
> - for (t = 0; t < nr_threads; t++) {
> + for (int t = 0; t < nr_threads; t++) {
> ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> if (ret) {
> pr_err("Failed to allocate thread masks[%d]\n", t);
> @@ -2778,9 +2772,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
>
> static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> {
> - int ret;
> -
> - ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> + int ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> if (ret)
> return ret;
>
>
>
> > ---
> > tools/perf/builtin-record.c | 123 ++++++++++++++++++++++++++++++++++++
> > 1 file changed, 123 insertions(+)
> >
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index bb716c953d02..41998f2140cd 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -87,6 +87,11 @@ struct switch_output {
> > int cur_file;
> > };
> >
> > +struct thread_mask {
> > + struct mmap_cpu_mask maps;
> > + struct mmap_cpu_mask affinity;
> > +};
> > +
> > struct record {
> > struct perf_tool tool;
> > struct record_opts opts;
> > @@ -112,6 +117,8 @@ struct record {
> > struct mmap_cpu_mask affinity_mask;
> > unsigned long output_max_size; /* = 0: unlimited */
> > struct perf_debuginfod debuginfod;
> > + int nr_threads;
> > + struct thread_mask *thread_masks;
> > };
> >
> > static volatile int done;
> > @@ -2204,6 +2211,47 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
> > return 0;
> > }
> >
> > +static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> > +{
> > + mask->nbits = nr_bits;
> > + mask->bits = bitmap_zalloc(mask->nbits);
> > + if (!mask->bits)
> > + return -ENOMEM;
> > +
> > + return 0;
> > +}
> > +
> > +static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> > +{
> > + bitmap_free(mask->bits);
> > + mask->nbits = 0;
> > +}
> > +
> > +static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> > +{
> > + int ret;
> > +
> > + ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> > + if (ret) {
> > + mask->affinity.bits = NULL;
> > + return ret;
> > + }
> > +
> > + ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> > + if (ret) {
> > + record__mmap_cpu_mask_free(&mask->maps);
> > + mask->maps.bits = NULL;
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static void record__thread_mask_free(struct thread_mask *mask)
> > +{
> > + record__mmap_cpu_mask_free(&mask->maps);
> > + record__mmap_cpu_mask_free(&mask->affinity);
> > +}
> > +
> > static int parse_output_max_size(const struct option *opt,
> > const char *str, int unset)
> > {
> > @@ -2683,6 +2731,73 @@ static struct option __record_options[] = {
> >
> > struct option *record_options = __record_options;
> >
> > +static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> > +{
> > + int c;
> > +
> > + for (c = 0; c < cpus->nr; c++)
> > + set_bit(cpus->map[c].cpu, mask->bits);
> > +}
> > +
> > +static void record__free_thread_masks(struct record *rec, int nr_threads)
> > +{
> > + int t;
> > +
> > + if (rec->thread_masks)
> > + for (t = 0; t < nr_threads; t++)
> > + record__thread_mask_free(&rec->thread_masks[t]);
> > +
> > + zfree(&rec->thread_masks);
> > +}
> > +
> > +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> > +{
> > + int t, ret;
> > +
> > + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> > + if (!rec->thread_masks) {
> > + pr_err("Failed to allocate thread masks\n");
> > + return -ENOMEM;
> > + }
> > +
> > + for (t = 0; t < nr_threads; t++) {
> > + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> > + if (ret) {
> > + pr_err("Failed to allocate thread masks[%d]\n", t);
> > + goto out_free;
> > + }
> > + }
> > +
> > + return 0;
> > +
> > +out_free:
> > + record__free_thread_masks(rec, nr_threads);
> > +
> > + return ret;
> > +}
> > +
> > +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> > +{
> > + int ret;
> > +
> > + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> > + if (ret)
> > + return ret;
> > +
> > + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
> > +
> > + rec->nr_threads = 1;
> > +
> > + return 0;
> > +}
> > +
> > +static int record__init_thread_masks(struct record *rec)
> > +{
> > + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
> > +
> > + return record__init_thread_default_masks(rec, cpus);
> > +}
> > +
> > int cmd_record(int argc, const char **argv)
> > {
> > int err;
> > @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
> > goto out;
> > }
> >
> > + err = record__init_thread_masks(rec);
> > + if (err) {
> > + pr_err("Failed to initialize parallel data streaming masks\n");
> > + goto out;
> > + }
> > +
> > if (rec->opts.nr_cblocks > nr_cblocks_max)
> > rec->opts.nr_cblocks = nr_cblocks_max;
> > pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
> > @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
> > symbol__exit();
> > auxtrace_record__free(rec->itr);
> > out_opts:
> > + record__free_thread_masks(rec, rec->nr_threads);
> > + rec->nr_threads = 0;
> > evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> > return err;
> > }
> > --
> > 2.19.0
>
> --
>
> - Arnaldo

--

- Arnaldo

2022-02-01 20:53:10

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

Em Mon, Jan 17, 2022 at 09:34:21PM +0300, Alexey Bayduraev escreveu:
> Introduce affinity and mmap thread masks. Thread affinity mask
> defines CPUs that a thread is allowed to run on. Thread maps
> mask defines mmap data buffers the thread serves to stream
> profiling data from.
>
> Acked-by: Andi Kleen <[email protected]>
> Acked-by: Namhyung Kim <[email protected]>
> Reviewed-by: Riccardo Mancini <[email protected]>
> Tested-by: Riccardo Mancini <[email protected]>
> Signed-off-by: Alexey Bayduraev <[email protected]>

Some simplifications I added here while reviewing this patchkit:

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 41998f2140cd5119..53b88c8600624237 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -2213,35 +2213,33 @@ static int record__parse_affinity(const struct option *opt, const char *str, int

static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
{
- mask->nbits = nr_bits;
mask->bits = bitmap_zalloc(mask->nbits);
if (!mask->bits)
return -ENOMEM;

+ mask->nbits = nr_bits;
return 0;
}

static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
{
bitmap_free(mask->bits);
+ mask->bits = NULL;
mask->nbits = 0;
}

static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
{
- int ret;
+ int ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);

- ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
if (ret) {
mask->affinity.bits = NULL;
return ret;
}

ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
- if (ret) {
+ if (ret)
record__mmap_cpu_mask_free(&mask->maps);
- mask->maps.bits = NULL;
- }

return ret;
}
@@ -2733,18 +2731,14 @@ struct option *record_options = __record_options;

static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
{
- int c;
-
- for (c = 0; c < cpus->nr; c++)
+ for (int c = 0; c < cpus->nr; c++)
set_bit(cpus->map[c].cpu, mask->bits);
}

static void record__free_thread_masks(struct record *rec, int nr_threads)
{
- int t;
-
if (rec->thread_masks)
- for (t = 0; t < nr_threads; t++)
+ for (int t = 0; t < nr_threads; t++)
record__thread_mask_free(&rec->thread_masks[t]);

zfree(&rec->thread_masks);
@@ -2752,7 +2746,7 @@ static void record__free_thread_masks(struct record *rec, int nr_threads)

static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
{
- int t, ret;
+ int ret;

rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
if (!rec->thread_masks) {
@@ -2760,7 +2754,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
return -ENOMEM;
}

- for (t = 0; t < nr_threads; t++) {
+ for (int t = 0; t < nr_threads; t++) {
ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
if (ret) {
pr_err("Failed to allocate thread masks[%d]\n", t);
@@ -2778,9 +2772,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr

static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
{
- int ret;
-
- ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
+ int ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
if (ret)
return ret;



> ---
> tools/perf/builtin-record.c | 123 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 123 insertions(+)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index bb716c953d02..41998f2140cd 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -87,6 +87,11 @@ struct switch_output {
> int cur_file;
> };
>
> +struct thread_mask {
> + struct mmap_cpu_mask maps;
> + struct mmap_cpu_mask affinity;
> +};
> +
> struct record {
> struct perf_tool tool;
> struct record_opts opts;
> @@ -112,6 +117,8 @@ struct record {
> struct mmap_cpu_mask affinity_mask;
> unsigned long output_max_size; /* = 0: unlimited */
> struct perf_debuginfod debuginfod;
> + int nr_threads;
> + struct thread_mask *thread_masks;
> };
>
> static volatile int done;
> @@ -2204,6 +2211,47 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
> return 0;
> }
>
> +static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> +{
> + mask->nbits = nr_bits;
> + mask->bits = bitmap_zalloc(mask->nbits);
> + if (!mask->bits)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> +{
> + bitmap_free(mask->bits);
> + mask->nbits = 0;
> +}
> +
> +static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> +{
> + int ret;
> +
> + ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> + if (ret) {
> + mask->affinity.bits = NULL;
> + return ret;
> + }
> +
> + ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> + if (ret) {
> + record__mmap_cpu_mask_free(&mask->maps);
> + mask->maps.bits = NULL;
> + }
> +
> + return ret;
> +}
> +
> +static void record__thread_mask_free(struct thread_mask *mask)
> +{
> + record__mmap_cpu_mask_free(&mask->maps);
> + record__mmap_cpu_mask_free(&mask->affinity);
> +}
> +
> static int parse_output_max_size(const struct option *opt,
> const char *str, int unset)
> {
> @@ -2683,6 +2731,73 @@ static struct option __record_options[] = {
>
> struct option *record_options = __record_options;
>
> +static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> +{
> + int c;
> +
> + for (c = 0; c < cpus->nr; c++)
> + set_bit(cpus->map[c].cpu, mask->bits);
> +}
> +
> +static void record__free_thread_masks(struct record *rec, int nr_threads)
> +{
> + int t;
> +
> + if (rec->thread_masks)
> + for (t = 0; t < nr_threads; t++)
> + record__thread_mask_free(&rec->thread_masks[t]);
> +
> + zfree(&rec->thread_masks);
> +}
> +
> +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> +{
> + int t, ret;
> +
> + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> + if (!rec->thread_masks) {
> + pr_err("Failed to allocate thread masks\n");
> + return -ENOMEM;
> + }
> +
> + for (t = 0; t < nr_threads; t++) {
> + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> + if (ret) {
> + pr_err("Failed to allocate thread masks[%d]\n", t);
> + goto out_free;
> + }
> + }
> +
> + return 0;
> +
> +out_free:
> + record__free_thread_masks(rec, nr_threads);
> +
> + return ret;
> +}
> +
> +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> +{
> + int ret;
> +
> + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> + if (ret)
> + return ret;
> +
> + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
> +
> + rec->nr_threads = 1;
> +
> + return 0;
> +}
> +
> +static int record__init_thread_masks(struct record *rec)
> +{
> + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
> +
> + return record__init_thread_default_masks(rec, cpus);
> +}
> +
> int cmd_record(int argc, const char **argv)
> {
> int err;
> @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
> goto out;
> }
>
> + err = record__init_thread_masks(rec);
> + if (err) {
> + pr_err("Failed to initialize parallel data streaming masks\n");
> + goto out;
> + }
> +
> if (rec->opts.nr_cblocks > nr_cblocks_max)
> rec->opts.nr_cblocks = nr_cblocks_max;
> pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
> @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
> symbol__exit();
> auxtrace_record__free(rec->itr);
> out_opts:
> + record__free_thread_masks(rec, rec->nr_threads);
> + rec->nr_threads = 0;
> evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> return err;
> }
> --
> 2.19.0

--

- Arnaldo

2022-02-02 16:06:30

by Bayduraev, Alexey V

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

On 01.02.2022 0:16, Arnaldo Carvalho de Melo wrote:
> Em Mon, Jan 31, 2022 at 06:00:31PM -0300, Arnaldo Carvalho de Melo escreveu:
>> Em Mon, Jan 17, 2022 at 09:34:21PM +0300, Alexey Bayduraev escreveu:
>>> Introduce affinity and mmap thread masks. Thread affinity mask
>>> defines CPUs that a thread is allowed to run on. Thread maps
>>> mask defines mmap data buffers the thread serves to stream
>>> profiling data from.
>>>
>>> Acked-by: Andi Kleen <[email protected]>
>>> Acked-by: Namhyung Kim <[email protected]>
>>> Reviewed-by: Riccardo Mancini <[email protected]>
>>> Tested-by: Riccardo Mancini <[email protected]>
>>> Signed-off-by: Alexey Bayduraev <[email protected]>
>>
>> Some simplifications I added here while reviewing this patchkit:
>
> But then, why allocate these even without using them? I.e. the init
> should be left for when we are sure that we'll actually use this, i.e.
> when the user asks for parallel mode.
>
> We already have lots of needless initializations, reading of files that
> may not be needed, so we should avoid doing things till we really know
> that we'll use those allocations, readings, etc.
>
> Anyway, continuing to review, will leave what I have at a separata
> branch so that we can continue from there.

In the current design, we assume that without the --threads option nr_threads==1
and we allocate rec->thread_masks and rec->thread_data as arrays of size 1.
We also move some variables from "struct record" into rec->thread_data and
use thread_data[0] (through the "thread" pointer) in the main thread instead of
"struct record". This simplifies the later implementation.

With another approach we could assume nr_threads==0 and use only the necessary
"struct record" variables, but this adds many if(record__threads_enabled()) checks.

Regards,
Alexey

>
> - Arnaldo
>
>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>> index 41998f2140cd5119..53b88c8600624237 100644
>> --- a/tools/perf/builtin-record.c
>> +++ b/tools/perf/builtin-record.c
>> @@ -2213,35 +2213,33 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
>>
>> static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
>> {
>> - mask->nbits = nr_bits;
>> mask->bits = bitmap_zalloc(mask->nbits);
>> if (!mask->bits)
>> return -ENOMEM;
>>
>> + mask->nbits = nr_bits;
>> return 0;
>> }
>>
>> static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
>> {
>> bitmap_free(mask->bits);
>> + mask->bits = NULL;
>> mask->nbits = 0;
>> }
>>
>> static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
>> {
>> - int ret;
>> + int ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
>>
>> - ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
>> if (ret) {
>> mask->affinity.bits = NULL;
>> return ret;
>> }
>>
>> ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
>> - if (ret) {
>> + if (ret)
>> record__mmap_cpu_mask_free(&mask->maps);
>> - mask->maps.bits = NULL;
>> - }
>>
>> return ret;
>> }
>> @@ -2733,18 +2731,14 @@ struct option *record_options = __record_options;
>>
>> static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
>> {
>> - int c;
>> -
>> - for (c = 0; c < cpus->nr; c++)
>> + for (int c = 0; c < cpus->nr; c++)
>> set_bit(cpus->map[c].cpu, mask->bits);
>> }
>>
>> static void record__free_thread_masks(struct record *rec, int nr_threads)
>> {
>> - int t;
>> -
>> if (rec->thread_masks)
>> - for (t = 0; t < nr_threads; t++)
>> + for (int t = 0; t < nr_threads; t++)
>> record__thread_mask_free(&rec->thread_masks[t]);
>>
>> zfree(&rec->thread_masks);
>> @@ -2752,7 +2746,7 @@ static void record__free_thread_masks(struct record *rec, int nr_threads)
>>
>> static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
>> {
>> - int t, ret;
>> + int ret;
>>
>> rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
>> if (!rec->thread_masks) {
>> @@ -2760,7 +2754,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
>> return -ENOMEM;
>> }
>>
>> - for (t = 0; t < nr_threads; t++) {
>> + for (int t = 0; t < nr_threads; t++) {
>> ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
>> if (ret) {
>> pr_err("Failed to allocate thread masks[%d]\n", t);
>> @@ -2778,9 +2772,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
>>
>> static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
>> {
>> - int ret;
>> -
>> - ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
>> + int ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
>> if (ret)
>> return ret;
>>
>>
>>
>>> ---
>>> tools/perf/builtin-record.c | 123 ++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 123 insertions(+)
>>>
>>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>>> index bb716c953d02..41998f2140cd 100644
>>> --- a/tools/perf/builtin-record.c
>>> +++ b/tools/perf/builtin-record.c
>>> @@ -87,6 +87,11 @@ struct switch_output {
>>> int cur_file;
>>> };
>>>
>>> +struct thread_mask {
>>> + struct mmap_cpu_mask maps;
>>> + struct mmap_cpu_mask affinity;
>>> +};
>>> +
>>> struct record {
>>> struct perf_tool tool;
>>> struct record_opts opts;
>>> @@ -112,6 +117,8 @@ struct record {
>>> struct mmap_cpu_mask affinity_mask;
>>> unsigned long output_max_size; /* = 0: unlimited */
>>> struct perf_debuginfod debuginfod;
>>> + int nr_threads;
>>> + struct thread_mask *thread_masks;
>>> };
>>>
>>> static volatile int done;
>>> @@ -2204,6 +2211,47 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
>>> return 0;
>>> }
>>>
>>> +static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
>>> +{
>>> + mask->nbits = nr_bits;
>>> + mask->bits = bitmap_zalloc(mask->nbits);
>>> + if (!mask->bits)
>>> + return -ENOMEM;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
>>> +{
>>> + bitmap_free(mask->bits);
>>> + mask->nbits = 0;
>>> +}
>>> +
>>> +static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
>>> +{
>>> + int ret;
>>> +
>>> + ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
>>> + if (ret) {
>>> + mask->affinity.bits = NULL;
>>> + return ret;
>>> + }
>>> +
>>> + ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
>>> + if (ret) {
>>> + record__mmap_cpu_mask_free(&mask->maps);
>>> + mask->maps.bits = NULL;
>>> + }
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static void record__thread_mask_free(struct thread_mask *mask)
>>> +{
>>> + record__mmap_cpu_mask_free(&mask->maps);
>>> + record__mmap_cpu_mask_free(&mask->affinity);
>>> +}
>>> +
>>> static int parse_output_max_size(const struct option *opt,
>>> const char *str, int unset)
>>> {
>>> @@ -2683,6 +2731,73 @@ static struct option __record_options[] = {
>>>
>>> struct option *record_options = __record_options;
>>>
>>> +static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
>>> +{
>>> + int c;
>>> +
>>> + for (c = 0; c < cpus->nr; c++)
>>> + set_bit(cpus->map[c].cpu, mask->bits);
>>> +}
>>> +
>>> +static void record__free_thread_masks(struct record *rec, int nr_threads)
>>> +{
>>> + int t;
>>> +
>>> + if (rec->thread_masks)
>>> + for (t = 0; t < nr_threads; t++)
>>> + record__thread_mask_free(&rec->thread_masks[t]);
>>> +
>>> + zfree(&rec->thread_masks);
>>> +}
>>> +
>>> +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
>>> +{
>>> + int t, ret;
>>> +
>>> + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
>>> + if (!rec->thread_masks) {
>>> + pr_err("Failed to allocate thread masks\n");
>>> + return -ENOMEM;
>>> + }
>>> +
>>> + for (t = 0; t < nr_threads; t++) {
>>> + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
>>> + if (ret) {
>>> + pr_err("Failed to allocate thread masks[%d]\n", t);
>>> + goto out_free;
>>> + }
>>> + }
>>> +
>>> + return 0;
>>> +
>>> +out_free:
>>> + record__free_thread_masks(rec, nr_threads);
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
>>> +{
>>> + int ret;
>>> +
>>> + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
>>> + if (ret)
>>> + return ret;
>>> +
>>> + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
>>> +
>>> + rec->nr_threads = 1;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int record__init_thread_masks(struct record *rec)
>>> +{
>>> + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
>>> +
>>> + return record__init_thread_default_masks(rec, cpus);
>>> +}
>>> +
>>> int cmd_record(int argc, const char **argv)
>>> {
>>> int err;
>>> @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
>>> goto out;
>>> }
>>>
>>> + err = record__init_thread_masks(rec);
>>> + if (err) {
>>> + pr_err("Failed to initialize parallel data streaming masks\n");
>>> + goto out;
>>> + }
>>> +
>>> if (rec->opts.nr_cblocks > nr_cblocks_max)
>>> rec->opts.nr_cblocks = nr_cblocks_max;
>>> pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
>>> @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
>>> symbol__exit();
>>> auxtrace_record__free(rec->itr);
>>> out_opts:
>>> + record__free_thread_masks(rec, rec->nr_threads);
>>> + rec->nr_threads = 0;
>>> evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
>>> return err;
>>> }
>>> --
>>> 2.19.0
>>
>> --
>>
>> - Arnaldo
>

2022-04-05 03:37:32

by Ian Rogers

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

On Mon, Jan 17, 2022 at 10:38 AM Alexey Bayduraev
<[email protected]> wrote:
>
> Introduce affinity and mmap thread masks. Thread affinity mask
> defines CPUs that a thread is allowed to run on. Thread maps
> mask defines mmap data buffers the thread serves to stream
> profiling data from.
>
> Acked-by: Andi Kleen <[email protected]>
> Acked-by: Namhyung Kim <[email protected]>
> Reviewed-by: Riccardo Mancini <[email protected]>
> Tested-by: Riccardo Mancini <[email protected]>
> Signed-off-by: Alexey Bayduraev <[email protected]>
> ---
> tools/perf/builtin-record.c | 123 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 123 insertions(+)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index bb716c953d02..41998f2140cd 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -87,6 +87,11 @@ struct switch_output {
> int cur_file;
> };
>
> +struct thread_mask {
> + struct mmap_cpu_mask maps;
> + struct mmap_cpu_mask affinity;
> +};
> +
> struct record {
> struct perf_tool tool;
> struct record_opts opts;
> @@ -112,6 +117,8 @@ struct record {
> struct mmap_cpu_mask affinity_mask;
> unsigned long output_max_size; /* = 0: unlimited */
> struct perf_debuginfod debuginfod;
> + int nr_threads;
> + struct thread_mask *thread_masks;
> };
>
> static volatile int done;
> @@ -2204,6 +2211,47 @@ static int record__parse_affinity(const struct option *opt, const char *str, int
> return 0;
> }
>
> +static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
> +{
> + mask->nbits = nr_bits;
> + mask->bits = bitmap_zalloc(mask->nbits);
> + if (!mask->bits)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
> +{
> + bitmap_free(mask->bits);
> + mask->nbits = 0;
> +}
> +
> +static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
> +{
> + int ret;
> +
> + ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
> + if (ret) {
> + mask->affinity.bits = NULL;
> + return ret;
> + }
> +
> + ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
> + if (ret) {
> + record__mmap_cpu_mask_free(&mask->maps);
> + mask->maps.bits = NULL;
> + }
> +
> + return ret;
> +}
> +
> +static void record__thread_mask_free(struct thread_mask *mask)
> +{
> + record__mmap_cpu_mask_free(&mask->maps);
> + record__mmap_cpu_mask_free(&mask->affinity);
> +}
> +
> static int parse_output_max_size(const struct option *opt,
> const char *str, int unset)
> {
> @@ -2683,6 +2731,73 @@ static struct option __record_options[] = {
>
> struct option *record_options = __record_options;
>
> +static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
> +{
> + int c;
> + for (c = 0; c < cpus->nr; c++)
> + set_bit(cpus->map[c].cpu, mask->bits);
> +}
> +

In per-thread mode it is possible that cpus is the dummy CPU map here.
This means that the cpu below has the value -1 and setting bit -1
actually has the effect of setting bit 63. Here is a reproduction
based on the acme/perf/core branch:

```
$ make STATIC=1 DEBUG=1 EXTRA_CFLAGS='-fno-omit-frame-pointer
-fsanitize=undefined -fno-sanitize-recover'
$ perf record -o /tmp/perf.data --per-thread true
tools/include/asm-generic/bitops/atomic.h:10:36: runtime error: shift
exponent -1 is negative
$ UBSAN_OPTIONS=abort_on_error=1 gdb --args perf record -o
/tmp/perf.data --per-thread true
(gdb) r
tools/include/asm-generic/bitops/atomic.h:10:36: runtime error: shift
exponent -1 is negative
(gdb) bt
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:49
#1 0x00007ffff71d2546 in __GI_abort () at abort.c:79
#2 0x00007ffff640db9f in __sanitizer::Abort () at
../../../../src/libsanitizer/sanitizer_common/sanitizer_posix_libcdep.cpp:151
#3 0x00007ffff6418efc in __sanitizer::Die () at
../../../../src/libsanitizer/sanitizer_common/sanitizer_termination.cpp:58
#4 0x00007ffff63fd99e in
__ubsan::__ubsan_handle_shift_out_of_bounds_abort (Data=<optimized
out>, LHS=<optimized out>,
RHS=<optimized out>) at
../../../../src/libsanitizer/ubsan/ubsan_handlers.cpp:378
#5 0x0000555555c54405 in set_bit (nr=-1, addr=0x555556ecd0a0)
at tools/include/asm-generic/bitops/atomic.h:10
#6 0x0000555555c6ddaf in record__mmap_cpu_mask_init
(mask=0x555556ecd070, cpus=0x555556ecd050) at builtin-record.c:3333
#7 0x0000555555c7044c in record__init_thread_default_masks
(rec=0x55555681b100 <record>, cpus=0x555556ecd050) at
builtin-record.c:3668
#8 0x0000555555c705b3 in record__init_thread_masks
(rec=0x55555681b100 <record>) at builtin-record.c:3681
#9 0x0000555555c7297a in cmd_record (argc=1, argv=0x7fffffffdcc0) at
builtin-record.c:3976
#10 0x0000555555e06d41 in run_builtin (p=0x555556827538
<commands+216>, argc=5, argv=0x7fffffffdcc0) at perf.c:313
#11 0x0000555555e07253 in handle_internal_command (argc=5,
argv=0x7fffffffdcc0) at perf.c:365
#12 0x0000555555e07508 in run_argv (argcp=0x7fffffffdb0c,
argv=0x7fffffffdb00) at perf.c:409
#13 0x0000555555e07b32 in main (argc=5, argv=0x7fffffffdcc0) at perf.c:539
```

Not setting any bits in mask->bits when the cpu map is dummy causes no data to
be written. Setting bit 0 in mask->bits causes a segv. Setting bit 63 works,
but it feels like there are more invariants broken in the code.

Here is a workaround patch (admittedly not a good one):

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ba74fab02e62..62727b676f98 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -3329,6 +3329,11 @@ static void record__mmap_cpu_mask_init(struct
mmap_cpu_mask *mask, struct perf_c
{
int c;

+ if (cpu_map__is_dummy(cpus)) {
+ set_bit(63, mask->bits);
+ return;
+ }
+
for (c = 0; c < cpus->nr; c++)
set_bit(cpus->map[c].cpu, mask->bits);
}

Alexey, what should the expected behavior be with per-thread mmaps?

Thanks,
Ian

> +static void record__free_thread_masks(struct record *rec, int nr_threads)
> +{
> + int t;
> +
> + if (rec->thread_masks)
> + for (t = 0; t < nr_threads; t++)
> + record__thread_mask_free(&rec->thread_masks[t]);
> +
> + zfree(&rec->thread_masks);
> +}
> +
> +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> +{
> + int t, ret;
> +
> + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> + if (!rec->thread_masks) {
> + pr_err("Failed to allocate thread masks\n");
> + return -ENOMEM;
> + }
> +
> + for (t = 0; t < nr_threads; t++) {
> + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> + if (ret) {
> + pr_err("Failed to allocate thread masks[%d]\n", t);
> + goto out_free;
> + }
> + }
> +
> + return 0;
> +
> +out_free:
> + record__free_thread_masks(rec, nr_threads);
> +
> + return ret;
> +}
> +
> +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> +{
> + int ret;
> +
> + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> + if (ret)
> + return ret;
> +
> + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
> +
> + rec->nr_threads = 1;
> +
> + return 0;
> +}
> +
> +static int record__init_thread_masks(struct record *rec)
> +{
> + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
> +
> + return record__init_thread_default_masks(rec, cpus);
> +}
> +
> int cmd_record(int argc, const char **argv)
> {
> int err;
> @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
> goto out;
> }
>
> + err = record__init_thread_masks(rec);
> + if (err) {
> + pr_err("Failed to initialize parallel data streaming masks\n");
> + goto out;
> + }
> +
> if (rec->opts.nr_cblocks > nr_cblocks_max)
> rec->opts.nr_cblocks = nr_cblocks_max;
> pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
> @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
> symbol__exit();
> auxtrace_record__free(rec->itr);
> out_opts:
> + record__free_thread_masks(rec, rec->nr_threads);
> + rec->nr_threads = 0;
> evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> return err;
> }
> --
> 2.19.0
>

2022-04-06 12:26:00

by Bayduraev, Alexey V

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

On 05.04.2022 1:25, Ian Rogers wrote:
> On Mon, Jan 17, 2022 at 10:38 AM Alexey Bayduraev
> <[email protected]> wrote:
>>
>> Introduce affinity and mmap thread masks. Thread affinity mask

<SNIP>

>
> In per-thread mode it is possible that cpus is the dummy CPU map here.
> This means that the cpu below has the value -1 and setting bit -1
> actually has the effect of setting bit 63. Here is a reproduction
> based on the acme/perf/core branch:
>
> ```
> $ make STATIC=1 DEBUG=1 EXTRA_CFLAGS='-fno-omit-frame-pointer
> -fsanitize=undefined -fno-sanitize-recover'
> $ perf record -o /tmp/perf.data --per-thread true
> tools/include/asm-generic/bitops/atomic.h:10:36: runtime error: shift
> exponent -1 is negative
> $ UBSAN_OPTIONS=abort_on_error=1 gdb --args perf record -o
> /tmp/perf.data --per-thread true
> (gdb) r
> tools/include/asm-generic/bitops/atomic.h:10:36: runtime error: shift
> exponent -1 is negative
> (gdb) bt
> #0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:49
> #1 0x00007ffff71d2546 in __GI_abort () at abort.c:79
> #2 0x00007ffff640db9f in __sanitizer::Abort () at
> ../../../../src/libsanitizer/sanitizer_common/sanitizer_posix_libcdep.cpp:151
> #3 0x00007ffff6418efc in __sanitizer::Die () at
> ../../../../src/libsanitizer/sanitizer_common/sanitizer_termination.cpp:58
> #4 0x00007ffff63fd99e in
> __ubsan::__ubsan_handle_shift_out_of_bounds_abort (Data=<optimized
> out>, LHS=<optimized out>,
> RHS=<optimized out>) at
> ../../../../src/libsanitizer/ubsan/ubsan_handlers.cpp:378
> #5 0x0000555555c54405 in set_bit (nr=-1, addr=0x555556ecd0a0)
> at tools/include/asm-generic/bitops/atomic.h:10
> #6 0x0000555555c6ddaf in record__mmap_cpu_mask_init
> (mask=0x555556ecd070, cpus=0x555556ecd050) at builtin-record.c:3333
> #7 0x0000555555c7044c in record__init_thread_default_masks
> (rec=0x55555681b100 <record>, cpus=0x555556ecd050) at
> builtin-record.c:3668
> #8 0x0000555555c705b3 in record__init_thread_masks
> (rec=0x55555681b100 <record>) at builtin-record.c:3681
> #9 0x0000555555c7297a in cmd_record (argc=1, argv=0x7fffffffdcc0) at
> builtin-record.c:3976
> #10 0x0000555555e06d41 in run_builtin (p=0x555556827538
> <commands+216>, argc=5, argv=0x7fffffffdcc0) at perf.c:313
> #11 0x0000555555e07253 in handle_internal_command (argc=5,
> argv=0x7fffffffdcc0) at perf.c:365
> #12 0x0000555555e07508 in run_argv (argcp=0x7fffffffdb0c,
> argv=0x7fffffffdb00) at perf.c:409
> #13 0x0000555555e07b32 in main (argc=5, argv=0x7fffffffdcc0) at perf.c:539
> ```
>
> Not setting the mask->bits if the cpu map is dummy causes no data to
> be written. Setting mask->bits 0 causes a segv. Setting bit 63 works
> but feels like there are more invariants broken in the code.
>
> Here is a not good workaround patch:
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index ba74fab02e62..62727b676f98 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -3329,6 +3329,11 @@ static void record__mmap_cpu_mask_init(struct
> mmap_cpu_mask *mask, struct perf_c
> {
> int c;
>
> + if (cpu_map__is_dummy(cpus)) {
> + set_bit(63, mask->bits);
> + return;
> + }
> +
> for (c = 0; c < cpus->nr; c++)
> set_bit(cpus->map[c].cpu, mask->bits);
> }
>
> Alexey, what should the expected behavior be with per-thread mmaps?
>
> Thanks,
> Ian

Thanks a lot,

In case of per-thread mmaps we should initialize thread_data[0]->maps[i] by
evlist->mmap[i]. Looks like this was missed by this patchset.

Your patch works because it triggers test_bit() in record__thread_data_init_maps(),
so the thread_data maps get correctly initialized.

However, it's better to ignore thread_data->masks in record__mmap_cpu_mask_init()
and setup thread_data maps explicitly for per-thread case.

Also, to prevent more runtime crashes, --per-thread and --threads
options should be mutually exclusive.

I will prepare a fix for this issue soon.

Regards,
Alexey

>
>> +static void record__free_thread_masks(struct record *rec, int nr_threads)
>> +{
>> + int t;
>> +
>> + if (rec->thread_masks)
>> + for (t = 0; t < nr_threads; t++)
>> + record__thread_mask_free(&rec->thread_masks[t]);
>> +
>> + zfree(&rec->thread_masks);
>> +}
>> +
>> +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
>> +{
>> + int t, ret;
>> +
>> + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
>> + if (!rec->thread_masks) {
>> + pr_err("Failed to allocate thread masks\n");
>> + return -ENOMEM;
>> + }
>> +
>> + for (t = 0; t < nr_threads; t++) {
>> + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
>> + if (ret) {
>> + pr_err("Failed to allocate thread masks[%d]\n", t);
>> + goto out_free;
>> + }
>> + }
>> +
>> + return 0;
>> +
>> +out_free:
>> + record__free_thread_masks(rec, nr_threads);
>> +
>> + return ret;
>> +}
>> +
>> +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
>> +{
>> + int ret;
>> +
>> + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
>> + if (ret)
>> + return ret;
>> +
>> + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
>> +
>> + rec->nr_threads = 1;
>> +
>> + return 0;
>> +}
>> +
>> +static int record__init_thread_masks(struct record *rec)
>> +{
>> + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
>> +
>> + return record__init_thread_default_masks(rec, cpus);
>> +}
>> +
>> int cmd_record(int argc, const char **argv)
>> {
>> int err;
>> @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
>> goto out;
>> }
>>
>> + err = record__init_thread_masks(rec);
>> + if (err) {
>> + pr_err("Failed to initialize parallel data streaming masks\n");
>> + goto out;
>> + }
>> +
>> if (rec->opts.nr_cblocks > nr_cblocks_max)
>> rec->opts.nr_cblocks = nr_cblocks_max;
>> pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
>> @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
>> symbol__exit();
>> auxtrace_record__free(rec->itr);
>> out_opts:
>> + record__free_thread_masks(rec, rec->nr_threads);
>> + rec->nr_threads = 0;
>> evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
>> return err;
>> }
>> --
>> 2.19.0
>>

2022-04-06 18:26:51

by Ian Rogers

[permalink] [raw]
Subject: Re: [PATCH v13 01/16] perf record: Introduce thread affinity and mmap masks

On Tue, Apr 5, 2022 at 9:21 AM Bayduraev, Alexey V
<[email protected]> wrote:
>
> On 05.04.2022 1:25, Ian Rogers wrote:
> > On Mon, Jan 17, 2022 at 10:38 AM Alexey Bayduraev
> > <[email protected]> wrote:
> >>
> >> Introduce affinity and mmap thread masks. Thread affinity mask
>
> <SNIP>
>
> >
> > In per-thread mode it is possible that cpus is the dummy CPU map here.
> > This means that the cpu below has the value -1 and setting bit -1
> > actually has the effect of setting bit 63. Here is a reproduction
> > based on the acme/perf/core branch:
> >
> > ```
> > $ make STATIC=1 DEBUG=1 EXTRA_CFLAGS='-fno-omit-frame-pointer
> > -fsanitize=undefined -fno-sanitize-recover'
> > $ perf record -o /tmp/perf.data --per-thread true
> > tools/include/asm-generic/bitops/atomic.h:10:36: runtime error: shift
> > exponent -1 is negative
> > $ UBSAN_OPTIONS=abort_on_error=1 gdb --args perf record -o
> > /tmp/perf.data --per-thread true
> > (gdb) r
> > tools/include/asm-generic/bitops/atomic.h:10:36: runtime error: shift
> > exponent -1 is negative
> > (gdb) bt
> > #0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:49
> > #1 0x00007ffff71d2546 in __GI_abort () at abort.c:79
> > #2 0x00007ffff640db9f in __sanitizer::Abort () at
> > ../../../../src/libsanitizer/sanitizer_common/sanitizer_posix_libcdep.cpp:151
> > #3 0x00007ffff6418efc in __sanitizer::Die () at
> > ../../../../src/libsanitizer/sanitizer_common/sanitizer_termination.cpp:58
> > #4 0x00007ffff63fd99e in
> > __ubsan::__ubsan_handle_shift_out_of_bounds_abort (Data=<optimized
> > out>, LHS=<optimized out>,
> > RHS=<optimized out>) at
> > ../../../../src/libsanitizer/ubsan/ubsan_handlers.cpp:378
> > #5 0x0000555555c54405 in set_bit (nr=-1, addr=0x555556ecd0a0)
> > at tools/include/asm-generic/bitops/atomic.h:10
> > #6 0x0000555555c6ddaf in record__mmap_cpu_mask_init
> > (mask=0x555556ecd070, cpus=0x555556ecd050) at builtin-record.c:3333
> > #7 0x0000555555c7044c in record__init_thread_default_masks
> > (rec=0x55555681b100 <record>, cpus=0x555556ecd050) at
> > builtin-record.c:3668
> > #8 0x0000555555c705b3 in record__init_thread_masks
> > (rec=0x55555681b100 <record>) at builtin-record.c:3681
> > #9 0x0000555555c7297a in cmd_record (argc=1, argv=0x7fffffffdcc0) at
> > builtin-record.c:3976
> > #10 0x0000555555e06d41 in run_builtin (p=0x555556827538
> > <commands+216>, argc=5, argv=0x7fffffffdcc0) at perf.c:313
> > #11 0x0000555555e07253 in handle_internal_command (argc=5,
> > argv=0x7fffffffdcc0) at perf.c:365
> > #12 0x0000555555e07508 in run_argv (argcp=0x7fffffffdb0c,
> > argv=0x7fffffffdb00) at perf.c:409
> > #13 0x0000555555e07b32 in main (argc=5, argv=0x7fffffffdcc0) at perf.c:539
> > ```
> >
> > Not setting mask->bits when the cpu map is dummy causes no data to
> > be written. Setting bit 0 of mask->bits causes a segv. Setting bit 63
> > works, but it feels like more invariants are broken in the code.
> >
> > Here is a not good workaround patch:
> >
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index ba74fab02e62..62727b676f98 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -3329,6 +3329,11 @@ static void record__mmap_cpu_mask_init(struct
> > mmap_cpu_mask *mask, struct perf_c
> > {
> > int c;
> >
> > + if (cpu_map__is_dummy(cpus)) {
> > + set_bit(63, mask->bits);
> > + return;
> > + }
> > +
> > for (c = 0; c < cpus->nr; c++)
> > set_bit(cpus->map[c].cpu, mask->bits);
> > }
> >
> > Alexey, what should the expected behavior be with per-thread mmaps?
> >
> > Thanks,
> > Ian
>
> Thanks a lot,
>
> In the case of per-thread mmaps we should initialize thread_data[0]->maps[i]
> from evlist->mmap[i]. It looks like this was missed by this patchset.
>
> Your patch works because it triggers test_bit() in record__thread_data_init_maps(),
> so the thread_data maps get correctly initialized.
>
> However, it's better to ignore thread_data->masks in record__mmap_cpu_mask_init()
> and setup thread_data maps explicitly for per-thread case.
>
> Also, to prevent more runtime crashes, --per-thread and --threads
> options should be mutually exclusive.
>
> I will prepare a fix for this issue soon.

Hi Alexey,

sorry to nag — I'm being nagged myself. Is there an ETA on the fix? Would
it be pragmatic to roll back for now?

Thanks,
Ian

> Regards,
> Alexey
>
> >
> >> +static void record__free_thread_masks(struct record *rec, int nr_threads)
> >> +{
> >> + int t;
> >> +
> >> + if (rec->thread_masks)
> >> + for (t = 0; t < nr_threads; t++)
> >> + record__thread_mask_free(&rec->thread_masks[t]);
> >> +
> >> + zfree(&rec->thread_masks);
> >> +}
> >> +
> >> +static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
> >> +{
> >> + int t, ret;
> >> +
> >> + rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
> >> + if (!rec->thread_masks) {
> >> + pr_err("Failed to allocate thread masks\n");
> >> + return -ENOMEM;
> >> + }
> >> +
> >> + for (t = 0; t < nr_threads; t++) {
> >> + ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
> >> + if (ret) {
> >> + pr_err("Failed to allocate thread masks[%d]\n", t);
> >> + goto out_free;
> >> + }
> >> + }
> >> +
> >> + return 0;
> >> +
> >> +out_free:
> >> + record__free_thread_masks(rec, nr_threads);
> >> +
> >> + return ret;
> >> +}
> >> +
> >> +static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
> >> +{
> >> + int ret;
> >> +
> >> + ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
> >> + if (ret)
> >> + return ret;
> >> +
> >> + record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
> >> +
> >> + rec->nr_threads = 1;
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +static int record__init_thread_masks(struct record *rec)
> >> +{
> >> + struct perf_cpu_map *cpus = rec->evlist->core.cpus;
> >> +
> >> + return record__init_thread_default_masks(rec, cpus);
> >> +}
> >> +
> >> int cmd_record(int argc, const char **argv)
> >> {
> >> int err;
> >> @@ -2948,6 +3063,12 @@ int cmd_record(int argc, const char **argv)
> >> goto out;
> >> }
> >>
> >> + err = record__init_thread_masks(rec);
> >> + if (err) {
> >> + pr_err("Failed to initialize parallel data streaming masks\n");
> >> + goto out;
> >> + }
> >> +
> >> if (rec->opts.nr_cblocks > nr_cblocks_max)
> >> rec->opts.nr_cblocks = nr_cblocks_max;
> >> pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
> >> @@ -2966,6 +3087,8 @@ int cmd_record(int argc, const char **argv)
> >> symbol__exit();
> >> auxtrace_record__free(rec->itr);
> >> out_opts:
> >> + record__free_thread_masks(rec, rec->nr_threads);
> >> + rec->nr_threads = 0;
> >> evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> >> return err;
> >> }
> >> --
> >> 2.19.0
> >>