Introduce a thread-local variable and use it for threaded trace streaming.
Signed-off-by: Alexey Budankov <[email protected]>
---
tools/perf/builtin-record.c | 71 ++++++++++++++++++++++++++++++++-----
1 file changed, 62 insertions(+), 9 deletions(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 89cb8e913fb3..3b7e9026f25b 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -101,6 +101,8 @@ struct thread_data {
u64 bytes_written;
};
+static __thread struct thread_data *thread;
+
struct record {
struct perf_tool tool;
struct record_opts opts;
@@ -587,7 +589,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
}
}
- rec->samples++;
+ if (thread)
+ thread->samples++;
+ else
+ rec->samples++;
+
return record__write(rec, map, bf, compressed);
}
@@ -1258,6 +1264,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
int i;
int rc = 0;
struct mmap *maps;
+ int nr_mmaps;
int trace_fd = rec->data.file.fd;
off_t off = 0;
@@ -1265,6 +1272,14 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
return 0;
maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
+ nr_mmaps = evlist->core.nr_mmaps;
+
+ if (thread) {
+ bytes_written = thread->bytes_written;
+ maps = thread->maps;
+ nr_mmaps = thread->nr_mmaps;
+ }
+
if (!maps)
return 0;
@@ -1274,7 +1289,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
if (record__aio_enabled(rec))
off = record__aio_get_pos(trace_fd);
- for (i = 0; i < evlist->core.nr_mmaps; i++) {
+ for (i = 0; i < nr_mmaps; i++) {
u64 flush = 0;
struct mmap *map = &maps[i];
@@ -1323,7 +1338,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
* because per-cpu maps and files have data
* sorted by kernel.
*/
- if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
+ if (!thread && bytes_written != rec->bytes_written)
rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
if (overwrite)
@@ -1343,6 +1358,15 @@ static int record__mmap_read_all(struct record *rec, bool synch)
return record__mmap_read_evlist(rec, rec->evlist, true, synch);
}
+static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
+ void *arg __maybe_unused)
+{
+ struct perf_mmap *map = fda->priv[fd].ptr;
+
+ if (map)
+ perf_mmap__put(map);
+}
+
static void record__init_features(struct record *rec)
{
struct perf_session *session = rec->session;
@@ -2020,7 +2044,12 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
trigger_ready(&switch_output_trigger);
perf_hooks__invoke_record_start();
for (;;) {
- unsigned long long hits = rec->samples;
+ unsigned long long hits0, hits1;
+
+ if (thread)
+ hits0 = thread->samples;
+ else
+ hits0 = rec->samples;
/*
* rec->evlist->bkw_mmap_state is possible to be
@@ -2089,20 +2118,44 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
alarm(rec->switch_output.time);
}
- if (hits == rec->samples) {
+ if (thread)
+ hits1 = thread->samples;
+ else
+ hits1 = rec->samples;
+
+ if (hits0 == hits1) {
if (done || draining)
break;
- err = evlist__poll(rec->evlist, -1);
+
+ if (thread)
+ err = fdarray__poll(&thread->pollfd, -1);
+ else
+ err = evlist__poll(rec->evlist, -1);
/*
* Propagate error, only if there's any. Ignore positive
* number of returned events and interrupt error.
*/
if (err > 0 || (err < 0 && errno == EINTR))
err = 0;
- waking++;
- if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
- draining = true;
+ if (thread) {
+ thread->waking++;
+ if (thread->ctlfd_pos != -1) {
+ evlist__ctlfd_update(rec->evlist,
+ &(thread->pollfd.entries[thread->ctlfd_pos]));
+ }
+ } else {
+ waking++;
+ }
+
+ if (thread) {
+ if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
+ record__thread_munmap_filtered, NULL) == 0)
+ draining = true;
+ } else {
+ if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
+ draining = true;
+ }
}
if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
--
2.24.1
On Wed, Oct 21, 2020 at 07:07:00PM +0300, Alexey Budankov wrote:
>
> Introduce thread local variable and use it for threaded trace streaming.
>
> Signed-off-by: Alexey Budankov <[email protected]>
> ---
> tools/perf/builtin-record.c | 71 ++++++++++++++++++++++++++++++++-----
> 1 file changed, 62 insertions(+), 9 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 89cb8e913fb3..3b7e9026f25b 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -101,6 +101,8 @@ struct thread_data {
> u64 bytes_written;
> };
>
> +static __thread struct thread_data *thread;
> +
> struct record {
> struct perf_tool tool;
> struct record_opts opts;
> @@ -587,7 +589,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
> }
> }
>
> - rec->samples++;
> + if (thread)
> + thread->samples++;
> + else
> + rec->samples++;
this is really wrong, let's keep just a single samples counter
ditto for all the other places in this patch
SNIP
> - if (hits == rec->samples) {
> + if (thread)
> + hits1 = thread->samples;
> + else
> + hits1 = rec->samples;
> +
> + if (hits0 == hits1) {
> if (done || draining)
> break;
> - err = evlist__poll(rec->evlist, -1);
> +
> + if (thread)
> + err = fdarray__poll(&thread->pollfd, -1);
> + else
> + err = evlist__poll(rec->evlist, -1);
same here, why do we have the __thread struct then?
jirka
On 24.10.2020 18:43, Jiri Olsa wrote:
> On Wed, Oct 21, 2020 at 07:07:00PM +0300, Alexey Budankov wrote:
>>
>> Introduce thread local variable and use it for threaded trace streaming.
>>
>> Signed-off-by: Alexey Budankov <[email protected]>
>> ---
>> tools/perf/builtin-record.c | 71 ++++++++++++++++++++++++++++++++-----
>> 1 file changed, 62 insertions(+), 9 deletions(-)
>>
>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>> index 89cb8e913fb3..3b7e9026f25b 100644
>> --- a/tools/perf/builtin-record.c
>> +++ b/tools/perf/builtin-record.c
>> @@ -101,6 +101,8 @@ struct thread_data {
>> u64 bytes_written;
>> };
>>
>> +static __thread struct thread_data *thread;
>> +
>> struct record {
>> struct perf_tool tool;
>> struct record_opts opts;
>> @@ -587,7 +589,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
>> }
>> }
>>
>> - rec->samples++;
>> + if (thread)
>> + thread->samples++;
>> + else
>> + rec->samples++;
>
> this is really wrong, let's keep just single samples counter
> ditto for all the other places in this patch
This does look like data parallelism [1], which fits threaded trace
streaming well, so your prototype design looks optimal.
For this specific place, incrementing a global counter in memory is
less performant and faces scalability limitations as the number of
cores grows.
Not sure why you have changed your mind.
Alexei
[1] https://en.wikipedia.org/wiki/Data_parallelism#:~:text=Data%20parallelism%20is%20parallelization%20across,on%20each%20element%20in%20parallel.
On Mon, Oct 26, 2020 at 11:21:28AM +0300, Alexei Budankov wrote:
>
> On 24.10.2020 18:43, Jiri Olsa wrote:
> > On Wed, Oct 21, 2020 at 07:07:00PM +0300, Alexey Budankov wrote:
> >>
> >> Introduce thread local variable and use it for threaded trace streaming.
> >>
> >> Signed-off-by: Alexey Budankov <[email protected]>
> >> ---
> >> tools/perf/builtin-record.c | 71 ++++++++++++++++++++++++++++++++-----
> >> 1 file changed, 62 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> >> index 89cb8e913fb3..3b7e9026f25b 100644
> >> --- a/tools/perf/builtin-record.c
> >> +++ b/tools/perf/builtin-record.c
> >> @@ -101,6 +101,8 @@ struct thread_data {
> >> u64 bytes_written;
> >> };
> >>
> >> +static __thread struct thread_data *thread;
> >> +
> >> struct record {
> >> struct perf_tool tool;
> >> struct record_opts opts;
> >> @@ -587,7 +589,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
> >> }
> >> }
> >>
> >> - rec->samples++;
> >> + if (thread)
> >> + thread->samples++;
> >> + else
> >> + rec->samples++;
> >
> > this is really wrong, let's keep just single samples counter
> > ditto for all the other places in this patch
>
> This does look like data parallelism [1] which is very true for
> threaded trace streaming so your prototype design looks optimal.
>
> For this specific place incrementing global counter in memory is
> less performant and faces scalability limitations as a number of
> cores grow.
>
> Not sure why you have changed your mind.
I'm not sure I follow.. what I'm complaining about is having
the 'samples' stat variable in separate locations for --threads
and --no-threads mode
jirka
>
> Alexei
>
> [1] https://en.wikipedia.org/wiki/Data_parallelism#:~:text=Data%20parallelism%20is%20parallelization%20across,on%20each%20element%20in%20parallel.
>
On 26.10.2020 13:34, Jiri Olsa wrote:
> On Mon, Oct 26, 2020 at 11:21:28AM +0300, Alexei Budankov wrote:
>>
>> On 24.10.2020 18:43, Jiri Olsa wrote:
>>> On Wed, Oct 21, 2020 at 07:07:00PM +0300, Alexey Budankov wrote:
>>>>
>>>> Introduce thread local variable and use it for threaded trace streaming.
>>>>
>>>> Signed-off-by: Alexey Budankov <[email protected]>
>>>> ---
>>>> tools/perf/builtin-record.c | 71 ++++++++++++++++++++++++++++++++-----
>>>> 1 file changed, 62 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>>>> index 89cb8e913fb3..3b7e9026f25b 100644
>>>> --- a/tools/perf/builtin-record.c
>>>> +++ b/tools/perf/builtin-record.c
>>>> @@ -101,6 +101,8 @@ struct thread_data {
>>>> u64 bytes_written;
>>>> };
>>>>
>>>> +static __thread struct thread_data *thread;
>>>> +
>>>> struct record {
>>>> struct perf_tool tool;
>>>> struct record_opts opts;
>>>> @@ -587,7 +589,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
>>>> }
>>>> }
>>>>
>>>> - rec->samples++;
>>>> + if (thread)
>>>> + thread->samples++;
>>>> + else
>>>> + rec->samples++;
>>>
>>> this is really wrong, let's keep just single samples counter
>>> ditto for all the other places in this patch
>>
>> This does look like data parallelism [1] which is very true for
>> threaded trace streaming so your prototype design looks optimal.
>>
>> For this specific place incrementing global counter in memory is
>> less performant and faces scalability limitations as a number of
>> cores grow.
>>
>> Not sure why you have changed your mind.
>
> I'm not sure I follow.. what I'm complaining about is to have
> 'samples' stat variable in separate locations for --threads
> and --no-threads mode
It is optimal to have the samples variable as a per-thread one
and then sum up the total at the end of data collection.
A single global variable design has scalability and performance
drawbacks.
Why do you complain about a per-thread variable in this case?
It looks like it ideally fits these specific needs.
Alexei
>
> jirka
>
>>
>> Alexei
>>
>> [1] https://en.wikipedia.org/wiki/Data_parallelism#:~:text=Data%20parallelism%20is%20parallelization%20across,on%20each%20element%20in%20parallel.
>>
>
On 27.10.2020 15:01, Jiri Olsa wrote:
> On Mon, Oct 26, 2020 at 05:11:30PM +0300, Alexei Budankov wrote:
<SNIP>
>> Why do you complain about per thread variable in this case?
>> It looks like ideally fits these specific needs.
>
> I think there's misunderstanding.. I think we should move
> samples to per thread 'thread' object and have just one
> copy of that.. and do not increase separate variables for
> thread and non-thread cases
Ah, I see: the main thread uses the same __thread object in both
serial and threaded modes. That makes sense.
I will correct this in v3.
Alexei
On Mon, Oct 26, 2020 at 05:11:30PM +0300, Alexei Budankov wrote:
>
> On 26.10.2020 13:34, Jiri Olsa wrote:
> > On Mon, Oct 26, 2020 at 11:21:28AM +0300, Alexei Budankov wrote:
> >>
> >> On 24.10.2020 18:43, Jiri Olsa wrote:
> >>> On Wed, Oct 21, 2020 at 07:07:00PM +0300, Alexey Budankov wrote:
> >>>>
> >>>> Introduce thread local variable and use it for threaded trace streaming.
> >>>>
> >>>> Signed-off-by: Alexey Budankov <[email protected]>
> >>>> ---
> >>>> tools/perf/builtin-record.c | 71 ++++++++++++++++++++++++++++++++-----
> >>>> 1 file changed, 62 insertions(+), 9 deletions(-)
> >>>>
> >>>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> >>>> index 89cb8e913fb3..3b7e9026f25b 100644
> >>>> --- a/tools/perf/builtin-record.c
> >>>> +++ b/tools/perf/builtin-record.c
> >>>> @@ -101,6 +101,8 @@ struct thread_data {
> >>>> u64 bytes_written;
> >>>> };
> >>>>
> >>>> +static __thread struct thread_data *thread;
> >>>> +
> >>>> struct record {
> >>>> struct perf_tool tool;
> >>>> struct record_opts opts;
> >>>> @@ -587,7 +589,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
> >>>> }
> >>>> }
> >>>>
> >>>> - rec->samples++;
> >>>> + if (thread)
> >>>> + thread->samples++;
> >>>> + else
> >>>> + rec->samples++;
> >>>
> >>> this is really wrong, let's keep just single samples counter
> >>> ditto for all the other places in this patch
> >>
> >> This does look like data parallelism [1] which is very true for
> >> threaded trace streaming so your prototype design looks optimal.
> >>
> >> For this specific place incrementing global counter in memory is
> >> less performant and faces scalability limitations as a number of
> >> cores grow.
> >>
> >> Not sure why you have changed your mind.
> >
> > I'm not sure I follow.. what I'm complaining about is to have
> > 'samples' stat variable in separate locations for --threads
> > and --no-threads mode
>
> It is optimal to have samples variable as per thread one
> and then sum up the total in the end of data collection.
>
> Single global variable design has scalability and performance
> drawbacks.
>
> Why do you complain about per thread variable in this case?
> It looks like ideally fits these specific needs.
I think there's a misunderstanding.. I think we should move
samples to the per-thread 'thread' object and have just one
copy of that.. and not increment separate variables for
thread and non-thread cases
jirka
On 27.10.2020 15:01, Jiri Olsa wrote:
> On Mon, Oct 26, 2020 at 05:11:30PM +0300, Alexei Budankov wrote:
>>
>> On 26.10.2020 13:34, Jiri Olsa wrote:
>>> On Mon, Oct 26, 2020 at 11:21:28AM +0300, Alexei Budankov wrote:
>>>>
>>>> On 24.10.2020 18:43, Jiri Olsa wrote:
>>>>> On Wed, Oct 21, 2020 at 07:07:00PM +0300, Alexey Budankov wrote:
>>>>>>
>>>>>> Introduce thread local variable and use it for threaded trace streaming.
>>>>>>
>>>>>> Signed-off-by: Alexey Budankov <[email protected]>
>>>>>> ---
>>>>>> tools/perf/builtin-record.c | 71 ++++++++++++++++++++++++++++++++-----
>>>>>> 1 file changed, 62 insertions(+), 9 deletions(-)
>>>>>>
>>>>>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>>>>>> index 89cb8e913fb3..3b7e9026f25b 100644
>>>>>> --- a/tools/perf/builtin-record.c
>>>>>> +++ b/tools/perf/builtin-record.c
>>>>>> @@ -101,6 +101,8 @@ struct thread_data {
>>>>>> u64 bytes_written;
>>>>>> };
>>>>>>
>>>>>> +static __thread struct thread_data *thread;
>>>>>> +
>>>>>> struct record {
>>>>>> struct perf_tool tool;
>>>>>> struct record_opts opts;
>>>>>> @@ -587,7 +589,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
>>>>>> }
>>>>>> }
>>>>>>
>>>>>> - rec->samples++;
>>>>>> + if (thread)
>>>>>> + thread->samples++;
>>>>>> + else
>>>>>> + rec->samples++;
>>>>>
>>>>> this is really wrong, let's keep just single samples counter
>>>>> ditto for all the other places in this patch
>>>>
>>>> This does look like data parallelism [1] which is very true for
>>>> threaded trace streaming so your prototype design looks optimal.
>>>>
>>>> For this specific place incrementing global counter in memory is
>>>> less performant and faces scalability limitations as a number of
>>>> cores grow.
>>>>
>>>> Not sure why you have changed your mind.
>>>
>>> I'm not sure I follow.. what I'm complaining about is to have
>>> 'samples' stat variable in separate locations for --threads
>>> and --no-threads mode
>>
>> It is optimal to have samples variable as per thread one
>> and then sum up the total in the end of data collection.
>>
>> Single global variable design has scalability and performance
>> drawbacks.
>>
>> Why do you complain about per thread variable in this case?
>> It looks like ideally fits these specific needs.
>
> I think there's misunderstanding.. I think we should move
> samples to per thread 'thread' object and have just one
> copy of that.. and do not increase separate variables for
> thread and non-thread cases
Ah, I see: the main thread uses the same __thread object in both
serial and threaded modes. That makes sense.
I will try this in v3.
Alexei
>
> jirka
>