2022-07-19 22:43:16

by Ian Rogers

[permalink] [raw]
Subject: [PATCH v3 1/3] perf: Align user space counter reading with code

Align the user space counter reading documentation with the code in
perf_mmap__read_self. Previously the documentation was based on the perf
rdpmc test, but now general purpose code is provided by libperf.

Signed-off-by: Ian Rogers <[email protected]>
---
include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
tools/include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
2 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d37629dbad72..6826dabb7e03 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -538,9 +538,13 @@ struct perf_event_mmap_page {
*
* if (pc->cap_usr_time && enabled != running) {
* cyc = rdtsc();
- * time_offset = pc->time_offset;
* time_mult = pc->time_mult;
* time_shift = pc->time_shift;
+ * time_offset = pc->time_offset;
+ * if (pc->cap_user_time_short) {
+ * time_cycles = pc->time_cycles;
+ * time_mask = pc->time_mask;
+ * }
* }
*
* index = pc->index;
@@ -548,6 +552,9 @@ struct perf_event_mmap_page {
* if (pc->cap_user_rdpmc && index) {
* width = pc->pmc_width;
* pmc = rdpmc(index - 1);
+ * pmc <<= 64 - width;
+ * pmc >>= 64 - width;
+ * count += pmc;
* }
*
* barrier();
@@ -590,25 +597,27 @@ struct perf_event_mmap_page {
* If cap_usr_time the below fields can be used to compute the time
* delta since time_enabled (in ns) using rdtsc or similar.
*
- * u64 quot, rem;
- * u64 delta;
- *
- * quot = (cyc >> time_shift);
- * rem = cyc & (((u64)1 << time_shift) - 1);
- * delta = time_offset + quot * time_mult +
- * ((rem * time_mult) >> time_shift);
+ * cyc = time_cycles + ((cyc - time_cycles) & time_mask);
+ * delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
*
* Where time_offset,time_mult,time_shift and cyc are read in the
- * seqcount loop described above. This delta can then be added to
- * enabled and possible running (if index), improving the scaling:
+ * seqcount loop described above. mul_u64_u32_shr will compute:
+ *
+ * (u64)(((unsigned __int128)cyc * time_mult) >> time_shift)
+ *
+ * This delta can then be added to enabled and possible running (if
+ * index) to improve the scaling. Due to event multiplexing, running
+ * may be zero and so care is needed to avoid division by zero.
*
* enabled += delta;
* if (index)
* running += delta;
*
- * quot = count / running;
- * rem = count % running;
- * count = quot * enabled + (rem * enabled) / running;
+ * if (running != 0) {
+ * quot = count / running;
+ * rem = count % running;
+ * count = quot * enabled + (rem * enabled) / running;
+ * }
*/
__u16 time_shift;
__u32 time_mult;
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index d37629dbad72..6826dabb7e03 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -538,9 +538,13 @@ struct perf_event_mmap_page {
*
* if (pc->cap_usr_time && enabled != running) {
* cyc = rdtsc();
- * time_offset = pc->time_offset;
* time_mult = pc->time_mult;
* time_shift = pc->time_shift;
+ * time_offset = pc->time_offset;
+ * if (pc->cap_user_time_short) {
+ * time_cycles = pc->time_cycles;
+ * time_mask = pc->time_mask;
+ * }
* }
*
* index = pc->index;
@@ -548,6 +552,9 @@ struct perf_event_mmap_page {
* if (pc->cap_user_rdpmc && index) {
* width = pc->pmc_width;
* pmc = rdpmc(index - 1);
+ * pmc <<= 64 - width;
+ * pmc >>= 64 - width;
+ * count += pmc;
* }
*
* barrier();
@@ -590,25 +597,27 @@ struct perf_event_mmap_page {
* If cap_usr_time the below fields can be used to compute the time
* delta since time_enabled (in ns) using rdtsc or similar.
*
- * u64 quot, rem;
- * u64 delta;
- *
- * quot = (cyc >> time_shift);
- * rem = cyc & (((u64)1 << time_shift) - 1);
- * delta = time_offset + quot * time_mult +
- * ((rem * time_mult) >> time_shift);
+ * cyc = time_cycles + ((cyc - time_cycles) & time_mask);
+ * delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
*
* Where time_offset,time_mult,time_shift and cyc are read in the
- * seqcount loop described above. This delta can then be added to
- * enabled and possible running (if index), improving the scaling:
+ * seqcount loop described above. mul_u64_u32_shr will compute:
+ *
+ * (u64)(((unsigned __int128)cyc * time_mult) >> time_shift)
+ *
+ * This delta can then be added to enabled and possible running (if
+ * index) to improve the scaling. Due to event multiplexing, running
+ * may be zero and so care is needed to avoid division by zero.
*
* enabled += delta;
* if (index)
* running += delta;
*
- * quot = count / running;
- * rem = count % running;
- * count = quot * enabled + (rem * enabled) / running;
+ * if (running != 0) {
+ * quot = count / running;
+ * rem = count % running;
+ * count = quot * enabled + (rem * enabled) / running;
+ * }
*/
__u16 time_shift;
__u32 time_mult;
--
2.37.0.170.g444d1eabd0-goog


2022-07-19 23:20:05

by Rob Herring (Arm)

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] perf: Align user space counter reading with code

On Tue, Jul 19, 2022 at 4:39 PM Ian Rogers <[email protected]> wrote:
>
> Align the user space counter reading documentation with the code in
> perf_mmap__read_self. Previously the documentation was based on the perf
> rdpmc test, but now general purpose code is provided by libperf.
>
> Signed-off-by: Ian Rogers <[email protected]>
> ---
> include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> tools/include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> 2 files changed, 44 insertions(+), 26 deletions(-)

Reviewed-by: Rob Herring <[email protected]>

2022-07-20 15:45:40

by Vince Weaver

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] perf: Align user space counter reading with code

On Tue, 19 Jul 2022, Ian Rogers wrote:

> Align the user space counter reading documentation with the code in
> perf_mmap__read_self. Previously the documentation was based on the perf
> rdpmc test, but now general purpose code is provided by libperf.
>
> Signed-off-by: Ian Rogers <[email protected]>
> ---
> include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> tools/include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> 2 files changed, 44 insertions(+), 26 deletions(-)
>
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index d37629dbad72..6826dabb7e03 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -538,9 +538,13 @@ struct perf_event_mmap_page {
> *
> * if (pc->cap_usr_time && enabled != running) {
> * cyc = rdtsc();
> - * time_offset = pc->time_offset;
> * time_mult = pc->time_mult;
> * time_shift = pc->time_shift;
> + * time_offset = pc->time_offset;
> + * if (pc->cap_user_time_short) {
> + * time_cycles = pc->time_cycles;
> + * time_mask = pc->time_mask;
> + * }

From what I've been told, and from what perf_mmap__read_self() actually
does, many of these MMAP fields need to be accessed by READ_ONCE()
(a GPLv2 only interface) to be correct.

Should we update perf_event.h to reflect this? Otherwise it's confusing
when the actual code and the documentation in the header don't match like
this. As an example, see the actual code snippets from
perf_mmap__read_self()

seq = READ_ONCE(pc->lock);
barrier();

count->ena = READ_ONCE(pc->time_enabled);
count->run = READ_ONCE(pc->time_running);

if (pc->cap_user_time && count->ena != count->run) {
cyc = read_timestamp();
time_mult = READ_ONCE(pc->time_mult);
time_shift = READ_ONCE(pc->time_shift);
time_offset = READ_ONCE(pc->time_offset);

if (pc->cap_user_time_short) {
time_cycles = READ_ONCE(pc->time_cycles);
time_mask = READ_ONCE(pc->time_mask);
}
}

idx = READ_ONCE(pc->index);
cnt = READ_ONCE(pc->offset);

...


Vince

2022-07-20 16:23:02

by Ian Rogers

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] perf: Align user space counter reading with code

On Wed, Jul 20, 2022 at 8:06 AM Vince Weaver <[email protected]> wrote:
>
> On Tue, 19 Jul 2022, Ian Rogers wrote:
>
> > Align the user space counter reading documentation with the code in
> > perf_mmap__read_self. Previously the documentation was based on the perf
> > rdpmc test, but now general purpose code is provided by libperf.
> >
> > Signed-off-by: Ian Rogers <[email protected]>
> > ---
> > include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> > tools/include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> > 2 files changed, 44 insertions(+), 26 deletions(-)
> >
> > diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> > index d37629dbad72..6826dabb7e03 100644
> > --- a/include/uapi/linux/perf_event.h
> > +++ b/include/uapi/linux/perf_event.h
> > @@ -538,9 +538,13 @@ struct perf_event_mmap_page {
> > *
> > * if (pc->cap_usr_time && enabled != running) {
> > * cyc = rdtsc();
> > - * time_offset = pc->time_offset;
> > * time_mult = pc->time_mult;
> > * time_shift = pc->time_shift;
> > + * time_offset = pc->time_offset;
> > + * if (pc->cap_user_time_short) {
> > + * time_cycles = pc->time_cycles;
> > + * time_mask = pc->time_mask;
> > + * }
>
> From what I've been told, and from what perf_mmap__read_self() actually
> does, many of these MMAP fields need to be accessed by READ_ONCE()
> (a GPLv2 only interface) to be correct.
>
> Should we update perf_event.h to reflect this? Otherwise it's confusing
> when the actual code and the documentation in the header don't match like
> this. As an example, see the actual code snippets from
> perf_mmap__read_self()
>
> seq = READ_ONCE(pc->lock);
> barrier();
>
> count->ena = READ_ONCE(pc->time_enabled);
> count->run = READ_ONCE(pc->time_running);
>
> if (pc->cap_user_time && count->ena != count->run) {
> cyc = read_timestamp();
> time_mult = READ_ONCE(pc->time_mult);
> time_shift = READ_ONCE(pc->time_shift);
> time_offset = READ_ONCE(pc->time_offset);
>
> if (pc->cap_user_time_short) {
> time_cycles = READ_ONCE(pc->time_cycles);
> time_mask = READ_ONCE(pc->time_mask);
> }
> }
>
> idx = READ_ONCE(pc->index);
> cnt = READ_ONCE(pc->offset);
>
> ...

Thanks! So I think what this patch proposes is an improvement,
specifically it aligns it better with the code and deals with the
divide by zero. I think what you're asking for makes sense but as you
point out READ_ONCE probably isn't the correct API for something
outside the kernel. Given the kernel is now C11:
https://lwn.net/Articles/885941/
This opens the possibility of using stdatomic.h, so perhaps we can
move these variables to more correct atomic types. So, I think we can
land this and worry about the atomic API in a follow up.

Thanks,
Ian

> Vince

2022-08-01 13:18:05

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] perf: Align user space counter reading with code

Em Tue, Jul 19, 2022 at 03:39:44PM -0700, Ian Rogers escreveu:
> Align the user space counter reading documentation with the code in
> perf_mmap__read_self. Previously the documentation was based on the perf
> rdpmc test, but now general purpose code is provided by libperf.

Peter, can you merge this so as not to make Linus raise eyebrows with me
processing things outside tools/perf/ when asking him to pull perf
userspace?

- Arnaldo

> Signed-off-by: Ian Rogers <[email protected]>
> ---
> include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> tools/include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> 2 files changed, 44 insertions(+), 26 deletions(-)
>
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index d37629dbad72..6826dabb7e03 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -538,9 +538,13 @@ struct perf_event_mmap_page {
> *
> * if (pc->cap_usr_time && enabled != running) {
> * cyc = rdtsc();
> - * time_offset = pc->time_offset;
> * time_mult = pc->time_mult;
> * time_shift = pc->time_shift;
> + * time_offset = pc->time_offset;
> + * if (pc->cap_user_time_short) {
> + * time_cycles = pc->time_cycles;
> + * time_mask = pc->time_mask;
> + * }
> * }
> *
> * index = pc->index;
> @@ -548,6 +552,9 @@ struct perf_event_mmap_page {
> * if (pc->cap_user_rdpmc && index) {
> * width = pc->pmc_width;
> * pmc = rdpmc(index - 1);
> + * pmc <<= 64 - width;
> + * pmc >>= 64 - width;
> + * count += pmc;
> * }
> *
> * barrier();
> @@ -590,25 +597,27 @@ struct perf_event_mmap_page {
> * If cap_usr_time the below fields can be used to compute the time
> * delta since time_enabled (in ns) using rdtsc or similar.
> *
> - * u64 quot, rem;
> - * u64 delta;
> - *
> - * quot = (cyc >> time_shift);
> - * rem = cyc & (((u64)1 << time_shift) - 1);
> - * delta = time_offset + quot * time_mult +
> - * ((rem * time_mult) >> time_shift);
> + * cyc = time_cycles + ((cyc - time_cycles) & time_mask);
> + * delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
> *
> * Where time_offset,time_mult,time_shift and cyc are read in the
> - * seqcount loop described above. This delta can then be added to
> - * enabled and possible running (if index), improving the scaling:
> + * seqcount loop described above. mul_u64_u32_shr will compute:
> + *
> + * (u64)(((unsigned __int128)cyc * time_mult) >> time_shift)
> + *
> + * This delta can then be added to enabled and possible running (if
> + * index) to improve the scaling. Due to event multiplexing, running
> + * may be zero and so care is needed to avoid division by zero.
> *
> * enabled += delta;
> * if (index)
> * running += delta;
> *
> - * quot = count / running;
> - * rem = count % running;
> - * count = quot * enabled + (rem * enabled) / running;
> + * if (running != 0) {
> + * quot = count / running;
> + * rem = count % running;
> + * count = quot * enabled + (rem * enabled) / running;
> + * }
> */
> __u16 time_shift;
> __u32 time_mult;
> diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
> index d37629dbad72..6826dabb7e03 100644
> --- a/tools/include/uapi/linux/perf_event.h
> +++ b/tools/include/uapi/linux/perf_event.h
> @@ -538,9 +538,13 @@ struct perf_event_mmap_page {
> *
> * if (pc->cap_usr_time && enabled != running) {
> * cyc = rdtsc();
> - * time_offset = pc->time_offset;
> * time_mult = pc->time_mult;
> * time_shift = pc->time_shift;
> + * time_offset = pc->time_offset;
> + * if (pc->cap_user_time_short) {
> + * time_cycles = pc->time_cycles;
> + * time_mask = pc->time_mask;
> + * }
> * }
> *
> * index = pc->index;
> @@ -548,6 +552,9 @@ struct perf_event_mmap_page {
> * if (pc->cap_user_rdpmc && index) {
> * width = pc->pmc_width;
> * pmc = rdpmc(index - 1);
> + * pmc <<= 64 - width;
> + * pmc >>= 64 - width;
> + * count += pmc;
> * }
> *
> * barrier();
> @@ -590,25 +597,27 @@ struct perf_event_mmap_page {
> * If cap_usr_time the below fields can be used to compute the time
> * delta since time_enabled (in ns) using rdtsc or similar.
> *
> - * u64 quot, rem;
> - * u64 delta;
> - *
> - * quot = (cyc >> time_shift);
> - * rem = cyc & (((u64)1 << time_shift) - 1);
> - * delta = time_offset + quot * time_mult +
> - * ((rem * time_mult) >> time_shift);
> + * cyc = time_cycles + ((cyc - time_cycles) & time_mask);
> + * delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
> *
> * Where time_offset,time_mult,time_shift and cyc are read in the
> - * seqcount loop described above. This delta can then be added to
> - * enabled and possible running (if index), improving the scaling:
> + * seqcount loop described above. mul_u64_u32_shr will compute:
> + *
> + * (u64)(((unsigned __int128)cyc * time_mult) >> time_shift)
> + *
> + * This delta can then be added to enabled and possible running (if
> + * index) to improve the scaling. Due to event multiplexing, running
> + * may be zero and so care is needed to avoid division by zero.
> *
> * enabled += delta;
> * if (index)
> * running += delta;
> *
> - * quot = count / running;
> - * rem = count % running;
> - * count = quot * enabled + (rem * enabled) / running;
> + * if (running != 0) {
> + * quot = count / running;
> + * rem = count % running;
> + * count = quot * enabled + (rem * enabled) / running;
> + * }
> */
> __u16 time_shift;
> __u32 time_mult;
> --
> 2.37.0.170.g444d1eabd0-goog

--

- Arnaldo

2022-08-04 09:17:31

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] perf: Align user space counter reading with code


* Vince Weaver <[email protected]> wrote:

> On Tue, 19 Jul 2022, Ian Rogers wrote:
>
> > Align the user space counter reading documentation with the code in
> > perf_mmap__read_self. Previously the documentation was based on the perf
> > rdpmc test, but now general purpose code is provided by libperf.
> >
> > Signed-off-by: Ian Rogers <[email protected]>
> > ---
> > include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> > tools/include/uapi/linux/perf_event.h | 35 +++++++++++++++++----------
> > 2 files changed, 44 insertions(+), 26 deletions(-)
> >
> > diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> > index d37629dbad72..6826dabb7e03 100644
> > --- a/include/uapi/linux/perf_event.h
> > +++ b/include/uapi/linux/perf_event.h
> > @@ -538,9 +538,13 @@ struct perf_event_mmap_page {
> > *
> > * if (pc->cap_usr_time && enabled != running) {
> > * cyc = rdtsc();
> > - * time_offset = pc->time_offset;
> > * time_mult = pc->time_mult;
> > * time_shift = pc->time_shift;
> > + * time_offset = pc->time_offset;
> > + * if (pc->cap_user_time_short) {
> > + * time_cycles = pc->time_cycles;
> > + * time_mask = pc->time_mask;
> > + * }
>
> From what I've been told, and from what perf_mmap__read_self() actually
> does, many of these MMAP fields need to be accessed by READ_ONCE()
> (a GPLv2 only interface) to be correct.

BTW., for this purpose I guess we could add a READ_ONCE() variant to perf
tooling that reimplements it with more relaxed licensing, so that the
headers & sample code can be included in GPL-incompatible projects?

READ_ONCE() isn't a super complicated primitive, and we've always been
permissive with simple primitives.

Thanks,

Ingo