2020-03-21 17:39:10

by Ian Rogers

[permalink] [raw]
Subject: [PATCH v2] perf test x86: address multiplexing in rdpmc test

Counters may be being used for pinned or other events which inhibit the
instruction counter in the test from being scheduled - time_enabled > 0
but time_running == 0. This causes the test to fail with division by 0.
Make time_running an out parameter of mmap_read_self and add a sleep loop
to ensure that the counter is running before computing the delta.

v2. Address review feedback from Peter Zijlstra.

Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/arch/x86/tests/rdpmc.c | 57 +++++++++++++++++++++++--------
1 file changed, 42 insertions(+), 15 deletions(-)

diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c
index 1ea916656a2d..64145d4e3d4d 100644
--- a/tools/perf/arch/x86/tests/rdpmc.c
+++ b/tools/perf/arch/x86/tests/rdpmc.c
@@ -34,20 +34,33 @@ static u64 rdtsc(void)
return low | ((u64)high) << 32;
}

-static u64 mmap_read_self(void *addr)
+/*
+ * Return a user rdpmc result. The kernel may be multiplexing events and so the
+ * result is scaled to the period of time the counter was enabled. The
+ * time_running out parameter is set to the amount of time the counter has been
+ * running. The result will be zero if time_running is zero.
+ */
+static u64 mmap_read_self(void *addr, u64 *running)
{
struct perf_event_mmap_page *pc = addr;
u32 seq, idx, time_mult = 0, time_shift = 0;
- u64 count, cyc = 0, time_offset = 0, enabled, running, delta;
+ u64 count, cyc = 0, time_offset = 0, enabled, delta;

do {
seq = pc->lock;
barrier();

enabled = pc->time_enabled;
- running = pc->time_running;
-
- if (enabled != running) {
+ *running = pc->time_running;
+
+ if (*running == 0) {
+ /*
+ * Counter won't have a value because, due to multiplexing,
+ * the event wasn't scheduled.
+ */
+ return 0;
+ }
+ if (enabled != *running) {
cyc = rdtsc();
time_mult = pc->time_mult;
time_shift = pc->time_shift;
@@ -62,7 +75,7 @@ static u64 mmap_read_self(void *addr)
barrier();
} while (pc->lock != seq);

- if (enabled != running) {
+ if (enabled != *running) {
u64 quot, rem;

quot = (cyc >> time_shift);
@@ -72,11 +85,11 @@ static u64 mmap_read_self(void *addr)

enabled += delta;
if (idx)
- running += delta;
+ *running += delta;

- quot = count / running;
- rem = count % running;
- count = quot * enabled + (rem * enabled) / running;
+ quot = count / *running;
+ rem = count % *running;
+ count = quot * enabled + (rem * enabled) / *running;
}

return count;
@@ -104,7 +117,7 @@ static int __test__rdpmc(void)
.config = PERF_COUNT_HW_INSTRUCTIONS,
.exclude_kernel = 1,
};
- u64 delta_sum = 0;
+ u64 delta_sum = 0, sleep_count = 0;
struct sigaction sa;
char sbuf[STRERR_BUFSIZE];

@@ -130,14 +143,23 @@ static int __test__rdpmc(void)
}

for (n = 0; n < 6; n++) {
- u64 stamp, now, delta;
-
- stamp = mmap_read_self(addr);
+ u64 stamp, now, delta, running;
+
+ for (;;) {
+ stamp = mmap_read_self(addr, &running);
+ if (running != 0)
+ break;
+ /* Try to wait for event to be running. */
+ sleep_count++;
+ if (sleep_count > 60)
+ goto out_never_run;
+ sleep(1);
+ }

for (i = 0; i < loops; i++)
tmp++;

- now = mmap_read_self(addr);
+ now = mmap_read_self(addr, &running);
loops *= 10;

delta = now - stamp;
@@ -155,6 +177,11 @@ static int __test__rdpmc(void)
return -1;

return 0;
+
+out_never_run:
+ close(fd);
+ pr_err("Event counter failed to be multiplexed in. Are higher priority counters being sampled by a different process?\n");
+ return -1;
}

int test__rdpmc(struct test *test __maybe_unused, int subtest __maybe_unused)
--
2.25.1.696.g5e7596f4ac-goog


2020-03-22 10:21:02

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] perf test x86: address multiplexing in rdpmc test

On Sat, Mar 21, 2020 at 10:37:10AM -0700, Ian Rogers wrote:

> +static u64 mmap_read_self(void *addr, u64 *running)
> {
> struct perf_event_mmap_page *pc = addr;
> u32 seq, idx, time_mult = 0, time_shift = 0;
> - u64 count, cyc = 0, time_offset = 0, enabled, running, delta;
> + u64 count, cyc = 0, time_offset = 0, enabled, delta;
>
> do {
> seq = pc->lock;
> barrier();
>
> enabled = pc->time_enabled;
> - running = pc->time_running;
> -
> - if (enabled != running) {
> + *running = pc->time_running;
> +
> + if (*running == 0) {
> + /*
> + * Counter won't have a value as due to multiplexing the
> + * event wasn't scheduled.
> + */
> + return 0;
> + }

I still think adding code for an error case here is a bad idea. And only
passing running as an argument is inconsistent.

Also, then I had a look at what the compiler made of that function and
cried.

Here's something a little better. Much of it copied from linux/math64.h
and asm/div64.h.

---
diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c
index 1ea916656a2d..386a6dacb21e 100644
--- a/tools/perf/arch/x86/tests/rdpmc.c
+++ b/tools/perf/arch/x86/tests/rdpmc.c
@@ -34,20 +34,98 @@ static u64 rdtsc(void)
return low | ((u64)high) << 32;
}

-static u64 mmap_read_self(void *addr)
+#ifdef __x86_64__
+static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
+{
+ u64 q;
+
+ asm ("mulq %2; divq %3" : "=a" (q)
+ : "a" (a), "rm" (b), "rm" (c)
+ : "rdx");
+
+ return q;
+}
+#define mul_u64_u64_div64 mul_u64_u64_div64
+#endif
+
+#ifdef __SIZEOF_INT128__
+
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+ return (u64)(((unsigned __int128)a * b) >> shift);
+}
+
+#ifndef mul_u64_u64_div64
+static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
+{
+ unsigned __int128 m = a;
+ m *= b;
+ return m / c;
+}
+#endif
+
+#else
+
+#ifdef __i386__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ u32 high, low;
+
+ asm ("mull %[b]" : "=a" (low), "=d" (high)
+ : [a] "a" (a), [b] "rm" (b) );
+
+ return low | ((u64)high) << 32;
+}
+#else
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ return (u64)a * b;
+}
+#endif
+
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+ u32 ah, al;
+ u64 ret;
+
+ al = a;
+ ah = a >> 32;
+
+ ret = mul_u32_u32(al, b) >> shift;
+ if (ah)
+ ret += mul_u32_u32(ah, b) << (32 - shift);
+
+ return ret;
+}
+
+#ifndef mul_u64_u64_div64
+static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
+{
+ u64 quot, rem;
+
+ quot = a / c;
+ rem = a % c;
+
+ return quot * b + (rem * b) / c;
+}
+#endif
+
+#endif
+
+static u64 mmap_read_self(void *addr, u64 *enabled, u64 *running)
{
struct perf_event_mmap_page *pc = addr;
u32 seq, idx, time_mult = 0, time_shift = 0;
- u64 count, cyc = 0, time_offset = 0, enabled, running, delta;
+ u64 count, cyc = 0, time_offset = 0;

do {
seq = pc->lock;
barrier();

- enabled = pc->time_enabled;
- running = pc->time_running;
+ *enabled = pc->time_enabled;
+ *running = pc->time_running;

- if (enabled != running) {
+ if (*enabled != *running) {
cyc = rdtsc();
time_mult = pc->time_mult;
time_shift = pc->time_shift;
@@ -62,21 +140,13 @@ static u64 mmap_read_self(void *addr)
barrier();
} while (pc->lock != seq);

- if (enabled != running) {
- u64 quot, rem;
-
- quot = (cyc >> time_shift);
- rem = cyc & (((u64)1 << time_shift) - 1);
- delta = time_offset + quot * time_mult +
- ((rem * time_mult) >> time_shift);
-
- enabled += delta;
+ if (*enabled != *running) {
+ u64 delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
+ *enabled += delta;
if (idx)
- running += delta;
+ *running += delta;

- quot = count / running;
- rem = count % running;
- count = quot * enabled + (rem * enabled) / running;
+ count = mul_u64_u64_div64(count, *enabled, *running);
}

return count;
@@ -130,14 +200,18 @@ static int __test__rdpmc(void)
}

for (n = 0; n < 6; n++) {
- u64 stamp, now, delta;
+ u64 stamp, now, delta, enabled, running;

- stamp = mmap_read_self(addr);
+ stamp = mmap_read_self(addr, &enabled, &running);

for (i = 0; i < loops; i++)
tmp++;

- now = mmap_read_self(addr);
+ now = mmap_read_self(addr, &enabled, &running);
+
+ if (enabled && !running)
+ goto out_error;
+
loops *= 10;

delta = now - stamp;
@@ -155,6 +229,11 @@ static int __test__rdpmc(void)
return -1;

return 0;
+
+out_error:
+ close(fd);
+ pr_err("counter never ran; you lose\n");
+ return -1;
}

int test__rdpmc(struct test *test __maybe_unused, int subtest __maybe_unused)

2020-03-22 23:19:43

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH v2] perf test x86: address multiplexing in rdpmc test

> Here's something a little better. Much of it copied from linux/math64.h
> and asm/div64.h.

Not sure what the point is of micro optimizing a unit test?

This is never run in production.

-Andi

2020-03-23 00:16:07

by Ian Rogers

[permalink] [raw]
Subject: Re: [PATCH v2] perf test x86: address multiplexing in rdpmc test

On Sun, Mar 22, 2020 at 3:18 AM Peter Zijlstra <[email protected]> wrote:
>
> On Sat, Mar 21, 2020 at 10:37:10AM -0700, Ian Rogers wrote:
>
> > +static u64 mmap_read_self(void *addr, u64 *running)
> > {
> > struct perf_event_mmap_page *pc = addr;
> > u32 seq, idx, time_mult = 0, time_shift = 0;
> > - u64 count, cyc = 0, time_offset = 0, enabled, running, delta;
> > + u64 count, cyc = 0, time_offset = 0, enabled, delta;
> >
> > do {
> > seq = pc->lock;
> > barrier();
> >
> > enabled = pc->time_enabled;
> > - running = pc->time_running;
> > -
> > - if (enabled != running) {
> > + *running = pc->time_running;
> > +
> > + if (*running == 0) {
> > + /*
> > + * Counter won't have a value as due to multiplexing the
> > + * event wasn't scheduled.
> > + */
> > + return 0;
> > + }
>
> I still think adding code for an error case here is a bad idea. And only
> passing running as an argument is inconsistent.
>
> Also, then I had a look at what the compiler made of that function and
> cried.
>
> Here's something a little better. Much of it copied from linux/math64.h
> and asm/div64.h.
>
> ---
> diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c
> index 1ea916656a2d..386a6dacb21e 100644
> --- a/tools/perf/arch/x86/tests/rdpmc.c
> +++ b/tools/perf/arch/x86/tests/rdpmc.c
> @@ -34,20 +34,98 @@ static u64 rdtsc(void)
> return low | ((u64)high) << 32;
> }
>
> -static u64 mmap_read_self(void *addr)
> +#ifdef __x86_64__
> +static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
> +{
> + u64 q;
> +
> + asm ("mulq %2; divq %3" : "=a" (q)
> + : "a" (a), "rm" (b), "rm" (c)
> + : "rdx");
> +
> + return q;
> +}
> +#define mul_u64_u64_div64 mul_u64_u64_div64
> +#endif
> +
> +#ifdef __SIZEOF_INT128__
> +
> +static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
> +{
> + return (u64)(((unsigned __int128)a * b) >> shift);
> +}
> +
> +#ifndef mul_u64_u64_div64
> +static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
> +{
> + unsigned __int128 m = a;
> + m *= b;
> + return m / c;
> +}
> +#endif
> +
> +#else
> +
> +#ifdef __i386__
> +static inline u64 mul_u32_u32(u32 a, u32 b)
> +{
> + u32 high, low;
> +
> + asm ("mull %[b]" : "=a" (low), "=d" (high)
> + : [a] "a" (a), [b] "rm" (b) );
> +
> + return low | ((u64)high) << 32;
> +}
> +#else
> +static inline u64 mul_u32_u32(u32 a, u32 b)
> +{
> + return (u64)a * b;
> +}
> +#endif
> +
> +static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
> +{
> + u32 ah, al;
> + u64 ret;
> +
> + al = a;
> + ah = a >> 32;
> +
> + ret = mul_u32_u32(al, b) >> shift;
> + if (ah)
> + ret += mul_u32_u32(ah, b) << (32 - shift);
> +
> + return ret;
> +}
> +
> +#ifndef mul_u64_u64_div64
> +static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
> +{
> + u64 quot, rem;
> +
> + quot = a / c;
> + rem = a % c;
> +
> + return quot * b + (rem * b) / c;
> +}
> +#endif
> +
> +#endif
> +
> +static u64 mmap_read_self(void *addr, u64 *enabled, u64 *running)
> {
> struct perf_event_mmap_page *pc = addr;
> u32 seq, idx, time_mult = 0, time_shift = 0;
> - u64 count, cyc = 0, time_offset = 0, enabled, running, delta;
> + u64 count, cyc = 0, time_offset = 0;
>
> do {
> seq = pc->lock;
> barrier();
>
> - enabled = pc->time_enabled;
> - running = pc->time_running;
> + *enabled = pc->time_enabled;
> + *running = pc->time_running;
>
> - if (enabled != running) {
> + if (*enabled != *running) {
> cyc = rdtsc();
> time_mult = pc->time_mult;
> time_shift = pc->time_shift;
> @@ -62,21 +140,13 @@ static u64 mmap_read_self(void *addr)
> barrier();
> } while (pc->lock != seq);
>
> - if (enabled != running) {
> - u64 quot, rem;
> -
> - quot = (cyc >> time_shift);
> - rem = cyc & (((u64)1 << time_shift) - 1);
> - delta = time_offset + quot * time_mult +
> - ((rem * time_mult) >> time_shift);
> -
> - enabled += delta;
> + if (*enabled != *running) {
> + u64 delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
> + *enabled += delta;
> if (idx)
> - running += delta;
> + *running += delta;
>
> - quot = count / running;
> - rem = count % running;
> - count = quot * enabled + (rem * enabled) / running;
> + count = mul_u64_u64_div64(count, *enabled, *running);

*running may be 0 here, because of multiplexing, and so it will yield a SIGFPE.

> }
>
> return count;
> @@ -130,14 +200,18 @@ static int __test__rdpmc(void)
> }
>
> for (n = 0; n < 6; n++) {
> - u64 stamp, now, delta;
> + u64 stamp, now, delta, enabled, running;
>
> - stamp = mmap_read_self(addr);
> + stamp = mmap_read_self(addr, &enabled, &running);
>
> for (i = 0; i < loops; i++)
> tmp++;
>
> - now = mmap_read_self(addr);
> + now = mmap_read_self(addr, &enabled, &running);
> +
> + if (enabled && !running)
> + goto out_error;
> +
> loops *= 10;
>
> delta = now - stamp;
> @@ -155,6 +229,11 @@ static int __test__rdpmc(void)
> return -1;
>
> return 0;
> +
> +out_error:
> + close(fd);
> + pr_err("counter never ran; you lose\n");
> + return -1;

I'd prefer to retry in this case as I'm not running the test on an
isolated machine. Perhaps -2 (TEST_SKIP) rather than -1 (TEST_FAIL),
some kind of resource not available error would be better.

Thanks,
Ian

> }
>
> int test__rdpmc(struct test *test __maybe_unused, int subtest __maybe_unused)
>

2020-03-23 12:30:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] perf test x86: address multiplexing in rdpmc test

On Sun, Mar 22, 2020 at 04:18:20PM -0700, Andi Kleen wrote:
> > Here's something a little better. Much of it copied from linux/math64.h
> > and asm/div64.h.
>
> Not sure what the point is of micro optimizing a unit test?
>
> This is never run in production.

People might use it as an example of how to use the stuff.. and then
copy horrendous crap code.