LinuxLists.cc - [PATCH v4.15.7 1/1] x86/vdso: handle clock_gettime(CLOCK_MONOTONIC

2018-03-07 00:30:57

Subject: [PATCH v4.15.7 1/1] x86/vdso: handle clock_gettime(CLOCK_MONOTONIC_RAW, &ts) in VDSO

Handling clock_gettime( CLOCK_MONOTONIC_RAW, &timespec)
by calling vdso_fallback_gettime(), ie. syscall, is too slow -
latencies of 300-700ns are common on Haswell (06:3C) CPUs .

This patch against the 4.15.7 stable branch makes the VDSO handle
clock_gettime(CLOCK_GETTIME_RAW, &ts)
by issuing rdtscp in userspace, IFF the clock source is the TSC, and converting
it to nanoseconds using the vsyscall_gtod_data 'mult' and 'shift' fields :

volatile u32 tsc_lo, tsc_hi, tsc_cpu;
asm volatile( "rdtscp" : (=a) tsc_lo, (=d) tsc_hi, (=c) tsc_cpu );
u64 tsc = (((u64)tsc_hi)<<32) | ((u64)tsc_lo);
tsc *= gtod->mult;
tsc >>=gtod->shift;
/* tsc is now number of nanoseconds */
ts->tv_sec = __iter_div_u64_rem( tsc, NSEC_PER_SEC, &ts->tv_nsec);

Use of the "open coded asm" style here actually forces the compiler to
always choose the 32-bit version of rdtscp, which sets only %eax,
%edx, and %ecx and does not clear the high bits of %rax, %rdx, and
%rdx , because the
variables are declared 32-bit - so the same 32-bit version is used whether
the code is compiled with -m32 or -m64 ( tested using gcc 5.4.0, gcc 6.4.1 ) .

The full story and test programs are in Bug #198961 :
https://bugzilla.kernel.org/show_bug.cgi?id=198961
.

The patched VDSO now handles clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
on the same machine with a latency (minimum time that can be measured)
of
around 100ns (compared with 300-700ns before patch).

I also think it makes sense to expose pointers to the live, updated
gtod->mult and gtod->shift values somehow to userspace . Then
a userspace TSC reader could re-use previous values to avoid
the long-division in most cases and obtain latencies of 10-20ns .

Hence there is now a new method in the VDSO:
__ vdso_linux_tsc_calibration()
which returns a pointer to a 'struct linux_tsc_calibration'
declared in a new header
arch/x86/include/uapi/asm/vdso_tsc_calibration.h

If the clock source is NOT the TSC, this function returns NULL .
The pointer is only valid when the system clock source is the TSC .
User-space TSC readers can detect when TSC is modified with Events,
and now can detect when clock source changes from / to TSC with
this function .

The patch :

---
diff --git a/arch/x86/entry/vdso/vclock_gettime.c \
b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..e840600 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -21,6 +21,7 @@
#include <linux/math64.h>
#include <linux/time.h>
#include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>

#define gtod (&VVAR(vsyscall_gtod_data))

@@ -246,6 +247,29 @@ notrace static int __always_inline do_monotonic\
(struct timespec *ts)
return mode;
}

+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{
+ volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs
generated for 64-bit as for 32-bit builds
+ u64 ns;
+ register u64 tsc=0;
+ if (gtod->vclock_mode == VCLOCK_TSC)
+ {
+ asm volatile
+ ( "rdtscp"
+ : "=a" (tsc_lo)
+ , "=d" (tsc_hi)
+ , "=c" (tsc_cpu)
+ ); // : eax, edx, ecx used - NOT rax, rdx, rcx
+ tsc = ((((u64)tsc_hi) & 0xffffffffUL) << 32) |
(((u64)tsc_lo) & 0xffffffffUL);
+ tsc *= gtod->mult;
+ tsc >>= gtod->shift;
+ ts->tv_sec = __iter_div_u64_rem(tsc, NSEC_PER_SEC,
&ns);
+ ts->tv_nsec = ns;
+ return VCLOCK_TSC;
+ }
+ return VCLOCK_NONE;
+}
+
notrace static void do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
@@ -277,6 +301,10 @@ notrace int __vdso_clock_gettime(clockid_t clock,
struct timespec *ts)
if (do_monotonic(ts) == VCLOCK_NONE)
goto fallback;
break;
+ case CLOCK_MONOTONIC_RAW:
+ if (do_monotonic_raw(ts) == VCLOCK_NONE)
+ goto fallback;
+ break;
case CLOCK_REALTIME_COARSE:
do_realtime_coarse(ts);
break;
@@ -326,3 +354,18 @@ notrace time_t __vdso_time(time_t *t)
}
time_t time(time_t *t)
__attribute__((weak, alias("__vdso_time")));
+
+extern const struct linux_tsc_calibration *
+ __vdso_linux_tsc_calibration(void);
+
+notrace const struct linux_tsc_calibration *
+ __vdso_linux_tsc_calibration(void)
+{
+ if( gtod->vclock_mode == VCLOCK_TSC )
+ return ((const struct linux_tsc_calibration*) &gtod->mult);
+ return 0UL;
+}
+
+const struct linux_tsc_calibration * linux_tsc_calibration(void)
+ __attribute((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..41a2ca5 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -24,7 +24,9 @@ VERSION {
getcpu;
__vdso_getcpu;
time;
- __vdso_time;
+ __vdso_time;
+ linux_tsc_calibration;
+ __vdso_linux_tsc_calibration;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..d53bd73 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,7 +25,8 @@ VERSION
global:
__vdso_clock_gettime;
__vdso_gettimeofday;
- __vdso_time;
+ __vdso_time;
+ __vdso_linux_tsc_calibration;
};

LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S
b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..fb13b16 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -20,7 +20,8 @@ VERSION {
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_getcpu;
- __vdso_time;
+ __vdso_time;
+ __vdso_linux_tsc_calibration;
local: *;
};
}
diff --git a/arch/x86/include/uapi/asm/vdso_tsc_calibration.h \
b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
new file mode 100644
index 0000000..6febb84
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/*
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ * __vdso_linux_tsc_calibration(void);
+ * ( one has to resolve this symbol as in
+ * tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which returns a pointer into the read-only
+ * vvar_page (the vsyscall_gtod_data structure),
+ * with the following layout :
+ */
+
+struct linux_tsc_calibration
+{
+ unsigned int mult; /* amount to multiply 64-bit TSC value by */
+
+ unsigned int shift; /* the right shift to apply to
(mult*TSC) yielding \
nanoseconds */
+};
+
+/* To use:
+ *
+ * static const struct lnx_tsc_calibration_s*
+ * (*linux_tsc_cal)(void) = vdso_sym("LINUX_2.6",
"__vdso_linux_tsc_calibration");
+ * if( linux_tsc_cal == 0UL )
+ * { fprintf(stderr,"the patch providing __vdso_linux_tsc_calibration is not \
applied to the kernel.\n");
+ * return ERROR;
+ * }
+ * static const struct lnx_tsc_calibration_s *clock_source =
(*linux_tsc_cal)();
+ * if( clock_source == ((void*)0UL))
+ * fprintf(stderr,"TSC is not the system clocksource.\n");
+ * volatile unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ * asm volatile
+ * ( "rdtscp" : (=a) tsc_hi, (=d) tsc_lo, (=c) tsc_cpu );
+ * unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ * unsigned long nanoseconds =
+ * (( clock_source -> mult ) * tsc ) >> (clock_source -> shift);
+ *
+ * nanoseconds is now TSC value converted to nanoseconds,
+ * according to Linux' clocksource calibration values.
+ *
+ */
+
+#endif
---

Attachments:

vdso_vclock_gettime_userspace_tsc_4.15.7.patch (5.47 kB)

2018-03-08 14:09:04

by Thomas Gleixner

[permalink] [raw]

Subject: Re: [PATCH v4.15.7 1/1] x86/vdso: handle clock_gettime(CLOCK_MONOTONIC_RAW, &ts) in VDSO

On Tue, 6 Mar 2018, Jason Vas Dias wrote:

Please don't submit patches against randomly chosen stable kernel
versions. Development happens on top of Linus tree or of a subsystem
specific tree.

> +notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
> +{
> + volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs
> generated for 64-bit as for 32-bit builds

volatile is wrong to begin with. And tail comments are horrible.

> + u64 ns;
> + register u64 tsc=0;

Lacks a new line between variable declaration and code. Documentation
surely told you that you should run checkpatch.pl on your patch....

> + if (gtod->vclock_mode == VCLOCK_TSC)
> + {

Opening bracket goes in the same line as the condition.

> + asm volatile
> + ( "rdtscp"
> + : "=a" (tsc_lo)
> + , "=d" (tsc_hi)
> + , "=c" (tsc_cpu)
> + ); // : eax, edx, ecx used - NOT rax, rdx, rcx

I surely asked for using the proper accessor for a reason.

> + tsc = ((((u64)tsc_hi) & 0xffffffffUL) << 32) |
> (((u64)tsc_lo) & 0xffffffffUL);
> + tsc *= gtod->mult;
> + tsc >>= gtod->shift;

And again this is using the _adjusted_ mult/shift values. So the result is
not CLOCK_MONOTONIC_RAW. Aside of that the mult shift values are accessed
completely unprotected against concurrent changes.

Vs. the multiplication: Any RDTSC value which results in a 64bit
multiplication overflow will result in crap output. That is bound to happen
after a certain amount of uptime....

> + ts->tv_sec = __iter_div_u64_rem(tsc, NSEC_PER_SEC,
> &ns);

And this falls apart once tsc >= (1 << 32) simply because
__iter_div_u64_rem() returns the integer divison result as u32 ...

Aside of that for large values of @tsc this will result in a loop repeating
exactly int(tsc / NSEC_PER_SEC) times. That's surely fast for anything > 1.

Just for the record: It's well documented that
clock_gettime(CLOCK_MONOTONIC_RAW) returns monotonically increasing time
stamps based on the raw hardware clock.

While your implementation might be way faster than the syscall based
implementation it is not providing the same data and semantics.

Performance benchmark results are only interesting when the underlying
implementation is correct. Sorry, but it wouldn't have been rocket science
to figure out that the result wraps around after 4.29497 seconds....

tools/testing/selftests/timers/inconsistency-check

would have certainly detected this, but any serious test program which does
not only do blindly performance benchmarking would have noticed as well.

> + ts->tv_nsec = ns;
> + return VCLOCK_TSC;
> + }
> + return VCLOCK_NONE;
> +}
> +
> notrace static void do_realtime_coarse(struct timespec *ts)
> {
> unsigned long seq;
> @@ -277,6 +301,10 @@ notrace int __vdso_clock_gettime(clockid_t clock,
> struct timespec *ts)
> if (do_monotonic(ts) == VCLOCK_NONE)
> goto fallback;
> break;
> + case CLOCK_MONOTONIC_RAW:
> + if (do_monotonic_raw(ts) == VCLOCK_NONE)
> + goto fallback;
> + break;
> case CLOCK_REALTIME_COARSE:
> do_realtime_coarse(ts);
> break;
> @@ -326,3 +354,18 @@ notrace time_t __vdso_time(time_t *t)
> }
> time_t time(time_t *t)
> __attribute__((weak, alias("__vdso_time")));
> +
> +extern const struct linux_tsc_calibration *
> + __vdso_linux_tsc_calibration(void);

If at all then this stuff wants to be in a separate patch because it has
absolutely nothing to do with the VDSO based CLOCK_MONOTONIC_RAW accessor.

Documentation is clear about that as well:

The point to remember is that each patch should make an easily understood
change that can be verified by reviewers. Each patch should be
justifiable on its own merits.

> +notrace const struct linux_tsc_calibration *
> + __vdso_linux_tsc_calibration(void)
> +{
> + if( gtod->vclock_mode == VCLOCK_TSC )
> + return ((const struct linux_tsc_calibration*) &gtod->mult);

Returning a pointer into a part of another data struture is completely
wrong. The data structure can change and as a result the pointer will point
at even more bogus data as it does anyway.

> + return 0UL;

0UL is hardly a valid return value for a function returning a pointer.

> +}
> +
> +const struct linux_tsc_calibration * linux_tsc_calibration(void)
> + __attribute((weak, alias("__vdso_linux_tsc_calibration")));
> +
> diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
> index d3a2dce..41a2ca5 100644
> --- a/arch/x86/entry/vdso/vdso.lds.S
> +++ b/arch/x86/entry/vdso/vdso.lds.S
> @@ -24,7 +24,9 @@ VERSION {
> getcpu;
> __vdso_getcpu;
> time;
> - __vdso_time;
> + __vdso_time;

You are using spaces not tabs and that line is not supposed to be changed
anyway.

> + linux_tsc_calibration;
> + __vdso_linux_tsc_calibration;

More whitespace damage.

> diff --git a/arch/x86/include/uapi/asm/vdso_tsc_calibration.h \
> b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
> new file mode 100644
> index 0000000..6febb84
> --- /dev/null
> +++ b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
> @@ -0,0 +1,47 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
> +#define _ASM_X86_VDSO_TSC_CALIBRATION_H
> +/*
> + * Programs that want to use rdtsc / rdtscp instructions
> + * from user-space can make use of the Linux kernel TSC calibration
> + * by calling :
> + * __vdso_linux_tsc_calibration(void);
> + * ( one has to resolve this symbol as in
> + * tools/testing/selftests/vDSO/parse_vdso.c
> + * )
> + * which returns a pointer into the read-only
> + * vvar_page (the vsyscall_gtod_data structure),
> + * with the following layout :

Again. This returns a pointer to a data structure which is completely
decoupled from vsyscall_gtod_data and it just happens to work right now by
some definition of works.

> + */
> +
> +struct linux_tsc_calibration
> +{
> + unsigned int mult; /* amount to multiply 64-bit TSC value by */
> +
> + unsigned int shift; /* the right shift to apply to
> (mult*TSC) yielding \
> nanoseconds */

If you want to document a data structure then please use the proper
kerneldoc comment on top of it.

Thanks,

tglx