Currently the VDSO does not handle
clock_gettime( CLOCK_MONOTONIC_RAW, &ts )
on Intel / AMD - it calls
vdso_fallback_gettime()
for this clock, which issues a syscall, having an unacceptably high
latency (minimum measurable time or time between measurements)
of 300-700ns on two 2.8-3.9GHz Haswell x86_64 (Family_Model 06_3C)
machines under various versions of Linux.
This patch handles CLOCK_MONOTONIC_RAW clock_gettime() in the VDSO,
by exporting the raw clock calibration, last cycles, last xtime_nsec,
and last raw_sec value in the vsyscall_gtod_data during update_vsyscall().
Now the new do_monotonic_raw() function in the vDSO has a latency of about 24ns
on average, and the test program:
tools/testing/selftests/timers/inconsistency-check.c
succeeds with arguments: '-c 4 -t 120' or any arbitrary -t value.
The patch is against Linus' latest 4.16-rc4 tree,
current HEAD of :
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
.
The patch affects only files:
arch/x86/include/asm/vgtod.h
arch/x86/entry/vdso/vclock_gettime.c
arch/x86/entry/vsyscall/vsyscall_gtod.c
Best Regards,
Jason Vas Dias .
---
diff -up linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc4 linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c
--- linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc4 2018-03-04 22:54:11.000000000 +0000
+++ linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c 2018-03-11 05:08:31.137681337 +0000
@@ -182,6 +182,29 @@ notrace static u64 vread_tsc(void)
return last;
}
+notrace static u64 vread_tsc_raw(void)
+{
+ u64 tsc, last=gtod->raw_cycle_last;
+ if( likely( gtod->has_rdtscp ) ) {
+ u32 tsc_lo, tsc_hi,
+ tsc_cpu __attribute__((unused));
+ asm volatile
+ ( "rdtscp"
+ /* ^- has built-in cancellation point / pipeline stall "barrier" */
+ : "=a" (tsc_lo)
+ , "=d" (tsc_hi)
+ , "=c" (tsc_cpu)
+ ); // since all variables 32-bit, eax, edx, ecx used - NOT rax, rdx, rcx
+ tsc = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+ } else {
+ tsc = rdtsc_ordered();
+ }
+ if (likely(tsc >= last))
+ return tsc;
+ asm volatile ("");
+ return last;
+}
+
notrace static inline u64 vgetsns(int *mode)
{
u64 v;
@@ -203,6 +226,27 @@ notrace static inline u64 vgetsns(int *m
return v * gtod->mult;
}
+notrace static inline u64 vgetsns_raw(int *mode)
+{
+ u64 v;
+ cycles_t cycles;
+
+ if (gtod->vclock_mode == VCLOCK_TSC)
+ cycles = vread_tsc_raw();
+#ifdef CONFIG_PARAVIRT_CLOCK
+ else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
+ cycles = vread_pvclock(mode);
+#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+ else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
+ cycles = vread_hvclock(mode);
+#endif
+ else
+ return 0;
+ v = (cycles - gtod->raw_cycle_last) & gtod->raw_mask;
+ return v * gtod->raw_mult;
+}
+
/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
notrace static int __always_inline do_realtime(struct timespec *ts)
{
@@ -246,6 +290,27 @@ notrace static int __always_inline do_mo
return mode;
}
+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{
+ unsigned long seq;
+ u64 ns;
+ int mode;
+
+ do {
+ seq = gtod_read_begin(gtod);
+ mode = gtod->vclock_mode;
+ ts->tv_sec = gtod->monotonic_time_raw_sec;
+ ns = gtod->monotonic_time_raw_nsec;
+ ns += vgetsns_raw(&mode);
+ ns >>= gtod->raw_shift;
+ } while (unlikely(gtod_read_retry(gtod, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
notrace static void do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
@@ -277,6 +342,10 @@ notrace int __vdso_clock_gettime(clockid
if (do_monotonic(ts) == VCLOCK_NONE)
goto fallback;
break;
+ case CLOCK_MONOTONIC_RAW:
+ if (do_monotonic_raw(ts) == VCLOCK_NONE)
+ goto fallback;
+ break;
case CLOCK_REALTIME_COARSE:
do_realtime_coarse(ts);
break;
diff -up linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc4 linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c
--- linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc4 2018-03-04 22:54:11.000000000 +0000
+++ linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c 2018-03-11 05:10:57.371197747 +0000
@@ -16,6 +16,7 @@
#include <linux/timekeeper_internal.h>
#include <asm/vgtod.h>
#include <asm/vvar.h>
+#include <asm/cpufeature.h>
int vclocks_used __read_mostly;
@@ -45,6 +46,12 @@ void update_vsyscall(struct timekeeper *
vdata->mult = tk->tkr_mono.mult;
vdata->shift = tk->tkr_mono.shift;
+ vdata->raw_cycle_last = tk->tkr_raw.cycle_last;
+ vdata->raw_mask = tk->tkr_raw.mask;
+ vdata->raw_mult = tk->tkr_raw.mult;
+ vdata->raw_shift = tk->tkr_raw.shift;
+ vdata->has_rdtscp = static_cpu_has(X86_FEATURE_RDTSCP);
+
vdata->wall_time_sec = tk->xtime_sec;
vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
@@ -74,5 +81,8 @@ void update_vsyscall(struct timekeeper *
vdata->monotonic_time_coarse_sec++;
}
+ vdata->monotonic_time_raw_sec = tk->raw_sec;
+ vdata->monotonic_time_raw_nsec = tk->tkr_raw.xtime_nsec;
+
gtod_write_end(vdata);
}
diff -up linux-4.16-rc4/arch/x86/include/asm/vgtod.h.4.16-rc4 linux-4.16-rc4/arch/x86/include/asm/vgtod.h
--- linux-4.16-rc4/arch/x86/include/asm/vgtod.h.4.16-rc4 2018-03-04 22:54:11.000000000 +0000
+++ linux-4.16-rc4/arch/x86/include/asm/vgtod.h 2018-03-11 05:12:35.916338703 +0000
@@ -22,6 +22,11 @@ struct vsyscall_gtod_data {
u64 mask;
u32 mult;
u32 shift;
+ u64 raw_cycle_last;
+ u64 raw_mask;
+ u32 raw_mult;
+ u32 raw_shift;
+ u32 has_rdtscp;
/* open coded 'struct timespec' */
u64 wall_time_snsec;
@@ -32,6 +37,8 @@ struct vsyscall_gtod_data {
gtod_long_t wall_time_coarse_nsec;
gtod_long_t monotonic_time_coarse_sec;
gtod_long_t monotonic_time_coarse_nsec;
+ gtod_long_t monotonic_time_raw_sec;
+ gtod_long_t monotonic_time_raw_nsec;
int tz_minuteswest;
int tz_dsttime;
---
On Sun, 11 Mar 2018, Jason Vas Dias wrote:
This looks better now. Though running that patch through checkpatch.pl
results in:
total: 28 errors, 20 warnings, 139 lines checked
....
> +notrace static u64 vread_tsc_raw(void)
Why do you need a separate function? I asked you to use vread_tsc(). So you
might have reasons for doing that, but please then explain WHY and not just
throw the stuff in my direction w/o any comment.
> +{
> + u64 tsc, last=gtod->raw_cycle_last;
> + if( likely( gtod->has_rdtscp ) ) {
> + u32 tsc_lo, tsc_hi,
> + tsc_cpu __attribute__((unused));
> + asm volatile
> + ( "rdtscp"
> + /* ^- has built-in cancellation point / pipeline stall "barrier" */
> + : "=a" (tsc_lo)
> + , "=d" (tsc_hi)
> + , "=c" (tsc_cpu)
> + ); // since all variables 32-bit, eax, edx, ecx used - NOT rax, rdx, rcx
> + tsc = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
This is not required to make the vdso accessor for monotonic raw work.
If at all then the rdtscp support wants to be in a separate patch with a
proper explanation.
Aside of that the code for rdtscp wants to be in a proper inline helper in
the relevant header file and written according to the coding style the
kernel uses for asm inlines.
> + } else {
> + tsc = rdtsc_ordered();
> + }
> + if (likely(tsc >= last))
> + return tsc;
> + asm volatile ("");
> + return last;
> +}
The rest looks ok.
Thanks,
tglx
Thanks Thomas -
On 11/03/2018, Thomas Gleixner <[email protected]> wrote:
> On Sun, 11 Mar 2018, Jason Vas Dias wrote:
>
> This looks better now. Though running that patch through checkpatch.pl
> results in:
>
> total: 28 errors, 20 warnings, 139 lines checked
>
Hmm, I was unaware of that script, I'll run and find out why -
probably because whitespace is not visible in emacs with
my monospace font and it is very difficult to see if tabs
are used if somehow a '\t\ ' or ' \t' has slipped in .
I'll run the script, fix the errors, and repost.
> ....
>
>> +notrace static u64 vread_tsc_raw(void)
>
> Why do you need a separate function? I asked you to use vread_tsc(). So you
> might have reasons for doing that, but please then explain WHY and not just
> throw the stuff in my direction w/o any comment.
>
mainly, because vread_tsc() makes its comparison against gtod->cycles_last ,
a copy of tk->tkr_mono.cycle_last, while vread_tsc_raw() uses
gtod->raw_cycle_last, a copy of tk->tkr_raw.cycle_last .
And rdtscp has a built-in "barrier", as the comments explain, making
rdtsc_ordered()'s 'barrier()' unnecessary .
>> +{
>> + u64 tsc, last=gtod->raw_cycle_last;
>> + if( likely( gtod->has_rdtscp ) ) {
>> + u32 tsc_lo, tsc_hi,
>> + tsc_cpu __attribute__((unused));
>> + asm volatile
>> + ( "rdtscp"
>> + /* ^- has built-in cancellation point / pipeline stall
>> "barrier" */
>> + : "=a" (tsc_lo)
>> + , "=d" (tsc_hi)
>> + , "=c" (tsc_cpu)
>> + ); // since all variables 32-bit, eax, edx, ecx used -
>> NOT rax, rdx, rcx
>> + tsc = ((((u64)tsc_hi) & 0xffffffffUL) << 32) |
>> (((u64)tsc_lo) & 0xffffffffUL);
>
> This is not required to make the vdso accessor for monotonic raw work.
>
> If at all then the rdtscp support wants to be in a separate patch with a
> proper explanation.
>
> Aside of that the code for rdtscp wants to be in a proper inline helper in
> the relevant header file and written according to the coding style the
> kernel uses for asm inlines.
>
Sorry, I will put the function in the same header as rdtsc_ordered () ,
in a separate patch.
> The rest looks ok.
>
> Thanks,
>
> tglx
>
I'll re-generate patches and resend .
A complete patch , against 4.15.9, is attached , that I am using ,
including a suggested '__vdso_linux_tsc_calibration()'
function and arch/x86/include/uapi/asm/vdso_tsc_calibration.h file
that does not return any pointers into the VDSO .
Presuming this was split into separate patches as you suggest,
and was against the latest HEAD branch (4.16-rcX), would it be OK to
include the vdso_linux_tsc_calibration() work ?
It does enable user space code to develop accurate TSC readers
which are free to use different structures and pico-second resolution.
The actual user-space clock_gettime(CLOCK_MONOTONIC_RAW)
replacement I am using for work just reads the TSC , with a latency of
< 8ns, and uses the linux_tsc_calibration to convert using
floating-point as required.
Thanks & Regards,
Jason
On Sun, 11 Mar 2018, Jason Vas Dias wrote:
> On 11/03/2018, Thomas Gleixner <[email protected]> wrote:
> > On Sun, 11 Mar 2018, Jason Vas Dias wrote:
> >
> > This looks better now. Though running that patch through checkpatch.pl
> > results in:
> >
> > total: 28 errors, 20 warnings, 139 lines checked
> >
>
> Hmm, I was unaware of that script, I'll run and find out why -
Documentation/process/* which I recommended for reading before definitely
mentions it.
> probably because whitespace is not visible in emacs with
> my monospace font and it is very difficult to see if tabs
> are used if somehow a '\t\ ' or ' \t' has slipped in .
It's not only about whitespace ....
> >> +notrace static u64 vread_tsc_raw(void)
> >
> > Why do you need a separate function? I asked you to use vread_tsc(). So you
> > might have reasons for doing that, but please then explain WHY and not just
> > throw the stuff in my direction w/o any comment.
> >
>
> mainly, because vread_tsc() makes its comparison against gtod->cycles_last ,
> a copy of tk->tkr_mono.cycle_last, while vread_tsc_raw() uses
> gtod->raw_cycle_last, a copy of tk->tkr_raw.cycle_last .
Well, if you look at the timekeeping core, then you'll notice that it's
actually the same value. It's updated in both tkr_mono and tkr_raw, so it's
available when either of the tkr_* structs is handed to the relevant
functions.
> A complete patch , against 4.15.9, is attached , that I am using ,
> including a suggested '__vdso_linux_tsc_calibration()'
> function and arch/x86/include/uapi/asm/vdso_tsc_calibration.h file
> that does not return any pointers into the VDSO .
I'm not going to look at attached complete patches. The development process
is well defined for a reason and it's not optional.
Thanks,
tglx