The code flow for the vclocks is convoluted as it requires the vclocks,
which can be invalidated separately from the vsyscall_gtod_data sequence,
to store that fact in a separate variable. That's inefficient.
Restructure the code so the vclock readout returns cycles and the
conversion to nanoseconds is handled at the call site.
If the clock gets invalidated or vclock is already VCLOCK_NONE, return
U64_MAX as the cycle value, which is invalid for all clocks, and leave the
sequence loop immediately in that case by calling the fallback function
directly.
This allows removing the gettimeofday fallback as it now uses the
clock_gettime() fallback and does the nanoseconds to microseconds
conversion in the same way as it does when the vclock is functional. It
does not make a difference whether the division by 1000 happens in the
kernel fallback or in userspace.
Generates way better code and gains a few cycles back.
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/entry/vdso/vclock_gettime.c | 81 +++++++++--------------------------
1 file changed, 21 insertions(+), 60 deletions(-)
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -48,16 +48,6 @@ notrace static long vdso_fallback_gettim
return ret;
}
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm("syscall" : "=a" (ret) :
- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
- return ret;
-}
-
-
#else
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
@@ -75,21 +65,6 @@ notrace static long vdso_fallback_gettim
return ret;
}
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm(
- "mov %%ebx, %%edx \n"
- "mov %2, %%ebx \n"
- "call __kernel_vsyscall \n"
- "mov %%edx, %%ebx \n"
- : "=a" (ret)
- : "0" (__NR_gettimeofday), "g" (tv), "c" (tz)
- : "memory", "edx");
- return ret;
-}
-
#endif
#ifdef CONFIG_PARAVIRT_CLOCK
@@ -98,7 +73,7 @@ static notrace const struct pvclock_vsys
return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
}
-static notrace u64 vread_pvclock(int *mode)
+static notrace u64 vread_pvclock(void)
{
const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
u64 ret;
@@ -130,10 +105,8 @@ static notrace u64 vread_pvclock(int *mo
do {
version = pvclock_read_begin(pvti);
- if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
- *mode = VCLOCK_NONE;
- return 0;
- }
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
+ return U64_MAX;
ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
@@ -148,17 +121,12 @@ static notrace u64 vread_pvclock(int *mo
}
#endif
#ifdef CONFIG_HYPERV_TSCPAGE
-static notrace u64 vread_hvclock(int *mode)
+static notrace u64 vread_hvclock(void)
{
const struct ms_hyperv_tsc_page *tsc_pg =
(const struct ms_hyperv_tsc_page *)&hvclock_page;
- u64 current_tick = hv_read_tsc_page(tsc_pg);
-
- if (current_tick != U64_MAX)
- return current_tick;
- *mode = VCLOCK_NONE;
- return 0;
+ return hv_read_tsc_page(tsc_pg);
}
#endif
@@ -182,47 +150,42 @@ notrace static u64 vread_tsc(void)
return last;
}
-notrace static inline u64 vgetsns(int *mode)
+notrace static inline u64 vgetcyc(int mode)
{
- u64 v;
- cycles_t cycles;
-
- if (gtod->vclock_mode == VCLOCK_TSC)
- cycles = vread_tsc();
+ if (mode == VCLOCK_TSC)
+ return vread_tsc();
#ifdef CONFIG_PARAVIRT_CLOCK
- else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
- cycles = vread_pvclock(mode);
+ else if (mode == VCLOCK_PVCLOCK)
+ return vread_pvclock();
#endif
#ifdef CONFIG_HYPERV_TSCPAGE
- else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
- cycles = vread_hvclock(mode);
+ else if (mode == VCLOCK_HVCLOCK)
+ return vread_hvclock();
#endif
- else
- return 0;
- v = cycles - gtod->cycle_last;
- return v * gtod->mult;
+ return U64_MAX;
}
notrace static int do_hres(clockid_t clk, struct timespec *ts)
{
struct vgtod_ts *base = &gtod->basetime[clk];
unsigned int seq;
- int mode;
- u64 ns;
+ u64 cycles, ns;
do {
seq = gtod_read_begin(gtod);
- mode = gtod->vclock_mode;
ts->tv_sec = base->sec;
ns = base->nsec;
- ns += vgetsns(&mode);
+ cycles = vgetcyc(gtod->vclock_mode);
+ if (unlikely((s64)cycles < 0))
+ return vdso_fallback_gettime(clk, ts);
+ ns += (cycles - gtod->cycle_last) * gtod->mult;
ns >>= gtod->shift;
} while (unlikely(gtod_read_retry(gtod, seq)));
ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
- return mode;
+ return 0;
}
notrace static void do_coarse(clockid_t clk, struct timespec *ts)
@@ -251,8 +214,7 @@ notrace int __vdso_clock_gettime(clockid
*/
msk = 1U << clock;
if (likely(msk & VGTOD_HRES)) {
- if (do_hres(clock, ts) != VCLOCK_NONE)
- return 0;
+ return do_hres(clock, ts);
} else if (msk & VGTOD_COARSE) {
do_coarse(clock, ts);
return 0;
@@ -268,8 +230,7 @@ notrace int __vdso_gettimeofday(struct t
if (likely(tv != NULL)) {
struct timespec *ts = (struct timespec *) tv;
- if (unlikely(do_hres(CLOCK_REALTIME, ts) == VCLOCK_NONE))
- return vdso_fallback_gtod(tv, tz);
+ do_hres(CLOCK_REALTIME, ts);
tv->tv_usec /= 1000;
}
if (unlikely(tz != NULL)) {
On Fri, Sep 14, 2018 at 5:50 AM, Thomas Gleixner <[email protected]> wrote:
> The code flow for the vclocks is convoluted as it requires the vclocks
> which can be invalidated separately from the vsyscall_gtod_data sequence to
> store the fact in a separate variable. That's inefficient.
>
> notrace static int do_hres(clockid_t clk, struct timespec *ts)
> {
> struct vgtod_ts *base = &gtod->basetime[clk];
> unsigned int seq;
> - int mode;
> - u64 ns;
> + u64 cycles, ns;
>
> do {
> seq = gtod_read_begin(gtod);
> - mode = gtod->vclock_mode;
> ts->tv_sec = base->sec;
> ns = base->nsec;
> - ns += vgetsns(&mode);
> + cycles = vgetcyc(gtod->vclock_mode);
> + if (unlikely((s64)cycles < 0))
> + return vdso_fallback_gettime(clk, ts);
I was contemplating this, and I would suggest one of two optimizations:
1. have all the helpers return a struct containing a u64 and a bool,
and use that bool (rough sketch below). The compiler should do okay.
2. Be sneaky. Later in the series, you do:
if (unlikely((s64)cycles < 0))
return vdso_fallback_gettime(clk, ts);
- ns += (cycles - gtod->cycle_last) * gtod->mult;
+ if (cycles > last)
+ ns += (cycles - last) * gtod->mult;
How about:
if (unlikely((s64)cycles <= last)) {
if (cycles < 0) [or cycles == -1 or whatever]
return vdso_fallback_gettime;
} else {
ns += (cycles - last) * gtod->mult;
}
which should actually make this whole mess be essentially free.
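FWIW, a completely untested sketch of what option 1 could look like (the
struct name and the .ok field are just made up for illustration):

    struct cyc_result {
            u64     cycles;
            bool    ok;
    };

    notrace static struct cyc_result vread_hvclock(void)
    {
            const struct ms_hyperv_tsc_page *tsc_pg =
                    (const struct ms_hyperv_tsc_page *)&hvclock_page;
            struct cyc_result r;

            r.cycles = hv_read_tsc_page(tsc_pg);
            r.ok = r.cycles != U64_MAX;
            return r;
    }

    /* vread_tsc()/vread_pvclock() and vgetcyc() analogous; in do_hres(): */

            struct cyc_result r = vgetcyc(gtod->vclock_mode);

            if (unlikely(!r.ok))
                    return vdso_fallback_gettime(clk, ts);
            ns += (r.cycles - gtod->cycle_last) * gtod->mult;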
Also, I'm not entirely convinced that this "last" thing is needed at
all. John, what's the scenario under which we need it?
--Andy
On Mon, Sep 17, 2018 at 12:25 PM, Andy Lutomirski <[email protected]> wrote:
> On Fri, Sep 14, 2018 at 5:50 AM, Thomas Gleixner <[email protected]> wrote:
>> The code flow for the vclocks is convoluted as it requires the vclocks
>> which can be invalidated separately from the vsyscall_gtod_data sequence to
>> store the fact in a separate variable. That's inefficient.
>>
>
>> notrace static int do_hres(clockid_t clk, struct timespec *ts)
>> {
>> struct vgtod_ts *base = &gtod->basetime[clk];
>> unsigned int seq;
>> - int mode;
>> - u64 ns;
>> + u64 cycles, ns;
>>
>> do {
>> seq = gtod_read_begin(gtod);
>> - mode = gtod->vclock_mode;
>> ts->tv_sec = base->sec;
>> ns = base->nsec;
>> - ns += vgetsns(&mode);
>> + cycles = vgetcyc(gtod->vclock_mode);
>> + if (unlikely((s64)cycles < 0))
>> + return vdso_fallback_gettime(clk, ts);
>
> I was contemplating this, and I would suggest one of two optimizations:
>
> 1. have all the helpers return a struct containing a u64 and a bool,
> and use that bool. The compiler should do okay.
>
> 2. Be sneaky. Later in the series, you do:
>
> if (unlikely((s64)cycles < 0))
> return vdso_fallback_gettime(clk, ts);
> - ns += (cycles - gtod->cycle_last) * gtod->mult;
> + if (cycles > last)
> + ns += (cycles - last) * gtod->mult;
>
> How about:
>
> if (unlikely((s64)cycles <= last)) {
> if (cycles < 0) [or cycles == -1 or whatever]
> return vdso_fallback_gettime;
> } else {
> ns += (cycles - last) * gtod->mult;
> }
>
> which should actually make this whole mess be essentially free.
>
> Also, I'm not entirely convinced that this "last" thing is needed at
> all. John, what's the scenario under which we need it?
So my memory is probably a bit foggy, but I recall that as we
accelerated gettimeofday, we found that even on systems that claimed
to have synced TSCs, they were actually just slightly out of sync.
Enough that right after cycles_last had been updated, a read on
another cpu could come in just behind cycles_last, resulting in a
negative interval causing lots of havoc.
So the sanity check is needed to avoid that case.
thanks
-john
On Mon, 17 Sep 2018, John Stultz wrote:
> On Mon, Sep 17, 2018 at 12:25 PM, Andy Lutomirski <[email protected]> wrote:
> > Also, I'm not entirely convinced that this "last" thing is needed at
> > all. John, what's the scenario under which we need it?
>
> So my memory is probably a bit foggy, but I recall that as we
> accelerated gettimeofday, we found that even on systems that claimed
> to have synced TSCs, they were actually just slightly out of sync.
> Enough that right after cycles_last had been updated, a read on
> another cpu could come in just behind cycles_last, resulting in a
> negative interval causing lots of havoc.
>
> So the sanity check is needed to avoid that case.
Your memory serves you right. That's indeed observable on CPUs which
lack TSC_ADJUST.
@Andy: Welcome to the wonderful world of TSC.
Thanks,
tglx
On Tue, Sep 18, 2018 at 09:52:26AM +0200, Thomas Gleixner wrote:
> On Mon, 17 Sep 2018, John Stultz wrote:
> > On Mon, Sep 17, 2018 at 12:25 PM, Andy Lutomirski <[email protected]> wrote:
> > > Also, I'm not entirely convinced that this "last" thing is needed at
> > > all. John, what's the scenario under which we need it?
> >
> > So my memory is probably a bit foggy, but I recall that as we
> > accelerated gettimeofday, we found that even on systems that claimed
> > to have synced TSCs, they were actually just slightly out of sync.
> > Enough that right after cycles_last had been updated, a read on
> > another cpu could come in just behind cycles_last, resulting in a
> > negative interval causing lots of havoc.
> >
> > So the sanity check is needed to avoid that case.
>
> Your memory serves you right. That's indeed observable on CPUs which
> lack TSC_ADJUST.
But, if the gtod code can observe this, then why doesn't the code that
checks the sync?
On Tue, 18 Sep 2018, Peter Zijlstra wrote:
> On Tue, Sep 18, 2018 at 09:52:26AM +0200, Thomas Gleixner wrote:
> > On Mon, 17 Sep 2018, John Stultz wrote:
> > > On Mon, Sep 17, 2018 at 12:25 PM, Andy Lutomirski <[email protected]> wrote:
> > > > Also, I'm not entirely convinced that this "last" thing is needed at
> > > > all. John, what's the scenario under which we need it?
> > >
> > > So my memory is probably a bit foggy, but I recall that as we
> > > accelerated gettimeofday, we found that even on systems that claimed
> > > to have synced TSCs, they were actually just slightly out of sync.
> > > Enough that right after cycles_last had been updated, a read on
> > > another cpu could come in just behind cycles_last, resulting in a
> > > negative interval causing lots of havoc.
> > >
> > > So the sanity check is needed to avoid that case.
> >
> > Your memory serves you right. That's indeed observable on CPUs which
> > lack TSC_ADJUST.
>
> But, if the gtod code can observe this, then why doesn't the code that
> checks the sync?
Because it depends where the involved CPUs are in the topology. The sync
code might just run on the same package and simply not see it. Yes, w/o
TSC_ADJUST the TSC sync code can just fail to see the havoc.
Thanks,
tglx
On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> On Tue, 18 Sep 2018, Peter Zijlstra wrote:
> > > Your memory serves you right. That's indeed observable on CPUs which
> > > lack TSC_ADJUST.
> >
> > But, if the gtod code can observe this, then why doesn't the code that
> > checks the sync?
>
> Because it depends where the involved CPUs are in the topology. The sync
> code might just run on the same package and simply not see it. Yes, w/o
> TSC_ADJUST the TSC sync code can just fail to see the havoc.
Even with TSC adjust the TSC can be slightly off by design on multi-socket
systems.
Thanks,
tglx
On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> > On Tue, 18 Sep 2018, Peter Zijlstra wrote:
> > > > Your memory serves you right. That's indeed observable on CPUs which
> > > > lack TSC_ADJUST.
> > >
> > > But, if the gtod code can observe this, then why doesn't the code that
> > > checks the sync?
> >
> > Because it depends where the involved CPUs are in the topology. The sync
> > code might just run on the same package and simply not see it. Yes, w/o
> > TSC_ADJUST the TSC sync code can just fail to see the havoc.
>
> Even with TSC adjust the TSC can be slightly off by design on multi-socket
> systems.
Here are the gory details:
https://lore.kernel.org/lkml/[email protected]/
The changelog has an explanation as well.
d8bb6f4c1670 ("x86: tsc prevent time going backwards")
I still have one of the machines which is affected by this.
Thanks,
tglx
On Tue, Sep 18, 2018 at 12:41:57PM +0200, Thomas Gleixner wrote:
> On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> > On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> > > On Tue, 18 Sep 2018, Peter Zijlstra wrote:
> > > > > Your memory serves you right. That's indeed observable on CPUs which
> > > > > lack TSC_ADJUST.
> > > >
> > > > But, if the gtod code can observe this, then why doesn't the code that
> > > > checks the sync?
> > >
> > > Because it depends where the involved CPUs are in the topology. The sync
> > > code might just run on the same package and simply not see it. Yes, w/o
> > > TSC_ADJUST the TSC sync code can just fail to see the havoc.
> >
> > Even with TSC adjust the TSC can be slightly off by design on multi-socket
> > systems.
>
> Here are the gory details:
>
> https://lore.kernel.org/lkml/[email protected]/
>
> The changelog has an explanation as well.
>
> d8bb6f4c1670 ("x86: tsc prevent time going backwards")
>
> I still have one of the machines which is affected by this.
Are we sure this isn't a load vs rdtsc reorder? Because if I look at the
current code:
notrace static u64 vread_tsc(void)
{
u64 ret = (u64)rdtsc_ordered();
u64 last = gtod->cycle_last;
if (likely(ret >= last))
return ret;
/*
* GCC likes to generate cmov here, but this branch is extremely
* predictable (it's just a function of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
* ever gets inlined it will generate worse code.
*/
asm volatile ("");
return last;
}
That does:
lfence
rdtsc
load gtod->cycle_last
Which obviously allows us to observe a cycles_last that is later than
the rdtsc itself, and thus time can trivially go backwards.
The new code:
last = gtod->cycle_last;
cycles = vgetcyc(gtod->vclock_mode);
if (unlikely((s64)cycles < 0))
return vdso_fallback_gettime(clk, ts);
if (cycles > last)
ns += (cycles - last) * gtod->mult;
looks like:
load gtod->cycle_last
lfence
rdtsc
which avoids that possibility: the cycle_last load must have completed
before the rdtsc.
On Tue, 18 Sep 2018, Peter Zijlstra wrote:
> On Tue, Sep 18, 2018 at 12:41:57PM +0200, Thomas Gleixner wrote:
> > I still have one of the machines which is affected by this.
>
> Are we sure this isn't a load vs rdtsc reorder? Because if I look at the
> current code:
The load order of last vs. rdtsc does not matter at all.
CPU0 CPU1
....
now0 = rdtsc_ordered();
...
tk->cycle_last = now0;
gtod->seq++;
gtod->cycle_last = tk->cycle_last;
...
gtod->seq++;
seq_begin(gtod->seq);
now1 = rdtsc_ordered();
So if the TSC on CPU1 is slightly behind the TSC on CPU0 then now1 can be
smaller than cycle_last. The TSC sync stuff does not catch the small delta
for unknown raisins. I'll go and find that machine and test that again.
Thanks,
tglx
On Tue, Sep 18, 2018 at 03:23:08PM +0200, Thomas Gleixner wrote:
> On Tue, 18 Sep 2018, Peter Zijlstra wrote:
> > On Tue, Sep 18, 2018 at 12:41:57PM +0200, Thomas Gleixner wrote:
> > > I still have one of the machines which is affected by this.
> >
> > Are we sure this isn't a load vs rdtsc reorder? Because if I look at the
> > current code:
>
> The load order of last vs. rdtsc does not matter at all.
>
> CPU0 CPU1
>
> ....
> now0 = rdtsc_ordered();
> ...
> tk->cycle_last = now0;
>
> gtod->seq++;
> gtod->cycle_last = tk->cycle_last;
> ...
> gtod->seq++;
> seq_begin(gtod->seq);
> now1 = rdtsc_ordered();
>
> So if the TSC on CPU1 is slightly behind the TSC on CPU0 then now1 can be
> smaller than cycle_last. The TSC sync stuff does not catch the small delta
> for unknown raisins. I'll go and find that machine and test that again.
Yeah, somehow I forgot about rseq.. maybe I should go sleep or
something.
> On Sep 18, 2018, at 12:52 AM, Thomas Gleixner <[email protected]> wrote:
>
>> On Mon, 17 Sep 2018, John Stultz wrote:
>>> On Mon, Sep 17, 2018 at 12:25 PM, Andy Lutomirski <[email protected]> wrote:
>>> Also, I'm not entirely convinced that this "last" thing is needed at
>>> all. John, what's the scenario under which we need it?
>>
>> So my memory is probably a bit foggy, but I recall that as we
>> accelerated gettimeofday, we found that even on systems that claimed
>> to have synced TSCs, they were actually just slightly out of sync.
>> Enough that right after cycles_last had been updated, a read on
>> another cpu could come in just behind cycles_last, resulting in a
>> negative interval causing lots of havoc.
>>
>> So the sanity check is needed to avoid that case.
>
> Your memory serves you right. That's indeed observable on CPUs which
> lack TSC_ADJUST.
>
> @Andy: Welcome to the wonderful world of TSC.
>
Do we do better if we use signed arithmetic for the whole calculation? Then a small backwards movement would result in a small backwards result. Or we could offset everything so that we’d have to go back several hundred ms before we cross zero.
On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> So if the TSC on CPU1 is slightly behind the TSC on CPU0 then now1 can be
> smaller than cycle_last. The TSC sync stuff does not catch the small delta
> for unknown raisins. I'll go and find that machine and test that again.
Of course it does not trigger anymore. We accumulated code between the
point in timekeeping_advance() where the TSC is read and the update of the
VDSO data.
I might have to get a 2.6ish kernel booted on that machine and try with
that again. /me shudders
Thanks,
tglx
On Tue, 18 Sep 2018, Andy Lutomirski wrote:
> > On Sep 18, 2018, at 12:52 AM, Thomas Gleixner <[email protected]> wrote:
> >
> >> On Mon, 17 Sep 2018, John Stultz wrote:
> >>> On Mon, Sep 17, 2018 at 12:25 PM, Andy Lutomirski <[email protected]> wrote:
> >>> Also, I'm not entirely convinced that this "last" thing is needed at
> >>> all. John, what's the scenario under which we need it?
> >>
> >> So my memory is probably a bit foggy, but I recall that as we
> >> accelerated gettimeofday, we found that even on systems that claimed
> >> to have synced TSCs, they were actually just slightly out of sync.
> >> Enough that right after cycles_last had been updated, a read on
> >> another cpu could come in just behind cycles_last, resulting in a
> >> negative interval causing lots of havoc.
> >>
> >> So the sanity check is needed to avoid that case.
> >
> > Your memory serves you right. That's indeed observable on CPUs which
> > lack TSC_ADJUST.
> >
> > @Andy: Welcome to the wonderful world of TSC.
> >
>
> Do we do better if we use signed arithmetic for the whole calculation?
> Then a small backwards movement would result in a small backwards result.
> Or we could offset everything so that we’d have to go back several
> hundred ms before we cross zero.
That would be probably the better solution as signed math would be
problematic when the resulting ns value becomes negative. As the delta is
really small, otherwise the TSC sync check would have caught it, the caller
should never be able to observe time going backwards.
I'll have a look into that. It needs some thought vs. the fractional part
of the base time, but it should be not rocket science to get that
correct. Famous last words...
Thanks,
tglx
> On Sep 18, 2018, at 3:46 PM, Thomas Gleixner <[email protected]> wrote:
>
> On Tue, 18 Sep 2018, Andy Lutomirski wrote:
>>> On Sep 18, 2018, at 12:52 AM, Thomas Gleixner <[email protected]> wrote:
>>>
>>>>> On Mon, 17 Sep 2018, John Stultz wrote:
>>>>> On Mon, Sep 17, 2018 at 12:25 PM, Andy Lutomirski <[email protected]> wrote:
>>>>> Also, I'm not entirely convinced that this "last" thing is needed at
>>>>> all. John, what's the scenario under which we need it?
>>>>
>>>> So my memory is probably a bit foggy, but I recall that as we
>>>> accelerated gettimeofday, we found that even on systems that claimed
>>>> to have synced TSCs, they were actually just slightly out of sync.
>>>> Enough that right after cycles_last had been updated, a read on
>>>> another cpu could come in just behind cycles_last, resulting in a
>>>> negative interval causing lots of havoc.
>>>>
>>>> So the sanity check is needed to avoid that case.
>>>
>>> Your memory serves you right. That's indeed observable on CPUs which
>>> lack TSC_ADJUST.
>>>
>>> @Andy: Welcome to the wonderful world of TSC.
>>>
>>
>> Do we do better if we use signed arithmetic for the whole calculation?
>> Then a small backwards movement would result in a small backwards result.
>> Or we could offset everything so that we’d have to go back several
>> hundred ms before we cross zero.
>
> That would be probably the better solution as signed math would be
> problematic when the resulting ns value becomes negative. As the delta is
> really small, otherwise the TSC sync check would have caught it, the caller
> should never be able to observe time going backwards.
>
> I'll have a look into that. It needs some thought vs. the fractional part
> of the base time, but it should be not rocket science to get that
> correct. Famous last words...
>
It’s also fiddly to tune. If you offset it too much, then the fancy divide-by-repeated-subtraction loop will hurt more than the comparison to last.
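For reference, the loop in question is __iter_div_u64_rem() at the end of
do_hres(); it boils down to roughly one subtraction per full second
contained in ns, so pushing ns up by a large offset directly buys extra
iterations. Simplified model:

    /* rough model of ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns) */
    u32 secs = 0;

    while (ns >= NSEC_PER_SEC) {
            ns -= NSEC_PER_SEC;
            secs++;
    }
    ts->tv_sec += secs;
    ts->tv_nsec = ns;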
On Tue, 18 Sep 2018, Andy Lutomirski wrote:
> > On Sep 18, 2018, at 3:46 PM, Thomas Gleixner <[email protected]> wrote:
> > On Tue, 18 Sep 2018, Andy Lutomirski wrote:
> >> Do we do better if we use signed arithmetic for the whole calculation?
> >> Then a small backwards movement would result in a small backwards result.
> >> Or we could offset everything so that we’d have to go back several
> >> hundred ms before we cross zero.
> >
> > That would be probably the better solution as signed math would be
> > problematic when the resulting ns value becomes negative. As the delta is
> > really small, otherwise the TSC sync check would have caught it, the caller
> > should never be able to observe time going backwards.
> >
> > I'll have a look into that. It needs some thought vs. the fractional part
> > of the base time, but it should be not rocket science to get that
> > correct. Famous last words...
> >
>
> It’s also fiddly to tune. If you offset it too much, then the fancy
> divide-by-repeated-subtraction loop will hurt more than the comparison to
> last.
Not really. It's sufficient to offset it by at max. 1000 cycles or so. That
won't hurt the magic loop, but it will definitely cover that slight offset
case.
Thanks,
tglx
On 2018-09-19 00:46, Thomas Gleixner wrote:
> On Tue, 18 Sep 2018, Andy Lutomirski wrote:
>>>
>>
>> Do we do better if we use signed arithmetic for the whole calculation?
>> Then a small backwards movement would result in a small backwards result.
>> Or we could offset everything so that we’d have to go back several
>> hundred ms before we cross zero.
>
> That would be probably the better solution as signed math would be
> problematic when the resulting ns value becomes negative. As the delta is
> really small, otherwise the TSC sync check would have caught it, the caller
> should never be able to observe time going backwards.
>
> I'll have a look into that. It needs some thought vs. the fractional part
> of the base time, but it should be not rocket science to get that
> correct. Famous last words...
Does the sentinel need to be U64_MAX? What if vgetcyc and its minions
returned gtod->cycle_last-1 (for some value of 1), and the caller just
does "if ((s64)cycles - (s64)last < 0) return fallback; ns +=
(cycles-last)* ...". That should just be a "sub ; js ; ". It's an extra
load of ->cycle_last, but only on the path where we're heading for the
fallback anyway. The value of 1 can be adjusted so that in the "js"
path, we could detect and accept an rdtsc_ordered() call that's just a
few 10s of cycles behind last and treat that as 0 and continue back on
the normal path. But maybe it's hard to get gcc to generate the expected
code.
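I.e., roughly (untested):

    cycles = vgetcyc(gtod->vclock_mode);	/* cycle_last - 1 on failure */
    if ((s64)cycles - (s64)last < 0)
            return vdso_fallback_gettime(clk, ts);
    ns += (cycles - last) * gtod->mult;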
Rasmus
On Wed, 19 Sep 2018, Rasmus Villemoes wrote:
> On 2018-09-19 00:46, Thomas Gleixner wrote:
> > On Tue, 18 Sep 2018, Andy Lutomirski wrote:
> >>>
> >>
> >> Do we do better if we use signed arithmetic for the whole calculation?
> >> Then a small backwards movement would result in a small backwards result.
> >> Or we could offset everything so that we’d have to go back several
> >> hundred ms before we cross zero.
> >
> > That would be probably the better solution as signed math would be
> > problematic when the resulting ns value becomes negative. As the delta is
> > really small, otherwise the TSC sync check would have caught it, the caller
> > should never be able to observe time going backwards.
> >
> > I'll have a look into that. It needs some thought vs. the fractional part
> > of the base time, but it should be not rocket science to get that
> > correct. Famous last words...
>
> Does the sentinel need to be U64_MAX? What if vgetcyc and its minions
> returned gtod->cycle_last-1 (for some value of 1), and the caller just
> does "if ((s64)cycles - (s64)last < 0) return fallback; ns +=
> (cycles-last)* ...". That should just be a "sub ; js ; ". It's an extra
> load of ->cycle_last, but only on the path where we're heading for the
> fallback anyway. The value of 1 can be adjusted so that in the "js"
> path, we could detect and accept an rdtsc_ordered() call that's just a
> few 10s of cycles behind last and treat that as 0 and continue back on
> the normal path. But maybe it's hard to get gcc to generate the expected
> code.
I played around with a lot of variants and GCC generates all kinds of
interesting ASM. And at some point optimizing that math code is not buying
anything because the LFENCE before RDTSC is dominating all of it.
Thanks,
tglx
On Wed, 19 Sep 2018, Thomas Gleixner wrote:
> On Tue, 18 Sep 2018, Andy Lutomirski wrote:
> > > On Sep 18, 2018, at 3:46 PM, Thomas Gleixner <[email protected]> wrote:
> > > On Tue, 18 Sep 2018, Andy Lutomirski wrote:
> > >> Do we do better if we use signed arithmetic for the whole calculation?
> > >> Then a small backwards movement would result in a small backwards result.
> > >> Or we could offset everything so that we’d have to go back several
> > >> hundred ms before we cross zero.
> > >
> > > That would be probably the better solution as signed math would be
> > > problematic when the resulting ns value becomes negative. As the delta is
> > > really small, otherwise the TSC sync check would have caught it, the caller
> > > should never be able to observe time going backwards.
> > >
> > > I'll have a look into that. It needs some thought vs. the fractional part
> > > of the base time, but it should be not rocket science to get that
> > > correct. Famous last words...
> > >
> >
> > It’s also fiddly to tune. If you offset it too much, then the fancy
> > divide-by-repeated-subtraction loop will hurt more than the comparison to
> > last.
>
> Not really. It's sufficient to offset it by at max. 1000 cycles or so. That
> won't hurt the magic loop, but it will definitely cover that slight offset
> case.
I got it working, but first of all the gain is close to 0.
There is this other subtle issue that we've seen TSCs slowly drifting apart
which is caught by the TSC watchdog eventually, but if it exceeds the offset
_before_ the watchdog triggers, we're back to square one.
So I rather stay on the safe side and just accept that we have to deal with
that. Sigh.
Thanks,
tglx
> On Sep 27, 2018, at 7:36 AM, Thomas Gleixner <[email protected]> wrote:
>
>> On Wed, 19 Sep 2018, Thomas Gleixner wrote:
>> On Tue, 18 Sep 2018, Andy Lutomirski wrote:
>>>> On Sep 18, 2018, at 3:46 PM, Thomas Gleixner <[email protected]> wrote:
>>>>> On Tue, 18 Sep 2018, Andy Lutomirski wrote:
>>>>> Do we do better if we use signed arithmetic for the whole calculation?
>>>>> Then a small backwards movement would result in a small backwards result.
>>>>> Or we could offset everything so that we’d have to go back several
>>>>> hundred ms before we cross zero.
>>>>
>>>> That would be probably the better solution as signed math would be
>>>> problematic when the resulting ns value becomes negative. As the delta is
>>>> really small, otherwise the TSC sync check would have caught it, the caller
>>>> should never be able to observe time going backwards.
>>>>
>>>> I'll have a look into that. It needs some thought vs. the fractional part
>>>> of the base time, but it should be not rocket science to get that
>>>> correct. Famous last words...
>>>>
>>>
>>> It’s also fiddly to tune. If you offset it too much, then the fancy
>>> divide-by-repeated-subtraction loop will hurt more than the comparison to
>>> last.
>>
>> Not really. It's sufficient to offset it by at max. 1000 cycles or so. That
>> won't hurt the magic loop, but it will definitely cover that slight offset
>> case.
>
> I got it working, but first of all the gain is close to 0.
>
> There is this other subtle issue that we've seen TSCs slowly drifting apart
> which is caught by the TSC watchdog eventually, but if it exceeds the offset
> _before_ the watchdog triggers, we're back to square one.
>
> So I rather stay on the safe side and just accept that we have to deal with
> that. Sigh.
>
>
Seems okay to me. Oh well.
On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> On Tue, 18 Sep 2018, Thomas Gleixner wrote:
> > So if the TSC on CPU1 is slightly behind the TSC on CPU0 then now1 can be
> > smaller than cycle_last. The TSC sync stuff does not catch the small delta
> > for unknown raisins. I'll go and find that machine and test that again.
>
> Of course it does not trigger anymore. We accumulated code between the
> point in timekeeping_advance() where the TSC is read and the update of the
> VDSO data.
>
> I might have to get a 2.6ish kernel booted on that machine and try with
> that again. /me shudders
Actually it does happen, because the TSCs are very slowly drifting apart due
to SMI wreckage trying to hide itself. It just takes a very long time.
Thanks,
tglx