Handle accurate time even if there's a long delay between
accumulated clock cycles.
Signed-off-by: John Stultz <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86/kernel/vsyscall_64.c | 5 ++-
include/asm-x86/vgtod.h | 2 -
include/linux/clocksource.h | 58 ++++++++++++++++++++++++++++++++++++++++--
kernel/time/timekeeping.c | 35 +++++++++++++------------
4 files changed, 80 insertions(+), 20 deletions(-)
linux-2.6.21-rc5_cycles-accumulated_C7.patch
============================================
Index: linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-compile-i386.git.orig/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:10:20.000000000 -0500
+++ linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:17:53.000000000 -0500
@@ -86,6 +86,7 @@ void update_vsyscall(struct timespec *wa
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = clock->mult;
vsyscall_gtod_data.clock.shift = clock->shift;
+ vsyscall_gtod_data.clock.cycle_accumulated = clock->cycle_accumulated;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
@@ -121,7 +122,7 @@ static __always_inline long time_syscall
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
- cycle_t now, base, mask, cycle_delta;
+ cycle_t now, base, accumulated, mask, cycle_delta;
unsigned seq;
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
@@ -135,6 +136,7 @@ static __always_inline void do_vgettimeo
}
now = vread();
base = __vsyscall_gtod_data.clock.cycle_last;
+ accumulated = __vsyscall_gtod_data.clock.cycle_accumulated;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
shift = __vsyscall_gtod_data.clock.shift;
@@ -145,6 +147,7 @@ static __always_inline void do_vgettimeo
/* calculate interval: */
cycle_delta = (now - base) & mask;
+ cycle_delta += accumulated;
/* convert to nsecs: */
nsec += (cycle_delta * mult) >> shift;
Index: linux-compile-i386.git/include/linux/clocksource.h
===================================================================
--- linux-compile-i386.git.orig/include/linux/clocksource.h 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/include/linux/clocksource.h 2008-01-09 15:17:33.000000000 -0500
@@ -50,8 +50,12 @@ struct clocksource;
* @flags: flags describing special properties
* @vread: vsyscall based read
* @resume: resume function for the clocksource, if necessary
+ * @cycle_last: Used internally by timekeeping core, please ignore.
+ * @cycle_accumulated: Used internally by timekeeping core, please ignore.
* @cycle_interval: Used internally by timekeeping core, please ignore.
* @xtime_interval: Used internally by timekeeping core, please ignore.
+ * @xtime_nsec: Used internally by timekeeping core, please ignore.
+ * @error: Used internally by timekeeping core, please ignore.
*/
struct clocksource {
/*
@@ -82,7 +86,10 @@ struct clocksource {
* Keep it in a different cache line to dirty no
* more than one cache line.
*/
- cycle_t cycle_last ____cacheline_aligned_in_smp;
+ struct {
+ cycle_t cycle_last, cycle_accumulated;
+ } ____cacheline_aligned_in_smp;
+
u64 xtime_nsec;
s64 error;
@@ -168,11 +175,44 @@ static inline cycle_t clocksource_read(s
}
/**
+ * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * @cs: pointer to clocksource being read
+ * @now: current cycle value
+ *
+ * Uses the clocksource to return the current cycle_t value.
+ * NOTE!!!: This is different from clocksource_read, because it
+ * returns the accumulated cycle value! Must hold xtime lock!
+ */
+static inline cycle_t
+clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+{
+ cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ offset += cs->cycle_accumulated;
+ return offset;
+}
+
+/**
+ * clocksource_accumulate: - Accumulates clocksource cycles
+ * @cs: pointer to clocksource being read
+ * @now: current cycle value
+ *
+ * Used to avoid clocksource hardware overflow by periodically
+ * accumulating the current cycle delta. Must hold xtime write lock!
+ */
+static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
+{
+ cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ cs->cycle_last = now;
+ cs->cycle_accumulated += offset;
+}
+
+/**
* cyc2ns - converts clocksource cycles to nanoseconds
* @cs: Pointer to clocksource
* @cycles: Cycles
*
* Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds.
+ * Must hold xtime lock!
*
* XXX - This could use some mult_lxl_ll() asm optimization
*/
@@ -184,13 +224,27 @@ static inline s64 cyc2ns(struct clocksou
}
/**
+ * ns2cyc - converts nanoseconds to clocksource cycles
+ * @cs: Pointer to clocksource
+ * @nsecs: Nanoseconds
+ */
+static inline cycle_t ns2cyc(struct clocksource *cs, u64 nsecs)
+{
+ cycle_t ret = nsecs << cs->shift;
+
+ do_div(ret, cs->mult + 1);
+
+ return ret;
+}
+
+/**
* clocksource_calculate_interval - Calculates a clocksource interval struct
*
* @c: Pointer to clocksource.
* @length_nsec: Desired interval length in nanoseconds.
*
* Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
- * pair and interval request.
+ * pair and interval request. Must hold xtime_lock!
*
* Unless you're the timekeeping code, you should not be using this!
*/
Index: linux-compile-i386.git/kernel/time/timekeeping.c
===================================================================
--- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 15:17:31.000000000 -0500
@@ -66,16 +66,10 @@ static struct clocksource *clock; /* poi
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_now, cycle_delta;
+ cycle_t cycle_delta;
s64 ns_offset;
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
+ cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
ns_offset = cyc2ns(clock, cycle_delta);
return ns_offset;
@@ -195,7 +189,7 @@ static void change_clocksource(void)
clock = new;
clock->cycle_last = now;
-
+ clock->cycle_accumulated = 0;
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -205,9 +199,15 @@ static void change_clocksource(void)
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
clock->name);
}
+
+void timekeeping_accumulate(void)
+{
+ clocksource_accumulate(clock, clocksource_read(clock));
+}
#else
static inline void change_clocksource(void) { }
static inline s64 __get_nsec_offset(void) { return 0; }
+void timekeeping_accumulate(void) { }
#endif
/**
@@ -302,6 +302,7 @@ static int timekeeping_resume(struct sys
timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
/* re-base the last cycle value */
clock->cycle_last = clocksource_read(clock);
+ clock->cycle_accumulated = 0;
clock->error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -448,27 +449,29 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
- cycle_t offset;
+ cycle_t cycle_now, offset;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;
#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ cycle_now = clocksource_read(clock);
#else
- offset = clock->cycle_interval;
+ cycle_now = clock->cycle_last + clock->cycle_interval;
#endif
+ offset = (cycle_now - clock->cycle_last) & clock->mask;
+ clocksource_accumulate(clock, cycle_now);
+
clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (clock->cycle_accumulated >= clock->cycle_interval) {
/* accumulate one interval */
clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
+ clock->cycle_accumulated -= clock->cycle_interval;
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
@@ -482,7 +485,7 @@ void update_wall_time(void)
}
/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ clocksource_adjust(clock->cycle_accumulated);
/* store full nanoseconds into xtime */
xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
Index: linux-compile-i386.git/include/asm-x86/vgtod.h
===================================================================
--- linux-compile-i386.git.orig/include/asm-x86/vgtod.h 2008-01-09 14:07:34.000000000 -0500
+++ linux-compile-i386.git/include/asm-x86/vgtod.h 2008-01-09 14:17:53.000000000 -0500
@@ -15,7 +15,7 @@ struct vsyscall_gtod_data {
struct timezone sys_tz;
struct { /* extract of a clocksource struct */
cycle_t (*vread)(void);
- cycle_t cycle_last;
+ cycle_t cycle_last, cycle_accumulated;
cycle_t mask;
u32 mult;
u32 shift;
--
On Wed, 2008-01-09 at 18:29 -0500, Steven Rostedt wrote:
> plain text document attachment (rt-time-starvation-fix.patch)
> Handle accurate time even if there's a long delay between
> accumulated clock cycles.
>
> Signed-off-by: John Stultz <[email protected]>
> Signed-off-by: Steven Rostedt <[email protected]>
> ---
> arch/x86/kernel/vsyscall_64.c | 5 ++-
> include/asm-x86/vgtod.h | 2 -
> include/linux/clocksource.h | 58 ++++++++++++++++++++++++++++++++++++++++--
> kernel/time/timekeeping.c | 35 +++++++++++++------------
> 4 files changed, 80 insertions(+), 20 deletions(-)
>
> linux-2.6.21-rc5_cycles-accumulated_C7.patch
^^ An oldie but a goodie?
I was just reminded that in the time since 2.6.21-rc5, other arches
besides x86_64 have gained vgettimeofday implementations, and thus will
need similar update_vsyscall() tweaks as seen below to get the correct
time.
Here's the fix for x86_64:
> Index: linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c
> ===================================================================
> --- linux-compile-i386.git.orig/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:10:20.000000000 -0500
> +++ linux-compile-i386.git/arch/x86/kernel/vsyscall_64.c 2008-01-09 14:17:53.000000000 -0500
> @@ -86,6 +86,7 @@ void update_vsyscall(struct timespec *wa
> vsyscall_gtod_data.clock.mask = clock->mask;
> vsyscall_gtod_data.clock.mult = clock->mult;
> vsyscall_gtod_data.clock.shift = clock->shift;
> + vsyscall_gtod_data.clock.cycle_accumulated = clock->cycle_accumulated;
> vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
> vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
> vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
> @@ -121,7 +122,7 @@ static __always_inline long time_syscall
>
> static __always_inline void do_vgettimeofday(struct timeval * tv)
> {
> - cycle_t now, base, mask, cycle_delta;
> + cycle_t now, base, accumulated, mask, cycle_delta;
> unsigned seq;
> unsigned long mult, shift, nsec;
> cycle_t (*vread)(void);
> @@ -135,6 +136,7 @@ static __always_inline void do_vgettimeo
> }
> now = vread();
> base = __vsyscall_gtod_data.clock.cycle_last;
> + accumulated = __vsyscall_gtod_data.clock.cycle_accumulated;
> mask = __vsyscall_gtod_data.clock.mask;
> mult = __vsyscall_gtod_data.clock.mult;
> shift = __vsyscall_gtod_data.clock.shift;
> @@ -145,6 +147,7 @@ static __always_inline void do_vgettimeo
>
> /* calculate interval: */
> cycle_delta = (now - base) & mask;
> + cycle_delta += accumulated;
> /* convert to nsecs: */
> nsec += (cycle_delta * mult) >> shift;
Tony: ia64 also needs something like this, but I found the fsyscall asm
bits a little difficult to grasp. So I'll need some assistance on how to
include the accumulated cycles into the final calculation.
The following is a quick and dirty fix for powerpc so it includes
cycle_accumulated in its calculation. It relies on the fact that the
powerpc clocksource is a 64bit counter (don't have to worry about
multiple overflows), so the subtraction should be safe.
Signed-off-by: John Stultz <[email protected]>
Index: 2.6.24-rc5/arch/powerpc/kernel/time.c
===================================================================
--- 2.6.24-rc5.orig/arch/powerpc/kernel/time.c 2008-01-09 15:17:32.000000000 -0800
+++ 2.6.24-rc5/arch/powerpc/kernel/time.c 2008-01-09 15:17:43.000000000 -0800
@@ -773,7 +773,7 @@ void update_vsyscall(struct timespec *wa
stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
do_div(stamp_xsec, 1000000000);
stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
- update_gtod(clock->cycle_last, stamp_xsec, t2x);
+ update_gtod(clock->cycle_last-clock->cycle_accumulated, stamp_xsec, t2x);
}
void update_vsyscall_tz(void)
On Wed, 9 Jan 2008, john stultz wrote:
> > ---
> > arch/x86/kernel/vsyscall_64.c | 5 ++-
> > include/asm-x86/vgtod.h | 2 -
> > include/linux/clocksource.h | 58 ++++++++++++++++++++++++++++++++++++++++--
> > kernel/time/timekeeping.c | 35 +++++++++++++------------
> > 4 files changed, 80 insertions(+), 20 deletions(-)
> >
> > linux-2.6.21-rc5_cycles-accumulated_C7.patch
> ^^ An oldie but a goodie?
Hehe, I got this directly from the RT queue.
>
> I was just reminded that in the time since 2.6.21-rc5, other arches
> beside x86_64 have gained vgettimeofday implementations, and thus will
> need similar update_vsyscall() tweaks as seen below to get the correct
> time.
>
> Here's the fix for x86_64:
Thanks! I'll fold this into this patch for the next release.
-- Steve
On Wed, 2008-01-09 at 18:29 -0500, Steven Rostedt wrote:
> plain text document attachment (rt-time-starvation-fix.patch)
> Handle accurate time even if there's a long delay between
> accumulated clock cycles.
>
> Signed-off-by: John Stultz <[email protected]>
> Signed-off-by: Steven Rostedt <[email protected]>
Hrmm.. One more item I just noticed.
> Index: linux-compile-i386.git/kernel/time/timekeeping.c
> ===================================================================
> --- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:07:34.000000000 -0500
> +++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 15:17:31.000000000 -0500
> @@ -448,27 +449,29 @@ static void clocksource_adjust(s64 offse
> */
> void update_wall_time(void)
> {
> - cycle_t offset;
> + cycle_t cycle_now, offset;
>
> /* Make sure we're fully resumed: */
> if (unlikely(timekeeping_suspended))
> return;
>
> #ifdef CONFIG_GENERIC_TIME
> - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
> + cycle_now = clocksource_read(clock);
> #else
> - offset = clock->cycle_interval;
> + cycle_now = clock->cycle_last + clock->cycle_interval;
> #endif
> + offset = (cycle_now - clock->cycle_last) & clock->mask;
It seems this offset addition was to merge against the colliding
xtime_cache changes in mainline. However, I don't think it's quite right,
and it might be causing incorrect time() or vtime() results if NO_HZ is
enabled.
> + clocksource_accumulate(clock, cycle_now);
> +
> clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
>
> /* normally this loop will run just once, however in the
> * case of lost or late ticks, it will accumulate correctly.
> */
> - while (offset >= clock->cycle_interval) {
> + while (clock->cycle_accumulated >= clock->cycle_interval) {
> /* accumulate one interval */
> clock->xtime_nsec += clock->xtime_interval;
> - clock->cycle_last += clock->cycle_interval;
> - offset -= clock->cycle_interval;
> + clock->cycle_accumulated -= clock->cycle_interval;
>
> if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
> clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
> @@ -482,7 +485,7 @@ void update_wall_time(void)
> }
>
> /* correct the clock when NTP error is too big */
> - clocksource_adjust(offset);
> + clocksource_adjust(clock->cycle_accumulated);
I suspect the following is needed, but haven't been able to test it yet.
thanks
-john
Fixup merge between xtime_cache and timekeeping starvation fix.
Signed-off-by: John Stultz <[email protected]>
Index: 2.6/kernel/time/timekeeping.c
===================================================================
--- 2.6.orig/kernel/time/timekeeping.c 2008-01-09 16:12:49.000000000 -0800
+++ 2.6/kernel/time/timekeeping.c 2008-01-09 16:13:18.000000000 -0800
@@ -449,7 +449,7 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
- cycle_t cycle_now, offset;
+ cycle_t cycle_now;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -460,7 +460,6 @@ void update_wall_time(void)
#else
cycle_now = clock->cycle_last + clock->cycle_interval;
#endif
- offset = (cycle_now - clock->cycle_last) & clock->mask;
clocksource_accumulate(clock, cycle_now);
clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
@@ -491,7 +490,7 @@ void update_wall_time(void)
xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
- update_xtime_cache(cyc2ns(clock, offset));
+ update_xtime_cache(cyc2ns(clock, clock->cycle_accumulated));
/* check to see if there is a new clocksource to use */
change_clocksource();
On Wed, 9 Jan 2008, john stultz wrote:
> > Index: linux-compile-i386.git/kernel/time/timekeeping.c
> > ===================================================================
> > --- linux-compile-i386.git.orig/kernel/time/timekeeping.c 2008-01-09 14:07:34.000000000 -0500
> > +++ linux-compile-i386.git/kernel/time/timekeeping.c 2008-01-09 15:17:31.000000000 -0500
> > @@ -448,27 +449,29 @@ static void clocksource_adjust(s64 offse
> > */
> > void update_wall_time(void)
> > {
> > - cycle_t offset;
> > + cycle_t cycle_now, offset;
> >
> > /* Make sure we're fully resumed: */
> > if (unlikely(timekeeping_suspended))
> > return;
> >
> > #ifdef CONFIG_GENERIC_TIME
> > - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
> > + cycle_now = clocksource_read(clock);
> > #else
> > - offset = clock->cycle_interval;
> > + cycle_now = clock->cycle_last + clock->cycle_interval;
> > #endif
> > + offset = (cycle_now - clock->cycle_last) & clock->mask;
>
> It seems this offset addition was to merge against the colliding
> xtime_cache changes in mainline. However, I don't think its quite right,
> and might be causing incorrect time() or vtime() results if NO_HZ is
> enabled.
Yeah, this has had a few clashes in its life in the RT kernel.
>
> > + clocksource_accumulate(clock, cycle_now);
> > +
> > clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
> >
> > /* normally this loop will run just once, however in the
> > * case of lost or late ticks, it will accumulate correctly.
> > */
> > - while (offset >= clock->cycle_interval) {
> > + while (clock->cycle_accumulated >= clock->cycle_interval) {
> > /* accumulate one interval */
> > clock->xtime_nsec += clock->xtime_interval;
> > - clock->cycle_last += clock->cycle_interval;
> > - offset -= clock->cycle_interval;
> > + clock->cycle_accumulated -= clock->cycle_interval;
> >
> > if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
> > clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
> > @@ -482,7 +485,7 @@ void update_wall_time(void)
> > }
> >
> > /* correct the clock when NTP error is too big */
> > - clocksource_adjust(offset);
> > + clocksource_adjust(clock->cycle_accumulated);
>
>
> I suspect the following is needed, but haven't been able to test it yet.
Thanks, I'll pull it in and start testing it.
-- Steve
> Tony: ia64 also needs something like this, but I found the fsyscall asm
> bits a little difficult to grasp. So I'll need some assistance on how to
> include the accumulated cycles into the final calculation.
I'm trying to figure out all the ramifications of the new
"cycle_accumulated" field. Does it really need to be
propagated all the way to the low level assembler (which
I don't want to mess with unless I really, really have to)?
Can't I do the necessary calculations in update_vsyscall()
[where I can do them in C :-)] and keep the same low
level assembly code? I think I must be missing some
important bit of what is going on here.
-Tony
> I'm trying to figure out all the ramifications of the new
> "cycle_accumulated" field. Does it really need to be
John,
Before we hardcode these names, can we change them? Later in the series I
use something called 'cycle_raw' which really should be called
'cycle_accumulated'. Since cycle_accumulated IIRC can go backwards.
Or do you think I should rename cycle_raw to cycle_monotonic?
-- Steve
On Thu, 2008-01-10 at 11:54 -0800, Tony Luck wrote:
> > Tony: ia64 also needs something like this, but I found the fsyscall asm
> > bits a little difficult to grasp. So I'll need some assistance on how to
> > include the accumulated cycles into the final calculation.
>
> I'm trying to figure out all the ramifications of the new
> "cycle_accumulated" field. Does it really need to be
> propagated all the way to the low level assembler (which
> I don't want to mess with unless I really, really have to).
> Can't I do the necessary calculations in update_vsyscall()
> [Where I can do them in C :-)] and keep the same low
> level assembly code. I think I must be missing some
> important bit of what is going on here.
(Added Bob Picco to the mail, as he was involved in the ia64 clocksource
work).
So the background on the patch is this:
Some clocksources wrap frequently (every few seconds, for example). This
can cause problems if we defer update_wall_time(), where we accumulate
time, for too long (right now this really only happens on -rt systems).
To avoid that issue, we've added the cycle_accumulated value, which acts
as a midpoint, where we can quickly accumulate cycles off of the
counter, without doing the more expensive update_wall_time() function.
This avoids issues with the clocksource wrapping, but requires that
cycle_accumulated be added into the gettimeofday() calculation.
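To make that concrete, the per-arch fast path has to end up doing the same
arithmetic as the x86_64 hunk above. A rough C sketch, with "gtod" standing
in for whatever per-arch copy of the clocksource fields the fast path reads
(the name is illustrative only, not an existing symbol):

	cycle_t now, delta;
	u64 nsec;

	now = gtod->vread();				/* raw hardware counter */
	delta = (now - gtod->cycle_last) & gtod->mask;	/* since the last accumulation */
	delta += gtod->cycle_accumulated;		/* cycles already pulled off the
							   counter but not yet folded
							   into xtime */
	nsec = (delta * gtod->mult) >> gtod->shift;	/* same math as cyc2ns() */

nsec here is the offset to add on top of the last published wall time, just
as in do_vgettimeofday() above.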
As you may have noticed in my earlier mail, the fix for ppc was a bit
easier, as it has only a 64bit counter that is quite unlikely to wrap twice
between calls to update_wall_time(). There we could decrement the
cycle_last value by cycle_accumulated and get the same effect of adding it
in.
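(That works because, with a 64bit counter and no mask to wrap through,
now - (cycle_last - cycle_accumulated) = (now - cycle_last) + cycle_accumulated,
which is exactly the sum the other arches have to compute explicitly; the
one-line powerpc change above relies on that identity.)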
Unfortunately, on ia64 I suspect it will be necessary to do something
similar to the x86_64 code and add the accumulated cycles value into the
vgettime/fgettime calculation, since there is the possibility of quickly
wrapping clocksources on that architecture.
So unless someone can point out a nicer trick, it likely means adding a
new cycle_accumulated value to the fsyscall structure and the asm to do
the addition. :(
thanks
-john
On Thu, 2008-01-10 at 15:15 -0500, Steven Rostedt wrote:
> > I'm trying to figure out all the ramifications of the new
> > "cycle_accumulated" field. Does it really need to be
>
> John,
>
> Before we hardcode these names, can we change them? Later in the series I
> use something called 'cycle_raw' which really should be called
> 'cycle_accumulated'. Since cycle_accumulated IIRC can go backwards.
Err... I don't think the current cycle_accumulated can go backwards. Or
maybe I'm missing what you mean.
> Or do you think I should rename cycle_raw to cycle_monotonic?
That's fine by me either way.
thanks
-john
* john stultz ([email protected]) wrote:
>
> On Thu, 2008-01-10 at 11:54 -0800, Tony Luck wrote:
> > > Tony: ia64 also needs something like this, but I found the fsyscall asm
> > > bits a little difficult to grasp. So I'll need some assistance on how to
> > > include the accumulated cycles into the final calculation.
> >
> > I'm trying to figure out all the ramifications of the new
> > "cycle_accumulated" field. Does it really need to be
> > propagated all the way to the low level assembler (which
> > I don't want to mess with unless I really, really have to).
> > Can't I do the necessary calculations in update_vsyscall()
> > [Where I can do them in C :-)] and keep the same low
> > level assembly code. I think I must be missing some
> > important bit of what is going on here.
>
> (Added Bob Picco to the mail, as he was involved in the ia64 clocksource
> work).
>
> So the background on the patch is this:
>
> Some clocksources wrap frequently (every few seconds, for example). This
> can cause issues if we defer the update_wall_time() function where we
> accumulate time for too long (this really only happens on -rt systems
> right now).
>
> To avoid that issue, we've added the cycle_accumulated value, which acts
> as a midpoint, where we can quickly accumulate cycles off of the
> counter, without doing the more expensive update_wall_time() function.
> This avoids issues with the clocksource wrapping, but requires that
> cycle_accumulated be added in to the gettiemofday() calculation.
>
> If you noticed in my email, the fix for ppc was a bit easier, as it has
> only a 64bit counter that is quite unlikely to wrap twice between calls
> to update_wall_time(). There we could decrement the cycles_last value by
> cycles_accumulated and get the same effect of adding it in.
>
> Unfortunately on ia64, I suspect it will be necessary to do similar to
> the x86_64 code and add in the cycles accumulated value in
> vgettime/fgettime function, since there is the possibility of quickly
> wrapping clocksources on that architecture.
>
> So unless someone can point out a nicer trick, it likely means adding a
> new cycles_accumulated value to the fsyscall structure and the asm to do
> the addition. :(
>
I think it's about time I introduce the approach I have taken for LTTng
timestamping. Basically, one of the main issues with the clock sources
is the xtime lock: having a read seqlock nested over a write seqlock is
a really, really bad idea. This can happen with NMIs, and it would cause
a deadlock.
What I have done is an RCU algorithm that extends a 32-bit TSC (that's
the case on MIPS, for instance) to 64 bits. The update of the MSBs is
done by a periodic timer (fired often enough to make sure we always
detect the wrap-around of the 32 LSBs) and the read side only has to
disable preemption.
I use a 2-slot array, with the slots alternately holding the last 64-bit
counter value, to implement the RCU algorithm.
Since we are discussing time source modification, this is one that I
would really like to see in the Linux kernel: it would provide the kind
of time source needed for function entry/exit tracing and for generic
kernel tracing as well.
Mathieu
Here is the patch, for reference. It applies on 2.6.24-rc7, after some
other LTTng patches in my patchset.
LTTng timestamp
LTTng synthetic TSC code for timestamping. Extracts a 64-bit TSC from a 32-bit
counter, kept up to date by a periodic timer interrupt. Lockless.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
init/Kconfig | 2
ltt/Kconfig | 17 ++++
ltt/Makefile | 1
ltt/ltt-timestamp.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 218 insertions(+)
Index: linux-2.6-lttng/ltt/ltt-timestamp.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/ltt-timestamp.c 2007-12-05 20:54:48.000000000 -0500
@@ -0,0 +1,198 @@
+/*
+ * (C) Copyright 2006,2007 -
+ * Mathieu Desnoyers ([email protected])
+ *
+ * notes : ltt-timestamp timer-based clock cannot be used for early tracing in
+ * the boot process, as it depends on timer interrupts.
+ *
+ * The timer needs to be only on one CPU to support hotplug.
+ * We have the choice between schedule_delayed_work_on and an IPI to get each
+ * CPU to write the heartbeat. IPI has been chosen because it is considered
+ * faster than passing through the timer to get the work scheduled on all the
+ * CPUs.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/ltt.h>
+#include <linux/smp.h>
+#include <linux/sched.h> /* FIX for m68k local_irq_enable in on_each_cpu */
+
+atomic_t lttng_generic_clock;
+EXPORT_SYMBOL(lttng_generic_clock);
+
+/* Expected maximum interrupt latency in ms : 15ms, *2 for security */
+#define EXPECTED_INTERRUPT_LATENCY 30
+
+static struct timer_list stsc_timer;
+static unsigned int precalc_expire;
+
+/* For architectures with 32 bits TSC */
+static struct synthetic_tsc_struct {
+ u32 tsc[2][2]; /* a pair of 2 32 bits. [0] is the MSB, [1] is LSB */
+ unsigned int index; /* Index of the current synth. tsc. */
+} ____cacheline_aligned synthetic_tsc[NR_CPUS];
+
+/* Called from IPI : either in interrupt or process context */
+static void ltt_update_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u32 tsc;
+
+ preempt_disable();
+ cpu_synth = &synthetic_tsc[smp_processor_id()];
+ tsc = ltt_get_timestamp32(); /* We deal with a 32 LSB TSC */
+
+ if (tsc < cpu_synth->tsc[cpu_synth->index][1]) {
+ unsigned int new_index = cpu_synth->index ? 0 : 1; /* 0 <-> 1 */
+ /*
+ * Overflow
+ * Non atomic update of the non current synthetic TSC, followed
+ * by an atomic index change. There is no write concurrency,
+ * so the index read/write does not need to be atomic.
+ */
+ cpu_synth->tsc[new_index][1] = tsc; /* LSB update */
+ cpu_synth->tsc[new_index][0] =
+ cpu_synth->tsc[cpu_synth->index][0]+1; /* MSB update */
+ cpu_synth->index = new_index; /* atomic change of index */
+ } else {
+ /*
+ * No overflow : we can simply update the 32 LSB of the current
+ * synthetic TSC as it's an atomic write.
+ */
+ cpu_synth->tsc[cpu_synth->index][1] = tsc;
+ }
+ preempt_enable();
+}
+
+/* Called from buffer switch : in _any_ context (even NMI) */
+u64 ltt_read_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 ret;
+ unsigned int index;
+ u32 tsc;
+
+ preempt_disable();
+ cpu_synth = &synthetic_tsc[smp_processor_id()];
+ index = cpu_synth->index; /* atomic read */
+ tsc = ltt_get_timestamp32(); /* We deal with a 32 LSB TSC */
+
+ if (tsc < cpu_synth->tsc[index][1]) {
+ /* Overflow */
+ ret = ((u64)(cpu_synth->tsc[index][0]+1) << 32) | ((u64)tsc);
+ } else {
+ /* no overflow */
+ ret = ((u64)cpu_synth->tsc[index][0] << 32) | ((u64)tsc);
+ }
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_read_synthetic_tsc);
+
+static void synthetic_tsc_ipi(void *info)
+{
+ ltt_update_synthetic_tsc();
+}
+
+/* We need to be in process context to do an IPI */
+static void synthetic_tsc_work(struct work_struct *work)
+{
+ on_each_cpu(synthetic_tsc_ipi, NULL, 1, 1);
+}
+static DECLARE_WORK(stsc_work, synthetic_tsc_work);
+
+/*
+ * stsc_timer : - Timer function synchronizing synthetic TSC.
+ * @data: unused
+ *
+ * Guarantees at least 1 execution before low word of TSC wraps.
+ */
+static void stsc_timer_fct(unsigned long data)
+{
+ PREPARE_WORK(&stsc_work, synthetic_tsc_work);
+ schedule_work(&stsc_work);
+
+ mod_timer(&stsc_timer, jiffies + precalc_expire);
+}
+
+/*
+ * precalc_stsc_interval: - Precalculates the interval between the 32 bits TSC
+ * wraparounds.
+ */
+static int __init precalc_stsc_interval(void)
+{
+ unsigned long mask;
+
+ mask = 0xFFFFFFFFUL;
+ precalc_expire =
+ (mask/((ltt_frequency() / HZ * ltt_freq_scale()) << 1)
+ - 1 - (EXPECTED_INTERRUPT_LATENCY*HZ/1000)) >> 1;
+ WARN_ON(precalc_expire == 0);
+ printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n",
+ precalc_expire);
+ return 0;
+}
+
+/*
+ * hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Sets the new CPU's current synthetic TSC to the same value as the
+ * currently running CPU.
+ *
+ * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
+ */
+static int __cpuinit hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 local_count;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ cpu_synth = &synthetic_tsc[hotcpu];
+ local_count = ltt_read_synthetic_tsc();
+ cpu_synth->tsc[0][1] = (u32)local_count; /* LSB */
+ cpu_synth->tsc[0][0] = (u32)(local_count >> 32); /* MSB */
+ cpu_synth->index = 0;
+ smp_wmb(); /* Writing in data of CPU about to come up */
+ break;
+ case CPU_ONLINE:
+ /*
+ * FIXME : heartbeat events are currently broken with CPU
+ * hotplug : events can be recorded before heartbeat, heartbeat
+ * too far from trace start and are broken with trace stop/start
+ * as well.
+ */
+ /* As we are preemptible, make sure it runs on the right cpu */
+ smp_call_function_single(hotcpu, synthetic_tsc_ipi, NULL, 1, 0);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/* Called from one CPU, before any tracing starts, to init each structure */
+static int __init ltt_init_synthetic_tsc(void)
+{
+ int cpu;
+ hotcpu_notifier(hotcpu_callback, 3);
+ precalc_stsc_interval();
+ init_timer(&stsc_timer);
+ stsc_timer.function = stsc_timer_fct;
+ stsc_timer.expires = jiffies + precalc_expire;
+ add_timer(&stsc_timer);
+ return 0;
+}
+
+__initcall(ltt_init_synthetic_tsc);
Index: linux-2.6-lttng/ltt/Kconfig
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/Kconfig 2007-12-05 20:54:48.000000000 -0500
@@ -0,0 +1,17 @@
+menu "Linux Trace Toolkit"
+
+config LTT_TIMESTAMP
+ bool "LTTng fine-grained timestamping"
+ default y
+ help
+ Allow fine-grained timestamps to be taken from tracing applications.
+
+config HAVE_LTT_CLOCK
+ def_bool n
+
+config HAVE_LTT_SYNTHETIC_TSC
+ bool
+ default y if (!HAVE_LTT_CLOCK)
+ default n if HAVE_LTT_CLOCK
+
+endmenu
Index: linux-2.6-lttng/ltt/Makefile
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/Makefile 2007-12-05 20:54:48.000000000 -0500
@@ -0,0 +1 @@
+obj-$(CONFIG_HAVE_LTT_SYNTHETIC_TSC) += ltt-timestamp.o
Index: linux-2.6-lttng/init/Kconfig
===================================================================
--- linux-2.6-lttng.orig/init/Kconfig 2007-12-05 20:53:35.000000000 -0500
+++ linux-2.6-lttng/init/Kconfig 2007-12-05 20:54:48.000000000 -0500
@@ -682,6 +682,8 @@ config MARKERS
Place an empty function call at each marker site. Can be
dynamically changed for a probe function.
+source "ltt/Kconfig"
+
source "arch/Kconfig"
config DISABLE_IMMEDIATE
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Thu, 2008-01-10 at 15:42 -0500, Mathieu Desnoyers wrote:
> I think it's about time I introduce the approach I have taken for LTTng
> timestamping. Basically, one of the main issues with the clock sources
> is the xtime lock : having a read seqlock nested over a write seqlock is
> a really, really bad idea. This can happen with NMIs. Basically, it
> would cause a deadlock.
>
> What I have done is an RCU algorithm that extends a 32 bits TSC (that's
> the case on MIPS, for instance) to 64 bits. The update of the MSBs is
> done by a periodical timer (fired often enough to make sure we always
> detect the 32 LSBs wrap-around) and the read-side only has to disable
> preemption.
>
> I use a 2 slots array, each of them keeping, alternatively, the last 64
> bits counter value, to implement the RCU algorithm.
>
> Since we are discussing time source modification, this is one that I
> would really like to see in the Linux kernel : it would provide the kind
> of time source needed for function entry/exit tracing and for generic
> kernel tracing as well.
Hmm. I know powerpc has had a similar lock-free dual-structure method,
and for just a raw cycles-based method like you've shown below (or for
some of the bits Steven is working on), I think it should be fine.
The concern I've had with this method for general timekeeping is that
I'm not sure it can handle the frequency corrections made by NTP. Since
we have to make sure time does not jump backwards, consider this
exaggerated situation:
time = base + (now - last)*mult;
So we have two structures:
base: 60 base: 180
last: 10 last: 30
mult: 06 mult: 05
Where the second structure has just been updated lock-free, however just
before the atomic pointer switch we were preempted, or somehow delayed,
and some time has passed.
Now imagine two cpus race to get the time. Both read the same now
value, but get different structure pointer values. (Note: You can create
the same race if you reverse the order and grab the pointer first, then
the cycle. However I think this example makes it easier to understand).
now = 50
cpu1:
60 + (50-10)*6 = 300
cpu2:
180 + (50-30)*5 = 280
Alternatively:
now=50: 60 + (50-10)*6 = 300
now=51: 180 + (51-30)*5 = 285
Eek. That's not good.
I'm not sure how this can be avoided, but I'd be very interested in
hearing ideas! Bounding the issue is a possibility, but that then starts
to run amok with NO_HZ and -rt deferment.
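For reference, a stripped-down C sketch of the kind of lock-free read being
discussed; the structure and names are made up for illustration (this is not
the powerpc or LTTng code):

	struct time_snap {
		u64	base;	/* nanoseconds accumulated at the last update */
		cycle_t	last;	/* counter value sampled at the last update   */
		u32	mult;	/* current NTP-adjusted multiplier            */
	};

	static struct time_snap snaps[2];
	static struct time_snap *cur = &snaps[0];	/* writer fills the idle
							   slot, then flips */

	static u64 read_time(cycle_t now)
	{
		struct time_snap *s = cur;	/* racy, unsynchronized read */

		return s->base + (now - s->last) * s->mult;
	}

Two readers that sample the same "now" but observe "cur" on opposite sides of
the writer's flip compute the 300 vs 280 results above, and nothing on the
read side can tell the two apart.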
thanks
-john
> If you noticed in my email, the fix for ppc was a bit easier, as it has
> only a 64bit counter that is quite unlikely to wrap twice between calls
> to update_wall_time().
"quite unlikely" ...
Hmmm, just how fast are you driving the clocks on your ppc? Even at 100GHz
it is almost SIX YEARS between wrap-arounds of a 64-bit counter.
Perhaps you could just have a cron job that forces a call to update_wall_time()
every January 1st, rather than add extra code overhead to a hot path :-) ?
But I agree that narrower counters are a problem.
-Tony
* john stultz ([email protected]) wrote:
>
> On Thu, 2008-01-10 at 15:42 -0500, Mathieu Desnoyers wrote:
> > I think it's about time I introduce the approach I have taken for LTTng
> > timestamping. Basically, one of the main issues with the clock sources
> > is the xtime lock : having a read seqlock nested over a write seqlock is
> > a really, really bad idea. This can happen with NMIs. Basically, it
> > would cause a deadlock.
> >
> > What I have done is an RCU algorithm that extends a 32 bits TSC (that's
> > the case on MIPS, for instance) to 64 bits. The update of the MSBs is
> > done by a periodical timer (fired often enough to make sure we always
> > detect the 32 LSBs wrap-around) and the read-side only has to disable
> > preemption.
> >
> > I use a 2 slots array, each of them keeping, alternatively, the last 64
> > bits counter value, to implement the RCU algorithm.
> >
> > Since we are discussing time source modification, this is one that I
> > would really like to see in the Linux kernel : it would provide the kind
> > of time source needed for function entry/exit tracing and for generic
> > kernel tracing as well.
>
> Hmm. I know powerpc has had a similar lock-free dual structure method
> and for just a raw cycles based method you've shown below (or for some
> of the bits Steven is working on), I think it should be fine.
>
> The concern I've had with this method for general timekeeping, is that
> I'm not sure it can handle the frequency corrections made by NTP. Since
> we have to make sure time does not jump backwards, consider this
> exaggerated situation:
>
> time = base + (now - last)*mult;
>
> So we have two structures:
> base: 60 base: 180
> last: 10 last: 30
> mult: 06 mult: 05
>
> Where the second structure has just been updated lock-free, however just
> before the atomic pointer switch we were preempted, or somehow delayed,
> and some time has past.
>
> Now imagine two cpus now race to get the time. Both read the same now
> value, but get different structure pointer values. (Note: You can create
> the same race if you reverse the order and grab the pointer first, then
> the cycle. However I think this example makes it easier to understand).
>
> now = 50
> cpu1:
> 60 + (50-10)*6 = 300
> cpu2:
> 180 + (50-30)*5 = 280
>
>
> Alternatively:
> now=50: 60 + (50-10)*6 = 300
> now=51: 180 + (51-30)*5 = 285
>
> Eek. That's not good.
>
> I'm not sure how this can be avoided, but I'd be very interested in
> hearing ideas! Bounding the issue is a possibility, but then starts to
> run amok with NO_HZ and -rt deferment.
>
> thanks
> -john
>
I suggest we try to see the problem differently (and see how far we can
get with this):
Let's suppose we have a 32-bit cycle counter given by the
architecture. We use the lockless algorithm to extend it to 64 bits: we
therefore have a 64-bit cycle counter that can be read locklessly.
Then, from this 64-bit counter (let's call it "now"), which has the
advantage of never overflowing (or only after enough years that nobody
cares), we can calculate the current time with:
time = base + (now - last) * mul
NTP would adjust time by modifying mul, last and base; base would be
recalculated as base + (now - last) * mul each time we modify the clock
rate (mul), and at the same time we would also read the current "now"
value (which is saved as "last"). This would be done system-wide and
would be kept in a data structure separate from the 2-slot array of
64-bit counter values. Ideally, this NTP correction update could also be
done atomically with a 2-slot array.
Whenever we need to read the time, we then have to ensure that the "now"
value we use is consistent with the current NTP time correction. We want
to eliminate races where we would use a value from the wrong NTP "window"
with a "now" value not belonging to this window. (An NTP window would be
defined by a tuple of base, last and mul values.)
If "now" is lower than "last", we are using an old timestamp with a
new copy of the structure and must therefore re-read the "now" value.
If, when we are about to return the calculated "time" value, we figure
out that the current NTP window pointer has changed, we must make sure
that the time value we are about to return is lower than the new base, or
otherwise time could go backward. If we detect that time is higher than
the new base, we re-read the "now" value and re-do the calculation.
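A rough C sketch of that read side, just to make the retry conditions
concrete (the struct and the current_window pointer are made-up names;
ltt_read_synthetic_tsc() is the extended 64-bit read from the patch earlier
in the thread):

	struct ntp_window {
		u64	base;
		u64	last;	/* extended "now" sampled at the last rate change */
		u32	mul;
	};

	static struct ntp_window *current_window;	/* set up at init,
							   flipped between 2 slots */

	static u64 read_time(void)
	{
		struct ntp_window *w;
		u64 now, time;

	retry:
		w = current_window;
		now = ltt_read_synthetic_tsc();

		if (now < w->last)		/* old timestamp, newer window */
			goto retry;

		time = w->base + (now - w->last) * w->mul;

		if (w != current_window && time > current_window->base)
			goto retry;		/* would run past the new base */

		return time;
	}

Whether those two checks cover the delayed-writer case described earlier is
the open question.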
Am I only partially crazy? ;)
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Thu, 10 Jan 2008, Mathieu Desnoyers wrote:
>
> Am I only partially crazy ? ;)
Not at all, you should just do some more shopping!
-- Steve
On Thu, 2008-01-10 at 17:00 -0500, Mathieu Desnoyers wrote:
> * john stultz ([email protected]) wrote:
> >
> > On Thu, 2008-01-10 at 15:42 -0500, Mathieu Desnoyers wrote:
> > > I think it's about time I introduce the approach I have taken for LTTng
> > > timestamping. Basically, one of the main issues with the clock sources
> > > is the xtime lock : having a read seqlock nested over a write seqlock is
> > > a really, really bad idea. This can happen with NMIs. Basically, it
> > > would cause a deadlock.
> > >
> > > What I have done is an RCU algorithm that extends a 32 bits TSC (that's
> > > the case on MIPS, for instance) to 64 bits. The update of the MSBs is
> > > done by a periodical timer (fired often enough to make sure we always
> > > detect the 32 LSBs wrap-around) and the read-side only has to disable
> > > preemption.
> > >
> > > I use a 2 slots array, each of them keeping, alternatively, the last 64
> > > bits counter value, to implement the RCU algorithm.
> > >
> > > Since we are discussing time source modification, this is one that I
> > > would really like to see in the Linux kernel : it would provide the kind
> > > of time source needed for function entry/exit tracing and for generic
> > > kernel tracing as well.
> >
> > Hmm. I know powerpc has had a similar lock-free dual structure method
> > and for just a raw cycles based method you've shown below (or for some
> > of the bits Steven is working on), I think it should be fine.
> >
> > The concern I've had with this method for general timekeeping, is that
> > I'm not sure it can handle the frequency corrections made by NTP. Since
> > we have to make sure time does not jump backwards, consider this
> > exaggerated situation:
> >
> > time = base + (now - last)*mult;
> >
> > So we have two structures:
> > base: 60 base: 180
> > last: 10 last: 30
> > mult: 06 mult: 05
> >
> > Where the second structure has just been updated lock-free, however just
> > before the atomic pointer switch we were preempted, or somehow delayed,
> > and some time has past.
> >
> > Now imagine two cpus now race to get the time. Both read the same now
> > value, but get different structure pointer values. (Note: You can create
> > the same race if you reverse the order and grab the pointer first, then
> > the cycle. However I think this example makes it easier to understand).
> >
> > now = 50
> > cpu1:
> > 60 + (50-10)*6 = 300
> > cpu2:
> > 180 + (50-30)*5 = 280
> >
> >
> > Alternatively:
> > now=50: 60 + (50-10)*6 = 300
> > now=51: 180 + (51-30)*5 = 285
> >
> > Eek. That's not good.
> >
> > I'm not sure how this can be avoided, but I'd be very interested in
> > hearing ideas! Bounding the issue is a possibility, but then starts to
> > run amok with NO_HZ and -rt deferment.
> >
> > thanks
> > -john
> >
>
> I suggest we try to see the problem differently (and see how far we can
> get with this) :
>
> Let's suppose we have a 32 bits cycles counter given by the
> architecture. We use the lockless algorithm to extend it to 64 bits : we
> therefore have a 64 bits cycle counter that can be read locklessly.
>
> Then, from this 64 bits counter (let's call it "now") (which has the
> advantage of never overflowing, or after enough years so nobody
> cares...), we can calculate the current time with :
Hmm. Maybe I'm missing something here. I'm not sure I'm following the
importance of the 64bit extension.
The clocksource code deals with counters in a range of widths (from
64bit TSC to 24bit ACPI PM). The only requirement there is that we
accumulate often enough that the counter doesn't wrap twice between
accumulations. Currently this is done in update_wall_time(), but the
patch Steven sent that started this thread adds an interim step using
cycle_accumulated, allowing update_wall_time() to be deferred for longer
periods of time.
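(For scale: the ACPI PM timer is read as a 24-bit counter running at
3.579545 MHz, so it wraps roughly every 2^24 / 3579545 ~= 4.7 seconds;
accumulation has to keep happening at least every few seconds for a
clocksource like that, whereas a 64-bit TSC effectively never wraps.)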
I do see how the method you're describing could be applied to just the
cycle_accumulated management, and maybe that's the whole point?
However my concern is that when we do the frequency adjustment in
update_wall_time, I'm not sure the method works. Thus we still would
have to have a lock in there for gettimeofday().
But let's continue...
> time = base + (now - last) * mul
>
> NTP would adjust time by modifying mul, last and base; base would be
> recalculated from the formula with : base + (now - last) * mul each time
> we modify the clock rate (mul), we also read the current "now" value
> (which is saved as "last"). This would be done system-wide and would be
> kept in a data structure separate from the 2 64 bits slots array.
> Ideally, this NTP correction update could also be done atomically with a
> 2 slots array.
>
> Whenever we need to read the time, we then have to insure that the "now"
> value we use is consistent with the current NTP time correction. We want
> to eliminate races where we would use value from the wrong NTP "window"
> with a "now" value not belonging to this window. (an NTP window would be
> defined by a tuple of base, last and mul values)
>
> If "now" is lower than "last", we are using an old timestamp with a
> new copy of the structure and must therefore re-read the "now" value.
Ok, that would avoid one type of error, but in both of my examples in
the last mail, now was greater than last.
> If, when we are about to return the "time" value calculated, we figure
> out that the current NTP window pointer have changed, we must make sure
> that the time value we are about to return is lower than the new base or
> otherwise time could go backward. If we detect that time is higher than
> the new base, we re-read the "now" value and re-do the calculation.
Again, I'm not sure I see how this resolves the previous example given,
as in that case the update code was delayed in between its reading of
now and the final pointer change.
The issue is that the race isn't just between the readers and the
writer, but that time races against the writer as well. So if you don't lock
the readers out during the write, I'm not sure how you can avoid the
window for inconsistencies.
thanks
-john
On Thu, 2008-01-10 at 14:52 -0800, john stultz wrote:
> The issue is that the race isn't just between the readers and the
> writer, but that time races against writer as well. So if you don't lock
> the readers out during the write, I'm not sure how you can avoid the
> window for inconsistencies.
Maybe to state it more clearly, the issue is that in order to be atomic,
the writer must atomically access the clock, and make all the updates in
one step.
So if a reader accesses the clock after the writer accesses the clock,
but before the writer finishes the update, there is the possibility time
could go backwards.
Now, not to completely throw water on it, it is possible to set up a
bounds argument, and say that as long as NTP adjustments are less than X,
and the window between the writer starting and finishing the update is
less than Y, the resulting inconsistency is limited to Z. And if Z is
less than a nanosecond, then you're ok.
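(Working through the exaggerated numbers from earlier in the thread: the
discrepancy is (now - last_new) * |mult_old - mult_new| = (50 - 30) * (6 - 5)
= 20 units, i.e. Z scales with how long a reader can still observe the stale
structure after the writer sampled the counter, multiplied by the size of the
multiplier step. Real NTP corrections change mult by only a tiny fraction, so
Z can plausibly stay below a nanosecond as long as Y is kept small.)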
However, items like virtualization and the realtime patch can cause Y to
be stretched quite a bit, so finding a way to handle that would be
needed as well.
thanks
-john