From: John Stultz <[email protected]>
Handle accurate time even if there's a long delay between
accumulated clock cycles.
Signed-off-by: John Stultz <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/powerpc/kernel/time.c | 3 +-
arch/x86/kernel/vsyscall_64.c | 5 ++-
include/asm-x86/vgtod.h | 2 -
include/linux/clocksource.h | 58 ++++++++++++++++++++++++++++++++++++++++--
kernel/time/timekeeping.c | 36 +++++++++++++-------------
5 files changed, 82 insertions(+), 22 deletions(-)
Index: linux-mcount.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-mcount.git.orig/arch/x86/kernel/vsyscall_64.c 2008-01-30 14:47:08.000000000 -0500
+++ linux-mcount.git/arch/x86/kernel/vsyscall_64.c 2008-01-30 14:54:12.000000000 -0500
@@ -86,6 +86,7 @@ void update_vsyscall(struct timespec *wa
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = clock->mult;
vsyscall_gtod_data.clock.shift = clock->shift;
+ vsyscall_gtod_data.clock.cycle_accumulated = clock->cycle_accumulated;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
@@ -121,7 +122,7 @@ static __always_inline long time_syscall
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
- cycle_t now, base, mask, cycle_delta;
+ cycle_t now, base, accumulated, mask, cycle_delta;
unsigned seq;
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
@@ -135,6 +136,7 @@ static __always_inline void do_vgettimeo
}
now = vread();
base = __vsyscall_gtod_data.clock.cycle_last;
+ accumulated = __vsyscall_gtod_data.clock.cycle_accumulated;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
shift = __vsyscall_gtod_data.clock.shift;
@@ -145,6 +147,7 @@ static __always_inline void do_vgettimeo
/* calculate interval: */
cycle_delta = (now - base) & mask;
+ cycle_delta += accumulated;
/* convert to nsecs: */
nsec += (cycle_delta * mult) >> shift;
Index: linux-mcount.git/include/asm-x86/vgtod.h
===================================================================
--- linux-mcount.git.orig/include/asm-x86/vgtod.h 2008-01-30 14:35:51.000000000 -0500
+++ linux-mcount.git/include/asm-x86/vgtod.h 2008-01-30 14:54:12.000000000 -0500
@@ -15,7 +15,7 @@ struct vsyscall_gtod_data {
struct timezone sys_tz;
struct { /* extract of a clocksource struct */
cycle_t (*vread)(void);
- cycle_t cycle_last;
+ cycle_t cycle_last, cycle_accumulated;
cycle_t mask;
u32 mult;
u32 shift;
Index: linux-mcount.git/include/linux/clocksource.h
===================================================================
--- linux-mcount.git.orig/include/linux/clocksource.h 2008-01-30 14:35:51.000000000 -0500
+++ linux-mcount.git/include/linux/clocksource.h 2008-01-30 14:54:12.000000000 -0500
@@ -50,8 +50,12 @@ struct clocksource;
* @flags: flags describing special properties
* @vread: vsyscall based read
* @resume: resume function for the clocksource, if necessary
+ * @cycle_last: Used internally by timekeeping core, please ignore.
+ * @cycle_accumulated: Used internally by timekeeping core, please ignore.
* @cycle_interval: Used internally by timekeeping core, please ignore.
* @xtime_interval: Used internally by timekeeping core, please ignore.
+ * @xtime_nsec: Used internally by timekeeping core, please ignore.
+ * @error: Used internally by timekeeping core, please ignore.
*/
struct clocksource {
/*
@@ -82,7 +86,10 @@ struct clocksource {
* Keep it in a different cache line to dirty no
* more than one cache line.
*/
- cycle_t cycle_last ____cacheline_aligned_in_smp;
+ struct {
+ cycle_t cycle_last, cycle_accumulated;
+ } ____cacheline_aligned_in_smp;
+
u64 xtime_nsec;
s64 error;
@@ -168,11 +175,44 @@ static inline cycle_t clocksource_read(s
}
/**
+ * clocksource_get_cycles: - Access the clocksource's accumulated cycle value
+ * @cs: pointer to clocksource being read
+ * @now: current cycle value
+ *
+ * Uses the clocksource to return the current cycle_t value.
+ * NOTE!!!: This is different from clocksource_read, because it
+ * returns the accumulated cycle value! Must hold xtime lock!
+ */
+static inline cycle_t
+clocksource_get_cycles(struct clocksource *cs, cycle_t now)
+{
+ cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ offset += cs->cycle_accumulated;
+ return offset;
+}
+
+/**
+ * clocksource_accumulate: - Accumulates clocksource cycles
+ * @cs: pointer to clocksource being read
+ * @now: current cycle value
+ *
+ * Used to avoid clocksource hardware overflow by periodically
+ * accumulating the current cycle delta. Must hold xtime write lock!
+ */
+static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now)
+{
+ cycle_t offset = (now - cs->cycle_last) & cs->mask;
+ cs->cycle_last = now;
+ cs->cycle_accumulated += offset;
+}
+
+/**
* cyc2ns - converts clocksource cycles to nanoseconds
* @cs: Pointer to clocksource
* @cycles: Cycles
*
* Uses the clocksource and ntp adjustment to convert cycle_t values to nanoseconds.
+ * Must hold xtime lock!
*
* XXX - This could use some mult_lxl_ll() asm optimization
*/
@@ -184,13 +224,27 @@ static inline s64 cyc2ns(struct clocksou
}
/**
+ * ns2cyc - converts nanoseconds to clocksource cycles
+ * @cs: Pointer to clocksource
+ * @nsecs: Nanoseconds
+ */
+static inline cycle_t ns2cyc(struct clocksource *cs, u64 nsecs)
+{
+ cycle_t ret = nsecs << cs->shift;
+
+ do_div(ret, cs->mult + 1);
+
+ return ret;
+}
+
+/**
* clocksource_calculate_interval - Calculates a clocksource interval struct
*
* @c: Pointer to clocksource.
* @length_nsec: Desired interval length in nanoseconds.
*
* Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
- * pair and interval request.
+ * pair and interval request. Must hold xtime_lock!
*
* Unless you're the timekeeping code, you should not be using this!
*/
Index: linux-mcount.git/kernel/time/timekeeping.c
===================================================================
--- linux-mcount.git.orig/kernel/time/timekeeping.c 2008-01-30 14:35:51.000000000 -0500
+++ linux-mcount.git/kernel/time/timekeeping.c 2008-01-30 14:54:12.000000000 -0500
@@ -66,16 +66,10 @@ static struct clocksource *clock; /* poi
*/
static inline s64 __get_nsec_offset(void)
{
- cycle_t cycle_now, cycle_delta;
+ cycle_t cycle_delta;
s64 ns_offset;
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
+ cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock));
ns_offset = cyc2ns(clock, cycle_delta);
return ns_offset;
@@ -195,7 +189,7 @@ static void change_clocksource(void)
clock = new;
clock->cycle_last = now;
-
+ clock->cycle_accumulated = 0;
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -205,9 +199,15 @@ static void change_clocksource(void)
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
clock->name);
}
+
+void timekeeping_accumulate(void)
+{
+ clocksource_accumulate(clock, clocksource_read(clock));
+}
#else
static inline void change_clocksource(void) { }
static inline s64 __get_nsec_offset(void) { return 0; }
+void timekeeping_accumulate(void) { }
#endif
/**
@@ -302,6 +302,7 @@ static int timekeeping_resume(struct sys
timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
/* re-base the last cycle value */
clock->cycle_last = clocksource_read(clock);
+ clock->cycle_accumulated = 0;
clock->error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -448,27 +449,28 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
- cycle_t offset;
+ cycle_t cycle_now;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;
#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ cycle_now = clocksource_read(clock);
#else
- offset = clock->cycle_interval;
+ cycle_now = clock->cycle_last + clock->cycle_interval;
#endif
+ clocksource_accumulate(clock, cycle_now);
+
clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (clock->cycle_accumulated >= clock->cycle_interval) {
/* accumulate one interval */
clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
+ clock->cycle_accumulated -= clock->cycle_interval;
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
@@ -482,13 +484,13 @@ void update_wall_time(void)
}
/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ clocksource_adjust(clock->cycle_accumulated);
/* store full nanoseconds into xtime */
xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
- update_xtime_cache(cyc2ns(clock, offset));
+ update_xtime_cache(cyc2ns(clock, clock->cycle_accumulated));
/* check to see if there is a new clocksource to use */
change_clocksource();
Index: linux-mcount.git/arch/powerpc/kernel/time.c
===================================================================
--- linux-mcount.git.orig/arch/powerpc/kernel/time.c 2008-01-30 14:35:51.000000000 -0500
+++ linux-mcount.git/arch/powerpc/kernel/time.c 2008-01-30 14:54:12.000000000 -0500
@@ -773,7 +773,8 @@ void update_vsyscall(struct timespec *wa
stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
do_div(stamp_xsec, 1000000000);
stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
- update_gtod(clock->cycle_last, stamp_xsec, t2x);
+ update_gtod(clock->cycle_last-clock->cycle_accumulated,
+ stamp_xsec, t2x);
}
void update_vsyscall_tz(void)
--
* Steven Rostedt ([email protected]) wrote:
> From: John Stultz <[email protected]>
>
> Handle accurate time even if there's a long delay between
> accumulated clock cycles.
>
About this one... we talked a lot about the importance of timekeeping at
the first Montreal Tracing Summit this week. Actually, someone
mentioned a very interesting point: in order to be able to synchronize
traces taken from the machine with traces taken on external hardware
(e.g. a memory bus tracer on Freescale), taking the "real" counter value
rather than using the "cumulated cycles" approach (which creates a
virtual counter instead) would be better.
So I would recommend using an algorithm that would return a clock value
which is the same as the underlying hardware counter.
Mathieu
> [...]
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Thu, 2008-01-31 at 07:10 -0500, Mathieu Desnoyers wrote:
> * Steven Rostedt ([email protected]) wrote:
> > From: John Stultz <[email protected]>
> >
> > Handle accurate time even if there's a long delay between
> > accumulated clock cycles.
> >
>
> About this one... we talked a lot about the importance of timekeeping at
> the first Montreal Tracing Summit this week. Actually, someone
> mentioned a very interesting point: in order to be able to synchronize
> traces taken from the machine with traces taken on external hardware
> (e.g. a memory bus tracer on Freescale), taking the "real" counter value
> rather than using the "cumulated cycles" approach (which creates a
> virtual counter instead) would be better.
>
> So I would recommend using an algorithm that would return a clock value
> which is the same as the underlying hardware counter.
Hmm. It is an interesting issue. Clearly having the raw cycle value
match up so hardware analysis could be mapped to software timestamps
would be a useful (although obscure) feature. However, with the variety of
clocksources, dealing properly with the clocksource wrap issue (ACPI PM
for instance is a 24-bit counter at 3.579545 MHz, so it wraps about every
2^24 / 3579545 Hz ~ 4.7 seconds) also has to be addressed.
I think you were mentioning an idea that required some work on the read
side to handle the wraps, basically managing the high order bits by
hand. This sounds like it would be an additional feature that could be
added on to the infrastructure being provided in the
get_monotonic_cycles() patch. No?
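Roughly, that read-side trick would look something like this -- just a
rough, single-threaded sketch in plain C with made-up names, not the
LTTng code, which also has to handle per-cpu state and concurrent
readers:

	#include <stdint.h>

	#define HW_BITS	24			/* width of the hardware counter */
	#define HW_MASK	((1ULL << HW_BITS) - 1)

	/* Raw, wrapping hardware counter read; a stand-in for the real thing. */
	extern uint64_t read_hw_counter(void);

	static uint64_t soft_high;		/* software-managed high-order bits */
	static uint64_t last_raw;		/* last raw value we saw */

	/* Must be called at least once per wrap period of the counter. */
	static uint64_t read_extended_counter(void)
	{
		uint64_t raw = read_hw_counter() & HW_MASK;

		if (raw < last_raw)		/* hardware wrapped since last call */
			soft_high += 1ULL << HW_BITS;
		last_raw = raw;

		return soft_high | raw;		/* low HW_BITS still match the hardware */
	}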
However, all of the above is a separate issue from what this (the
timekeeping over long delay) patch addresses, as it is not really
directly related to the get_monotonic_cycles() patch, but instead allows
for correct timekeeping, making update_wall_time() function properly
if it was deferred for longer than the clocksource's wrap time.
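To illustrate (made-up names, no locking; this just mirrors
clocksource_accumulate()/clocksource_get_cycles() from the patch): the
masked delta alone silently loses whole wrap periods if the update is
deferred too long, while a wider accumulator that the delta is folded
into periodically does not:

	#include <stdint.h>

	#define HW_MASK	((1ULL << 24) - 1)	/* e.g. a 24-bit ACPI-PM-style counter */

	static uint64_t cycle_last;		/* raw value at the last accumulation */
	static uint64_t cycle_accumulated;	/* wide software accumulator */

	/* Called periodically, at least once per wrap period of the counter. */
	static void accumulate(uint64_t now)
	{
		cycle_accumulated += (now - cycle_last) & HW_MASK;
		cycle_last = now;
	}

	/* Cycles elapsed since the accumulator was last drained. */
	static uint64_t get_cycles(uint64_t now)
	{
		return cycle_accumulated + ((now - cycle_last) & HW_MASK);
	}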
thanks
-john
* John Stultz ([email protected]) wrote:
>
> On Thu, 2008-01-31 at 07:10 -0500, Mathieu Desnoyers wrote:
> > [...]
>
> Hmm. It is an interesting issue. Clearly having the raw cycle value
> match up so hardware analysis could be mapped to software timestamps
> would be a useful (although obscure) feature. However, with the variety of
> clocksources, dealing properly with the clocksource wrap issue (ACPI PM
> for instance wraps about every 5 seconds) also has to be addressed.
>
> I think you were mentioning an idea that required some work on the read
> side to handle the wraps, basically managing the high order bits by
> hand. This sounds like it would be an additional feature that could be
> added on to the infrastructure being provided in the
> get_monotonic_cycles() patch. No?
>
Yup, exactly.
>
> However, all of the above is a separate issue from what this (the
> timekeeping over long delay) patch addresses, as it is not really
> directly related to the get_monotonic_cycles() patch, but instead allows
> for correct timekeeping, making update_wall_time() function properly
> if it was deferred for longer than the clocksource's wrap time.
>
I agree, that could apply on top of the monotonic cycles patch. It's
just a different way to see it: dealing with wrapping TSC bits,
returning the LSBs given by the hardware, rather than simply
accumulating time. This is what the patch I sent earlier (which I use in
LTTng) does. It currently expects 32 LSBs to be given by the hardware,
but it would be trivial to extend it to support any given number of
hardware LSBs.
Mathieu
> thanks
> -john
>
>
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Fri, 1 Feb 2008, Mathieu Desnoyers wrote:
> [...]
> I agree, that could apply on top of the monotonic cycles patch. It's
> just a different way to see it: dealing with wrapping TSC bits,
> returning the LSBs given by the hardware, rather than simply
> accumulating time. This is what the patch I sent earlier (which I use in
> LTTng) does. It currently expects 32 LSBs to be given by the hardware,
> but it would be trivial to extend it to support any given number of
> hardware LSBs.
>
So you are saying that you can trivially make it work with a clock that is,
say 24 bits? And this same code can work if we boot up with a clock with
32 bits or more?
-- Steve
* Steven Rostedt ([email protected]) wrote:
>
> [...]
> So you are saying that you can trivially make it work with a clock that is,
> say 24 bits? And this same code can work if we boot up with a clock with
> 32 bits or more?
>
> -- Steve
>
Yes, with this updated version. It supports HW clocks with varying
numbers of bits. For performance reasons I limit myself to the 32 LSBs of
the clock, even if the clock provides more bits. This module is aimed
at 32-bit architectures, because such tricks are not necessary on 64-bit
architectures, given that they provide atomic 64-bit updates.
(it's only compile-tested)
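Just to sketch what I mean about the 64-bit case -- hypothetical code,
not part of the patch below, assuming a full 32-bit hardware counter for
brevity: with atomic 64-bit loads and stores there is no need for the
double-buffer/index trick, a single per-cpu u64 updated in place is
enough.

	/* Hypothetical 64-bit-arch variant, for illustration only. */
	static DEFINE_PER_CPU(u64, synthetic_tsc_64);

	/* Called with preemption disabled (e.g. from the per-cpu update IPI). */
	static void update_synthetic_tsc_64(void)
	{
		u64 last = __get_cpu_var(synthetic_tsc_64);
		u32 tsc = ltt_get_timestamp32();	/* hardware LSBs */

		if (tsc < (u32)last)			/* hardware wrapped */
			last += 1ULL << 32;
		/* aligned 64-bit store, atomic on a 64-bit arch */
		__get_cpu_var(synthetic_tsc_64) = (last & ~0xffffffffULL) | tsc;
	}

The read side would do the same thing: load the per-cpu u64, read the
hardware LSBs, apply the same wrap check, and combine the two.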
LTTng timestamp
LTTng synthetic TSC code for timestamping. Extracts a 64-bit TSC from a
[0..32]-bit counter, kept up to date by a periodic timer interrupt. Lockless.
> do you actually use the RCU internals? or do you just reimplement an RCU
> algorithm?
>
Nope, I don't use RCU internals in this code. Preempt disable seemed
like the best way to handle this utterly short code path and I wanted
the write side to be fast enough to be called periodically. What I do is:
- Disable preemption at the read-side:
it makes sure the pointer I get will point to a data structure that
will never change while I am in the preempt-disabled code. (see *)
- I use per-cpu data to allow the read-side to be as fast as possible
(it only needs to disable preemption, does not race against other CPUs and
won't generate cache line bouncing). It also allows dealing with
unsynchronized TSCs if needed.
- Periodic write side: it's called from an IPI running on each CPU.
(*) We expect the read-side (preempt-off region) to be shorter than
the interval between IPI updates, so we can guarantee the data structure
it uses won't be modified underneath it. Since the IPI update is
launched every second or so (depending on the frequency of the counter we
are trying to extend), this is more than OK.
Changelog:
- Support [0..32] bits -> 64 bits.
I voluntarily limit the code to use at most 32 bits of the hardware clock for
performance considerations. If this is a problem, it could be changed. Also, the
algorithm is aimed at a 32-bit architecture. The code becomes much simpler on
a 64-bit arch, since we can do the updates atomically.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
init/Kconfig | 2
ltt/Kconfig | 17 ++++
ltt/Makefile | 1
ltt/ltt-timestamp.c | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 230 insertions(+)
Index: linux-2.6-lttng/ltt/ltt-timestamp.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/ltt-timestamp.c 2008-02-02 11:26:08.000000000 -0500
@@ -0,0 +1,210 @@
+/*
+ * (C) Copyright 2006,2007 -
+ * Mathieu Desnoyers ([email protected])
+ *
+ * notes : ltt-timestamp timer-based clock cannot be used for early tracing in
+ * the boot process, as it depends on timer interrupts.
+ *
+ * The timer is only on one CPU to support hotplug.
+ * We have the choice between schedule_delayed_work_on and an IPI to get each
+ * CPU to write the heartbeat. IPI has been chosen because it is considered
+ * faster than passing through the timer to get the work scheduled on all the
+ * CPUs.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/ltt.h>
+#include <linux/smp.h>
+#include <linux/sched.h> /* FIX for m68k local_irq_enable in on_each_cpu */
+
+/*
+ * Number of hardware clock bits. The higher order bits are expected to be 0.
+ * If the hardware clock source has more than 32 bits, the bits higher than the
+ * 32nd will be truncated by a cast to a 32-bit unsigned. Range: 1 - 32.
+ * (too few bits would be unrealistic though, since we depend on the timer to
+ * detect the overflows).
+ */
+#define HW_BITS 32
+
+#define HW_BITMASK ((1ULL << HW_BITS) - 1)
+#define HW_LSB(hw) ((hw) & HW_BITMASK)
+#define SW_MSB(sw) ((sw) & ~HW_BITMASK)
+
+/* Expected maximum interrupt latency in ms : 15ms, *2 for security */
+#define EXPECTED_INTERRUPT_LATENCY 30
+
+atomic_t lttng_generic_clock;
+EXPORT_SYMBOL(lttng_generic_clock);
+
+static struct timer_list stsc_timer;
+static unsigned int precalc_expire;
+
+static struct synthetic_tsc_struct {
+ union {
+ u64 val;
+ struct {
+#ifdef __BIG_ENDIAN
+ u32 msb;
+ u32 lsb;
+#else
+ u32 lsb;
+ u32 msb;
+#endif
+ } sel;
+ } tsc[2];
+ unsigned int index; /* Index of the current synth. tsc. */
+} ____cacheline_aligned synthetic_tsc[NR_CPUS];
+
+/* Called from IPI : either in interrupt or process context */
+static void ltt_update_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u32 tsc;
+
+ preempt_disable();
+ cpu_synth = &synthetic_tsc[smp_processor_id()];
+ tsc = ltt_get_timestamp32(); /* Hardware clocksource read */
+
+ if (tsc < HW_LSB(cpu_synth->tsc[cpu_synth->index].sel.lsb)) {
+ unsigned int new_index = 1 - cpu_synth->index; /* 0 <-> 1 */
+ /*
+ * Overflow
+ * Non atomic update of the non current synthetic TSC, followed
+ * by an atomic index change. There is no write concurrency,
+ * so the index read/write does not need to be atomic.
+ */
+ cpu_synth->tsc[new_index].val =
+ (SW_MSB(cpu_synth->tsc[cpu_synth->index].val)
+ | (u64)tsc) + (1ULL << HW_BITS);
+ cpu_synth->index = new_index; /* atomic change of index */
+ } else {
+ /*
+ * No overflow : We know that the only bits changed are
+ * contained in the 32 LSBs, which can be written to atomically.
+ */
+ cpu_synth->tsc[cpu_synth->index].sel.lsb =
+ SW_MSB(cpu_synth->tsc[cpu_synth->index].sel.lsb) | tsc;
+ }
+ preempt_enable();
+}
+
+/* Called from buffer switch : in _any_ context (even NMI) */
+u64 ltt_read_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 ret;
+ unsigned int index;
+ u32 tsc;
+
+ preempt_disable();
+ cpu_synth = &synthetic_tsc[smp_processor_id()];
+ index = cpu_synth->index; /* atomic read */
+ tsc = ltt_get_timestamp32(); /* Hardware clocksource read */
+
+ /* Overflow detection */
+ if (unlikely(tsc < HW_LSB(cpu_synth->tsc[index].sel.lsb)))
+ ret = (SW_MSB(cpu_synth->tsc[index].val) | (u64)tsc)
+ + (1ULL << HW_BITS);
+ else
+ ret = SW_MSB(cpu_synth->tsc[index].val) | (u64)tsc;
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_read_synthetic_tsc);
+
+static void synthetic_tsc_ipi(void *info)
+{
+ ltt_update_synthetic_tsc();
+}
+
+/* We need to be in process context to do an IPI */
+static void synthetic_tsc_work(struct work_struct *work)
+{
+ on_each_cpu(synthetic_tsc_ipi, NULL, 1, 1);
+}
+static DECLARE_WORK(stsc_work, synthetic_tsc_work);
+
+/*
+ * stsc_timer : - Timer function synchronizing synthetic TSC.
+ * @data: unused
+ *
+ * Guarantees at least 1 execution before low word of TSC wraps.
+ */
+static void stsc_timer_fct(unsigned long data)
+{
+ PREPARE_WORK(&stsc_work, synthetic_tsc_work);
+ schedule_work(&stsc_work);
+
+ mod_timer(&stsc_timer, jiffies + precalc_expire);
+}
+
+/*
+ * precalc_stsc_interval: - Precalculates the interval between the clock
+ * wraparounds.
+ */
+static int __init precalc_stsc_interval(void)
+{
+ precalc_expire =
+ (HW_BITMASK / ((ltt_frequency() / HZ * ltt_freq_scale()) << 1)
+ - 1 - (EXPECTED_INTERRUPT_LATENCY * HZ / 1000)) >> 1;
+ WARN_ON(precalc_expire == 0);
+ printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n",
+ precalc_expire);
+ return 0;
+}
+
+/*
+ * hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Sets the new CPU's current synthetic TSC to the same value as the
+ * currently running CPU.
+ *
+ * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
+ */
+static int __cpuinit hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 local_count;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ cpu_synth = &synthetic_tsc[hotcpu];
+ local_count = ltt_read_synthetic_tsc();
+ cpu_synth->tsc[0].val = local_count;
+ cpu_synth->index = 0;
+ smp_wmb(); /* Writing in data of CPU about to come up */
+ break;
+ case CPU_ONLINE:
+ /* As we are preemptible, make sure it runs on the right cpu */
+ smp_call_function_single(hotcpu, synthetic_tsc_ipi, NULL, 1, 0);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/* Called from one CPU, before any tracing starts, to init each structure */
+static int __init ltt_init_synthetic_tsc(void)
+{
+ hotcpu_notifier(hotcpu_callback, 3);
+ precalc_stsc_interval();
+ init_timer(&stsc_timer);
+ stsc_timer.function = stsc_timer_fct;
+ stsc_timer.expires = jiffies + precalc_expire;
+ add_timer(&stsc_timer);
+ return 0;
+}
+
+__initcall(ltt_init_synthetic_tsc);
Index: linux-2.6-lttng/ltt/Kconfig
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/Kconfig 2008-02-02 11:25:36.000000000 -0500
@@ -0,0 +1,17 @@
+menu "Linux Trace Toolkit"
+
+config LTT_TIMESTAMP
+ bool "LTTng fine-grained timestamping"
+ default y
+ help
+ Allow fine-grained timestamps to be taken from tracing applications.
+
+config HAVE_LTT_CLOCK
+ def_bool n
+
+config HAVE_LTT_SYNTHETIC_TSC
+ bool
+ default y if (!HAVE_LTT_CLOCK)
+ default n if HAVE_LTT_CLOCK
+
+endmenu
Index: linux-2.6-lttng/ltt/Makefile
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/ltt/Makefile 2008-02-02 11:25:36.000000000 -0500
@@ -0,0 +1 @@
+obj-$(CONFIG_HAVE_LTT_SYNTHETIC_TSC) += ltt-timestamp.o
Index: linux-2.6-lttng/init/Kconfig
===================================================================
--- linux-2.6-lttng.orig/init/Kconfig 2008-02-02 10:07:33.000000000 -0500
+++ linux-2.6-lttng/init/Kconfig 2008-02-02 10:07:36.000000000 -0500
@@ -691,6 +691,8 @@ config MARKERS
Place an empty function call at each marker site. Can be
dynamically changed for a probe function.
+source "ltt/Kconfig"
+
source "arch/Kconfig"
config DISABLE_IMMEDIATE
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68