2009-07-21 19:21:10

by Martin Schwidefsky

[permalink] [raw]
Subject: [RFC][patch 1/5] move clock source related code to clocksource.c

From: Martin Schwidefsky <[email protected]>

Move clock source related code from timekeeping.c to clocksource.c
where they belong. The selected clocks source "clock" is now defined
in clocksource.c and clocksource_init is added to set up the initial
clock.

Cc: Ingo Molnar <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: john stultz <[email protected]>
Signed-off-by: Martin Schwidefsky <[email protected]>
---
include/linux/clocksource.h | 44 ++-------
kernel/time/clocksource.c | 199 ++++++++++++++++++++++++++++++++++++++++++++
kernel/time/timekeeping.c | 161 -----------------------------------
3 files changed, 211 insertions(+), 193 deletions(-)

Index: linux-2.6/include/linux/clocksource.h
===================================================================
--- linux-2.6.orig/include/linux/clocksource.h
+++ linux-2.6/include/linux/clocksource.h
@@ -14,6 +14,7 @@
#include <linux/list.h>
#include <linux/cache.h>
#include <linux/timer.h>
+#include <linux/init.h>
#include <asm/div64.h>
#include <asm/io.h>

@@ -329,46 +330,23 @@ static inline s64 cyc2ns(struct clocksou
return ret;
}

-/**
- * clocksource_calculate_interval - Calculates a clocksource interval struct
- *
- * @c: Pointer to clocksource.
- * @length_nsec: Desired interval length in nanoseconds.
- *
- * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
- * pair and interval request.
- *
- * Unless you're the timekeeping code, you should not be using this!
- */
-static inline void clocksource_calculate_interval(struct clocksource *c,
- unsigned long length_nsec)
-{
- u64 tmp;
-
- /* Do the ns -> cycle conversion first, using original mult */
- tmp = length_nsec;
- tmp <<= c->shift;
- tmp += c->mult_orig/2;
- do_div(tmp, c->mult_orig);
-
- c->cycle_interval = (cycle_t)tmp;
- if (c->cycle_interval == 0)
- c->cycle_interval = 1;
-
- /* Go back from cycles -> shifted ns, this time use ntp adjused mult */
- c->xtime_interval = (u64)c->cycle_interval * c->mult;
- c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
-}
-
-
/* used to install a new clocksource */
+extern void __init clocksource_init(void);
extern int clocksource_register(struct clocksource*);
extern void clocksource_unregister(struct clocksource*);
extern void clocksource_touch_watchdog(void);
-extern struct clocksource* clocksource_get_next(void);
extern void clocksource_change_rating(struct clocksource *cs, int rating);
+extern void clocksource_adjust(s64 offset);
extern void clocksource_resume(void);

+#ifdef CONFIG_GENERIC_TIME
+extern void clocksource_forward_now(void);
+extern void change_clocksource(void);
+#else
+static inline void clocksource_forward_now(void) { }
+static inline void change_clocksource(void) { }
+#endif
+
#ifdef CONFIG_GENERIC_TIME_VSYSCALL
extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
extern void update_vsyscall_tz(void);
Index: linux-2.6/kernel/time/clocksource.c
===================================================================
--- linux-2.6.orig/kernel/time/clocksource.c
+++ linux-2.6/kernel/time/clocksource.c
@@ -110,6 +110,9 @@ EXPORT_SYMBOL(timecounter_cyc2time);
/* XXX - Would like a better way for initializing curr_clocksource */
extern struct clocksource clocksource_jiffies;

+/* Currently selected clock source. */
+struct clocksource *clock;
+
/*[Clocksource internal variables]---------
* curr_clocksource:
* currently selected clocksource. Initialized to clocksource_jiffies.
@@ -392,6 +395,191 @@ static int clocksource_enqueue(struct cl
}

/**
+ * clocksource_calculate_interval - Calculates a clocksource interval struct
+ *
+ * @c: Pointer to clocksource.
+ * @length_nsec: Desired interval length in nanoseconds.
+ *
+ * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
+ * pair and interval request.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static void clocksource_calculate_interval(struct clocksource *c,
+ unsigned long length_nsec)
+{
+ u64 tmp;
+
+ /* Do the ns -> cycle conversion first, using original mult */
+ tmp = length_nsec;
+ tmp <<= c->shift;
+ tmp += c->mult_orig/2;
+ do_div(tmp, c->mult_orig);
+
+ c->cycle_interval = (cycle_t)tmp;
+ if (c->cycle_interval == 0)
+ c->cycle_interval = 1;
+
+ /* Go back from cycles -> shifted ns, this time use ntp adjused mult */
+ c->xtime_interval = (u64)c->cycle_interval * c->mult;
+ c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
+}
+
+/*
+ * If the error is already larger, we look ahead even further
+ * to compensate for late or lost adjustments.
+ */
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
+{
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
+
+ /*
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adjusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+ */
+ error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
+
+ /*
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
+ */
+ tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
+ tick_error -= clock->xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) {
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
+ }
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
+
+ *interval <<= adj;
+ *offset <<= adj;
+ return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ */
+void clocksource_adjust(s64 offset)
+{
+ s64 error, interval = clock->cycle_interval;
+ int adj;
+
+ error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
+ if (error > interval) {
+ error >>= 2;
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else if (error < -interval) {
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else
+ return;
+
+ clock->mult += adj;
+ clock->xtime_interval += interval;
+ clock->xtime_nsec -= offset;
+ clock->error -= (interval - offset) <<
+ (NTP_SCALE_SHIFT - clock->shift);
+}
+
+#ifdef CONFIG_GENERIC_TIME
+/**
+ * clocksource_forward_now - update clock to the current time
+ *
+ * Forward the current clock to update its state since the last call to
+ * update_wall_time(). This is useful before significant clock changes,
+ * as it avoids having to deal with this time offset explicitly.
+ */
+void clocksource_forward_now(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ s64 nsec;
+
+ cycle_now = clocksource_read(clock);
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ clock->cycle_last = cycle_now;
+
+ nsec = cyc2ns(clock, cycle_delta);
+
+ /* If arch requires, add in gettimeoffset() */
+ nsec += arch_gettimeoffset();
+
+ timespec_add_ns(&xtime, nsec);
+
+ nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+ clock->raw_time.tv_nsec += nsec;
+}
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+void change_clocksource(void)
+{
+ struct clocksource *new, *old;
+
+ new = clocksource_get_next();
+
+ if (clock == new)
+ return;
+
+ clocksource_forward_now();
+
+ if (clocksource_enable(new))
+ return;
+
+ new->raw_time = clock->raw_time;
+ old = clock;
+ clock = new;
+ clocksource_disable(old);
+
+ clock->cycle_last = 0;
+ clock->cycle_last = clocksource_read(clock);
+ clock->error = 0;
+ clock->xtime_nsec = 0;
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+
+ tick_clock_notify();
+
+ /*
+ * We're holding xtime lock and waking up klogd would deadlock
+ * us on enqueue. So no printing!
+ printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ clock->name);
+ */
+}
+#endif
+
+/**
* clocksource_register - Used to install new clocksources
* @t: clocksource to be registered
*
@@ -444,6 +632,17 @@ void clocksource_unregister(struct clock
spin_unlock_irqrestore(&clocksource_lock, flags);
}

+/**
+ * clocksource_init - set up initial clock source
+ */
+void __init clocksource_init(void)
+{
+ clock = clocksource_get_next();
+ clocksource_enable(clock);
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+ clock->cycle_last = clock->read(clock);
+}
+
#ifdef CONFIG_SYSFS
/**
* sysfs_show_current_clocksources - sysfs interface for current clocksource
Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -56,38 +56,9 @@ void update_xtime_cache(u64 nsec)
timespec_add_ns(&xtime_cache, nsec);
}

-struct clocksource *clock;
-

#ifdef CONFIG_GENERIC_TIME
/**
- * clocksource_forward_now - update clock to the current time
- *
- * Forward the current clock to update its state since the last call to
- * update_wall_time(). This is useful before significant clock changes,
- * as it avoids having to deal with this time offset explicitly.
- */
-static void clocksource_forward_now(void)
-{
- cycle_t cycle_now, cycle_delta;
- s64 nsec;
-
- cycle_now = clocksource_read(clock);
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- clock->cycle_last = cycle_now;
-
- nsec = cyc2ns(clock, cycle_delta);
-
- /* If arch requires, add in gettimeoffset() */
- nsec += arch_gettimeoffset();
-
- timespec_add_ns(&xtime, nsec);
-
- nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
- clock->raw_time.tv_nsec += nsec;
-}
-
-/**
* getnstimeofday - Returns the time of day in a timespec
* @ts: pointer to the timespec to be set
*
@@ -251,48 +222,7 @@ int do_settimeofday(struct timespec *tv)

EXPORT_SYMBOL(do_settimeofday);

-/**
- * change_clocksource - Swaps clocksources if a new one is available
- *
- * Accumulates current time interval and initializes new clocksource
- */
-static void change_clocksource(void)
-{
- struct clocksource *new, *old;
-
- new = clocksource_get_next();
-
- if (clock == new)
- return;
-
- clocksource_forward_now();
-
- if (clocksource_enable(new))
- return;
-
- new->raw_time = clock->raw_time;
- old = clock;
- clock = new;
- clocksource_disable(old);
-
- clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-
- tick_clock_notify();
-
- /*
- * We're holding xtime lock and waking up klogd would deadlock
- * us on enqueue. So no printing!
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
- */
-}
#else /* GENERIC_TIME */
-static inline void clocksource_forward_now(void) { }
-static inline void change_clocksource(void) { }

/**
* ktime_get - get the monotonic time in ktime_t format
@@ -426,11 +356,7 @@ void __init timekeeping_init(void)
write_seqlock_irqsave(&xtime_lock, flags);

ntp_init();
-
- clock = clocksource_get_next();
- clocksource_enable(clock);
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clocksource_read(clock);
+ clocksource_init();

xtime.tv_sec = sec;
xtime.tv_nsec = 0;
@@ -524,91 +450,6 @@ static int __init timekeeping_init_devic

device_initcall(timekeeping_init_device);

-/*
- * If the error is already larger, we look ahead even further
- * to compensate for late or lost adjustments.
- */
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
- s64 *offset)
-{
- s64 tick_error, i;
- u32 look_ahead, adj;
- s32 error2, mult;
-
- /*
- * Use the current error value to determine how much to look ahead.
- * The larger the error the slower we adjust for it to avoid problems
- * with losing too many ticks, otherwise we would overadjust and
- * produce an even larger error. The smaller the adjustment the
- * faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adjusted
- * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
- */
- error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
- error2 = abs(error2);
- for (look_ahead = 0; error2 > 0; look_ahead++)
- error2 >>= 2;
-
- /*
- * Now calculate the error in (1 << look_ahead) ticks, but first
- * remove the single look ahead already included in the error.
- */
- tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
- tick_error -= clock->xtime_interval >> 1;
- error = ((error - tick_error) >> look_ahead) + tick_error;
-
- /* Finally calculate the adjustment shift value. */
- i = *interval;
- mult = 1;
- if (error < 0) {
- error = -error;
- *interval = -*interval;
- *offset = -*offset;
- mult = -1;
- }
- for (adj = 0; error > i; adj++)
- error >>= 1;
-
- *interval <<= adj;
- *offset <<= adj;
- return mult << adj;
-}
-
-/*
- * Adjust the multiplier to reduce the error value,
- * this is optimized for the most common adjustments of -1,0,1,
- * for other values we can do a bit more work.
- */
-static void clocksource_adjust(s64 offset)
-{
- s64 error, interval = clock->cycle_interval;
- int adj;
-
- error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
- if (error > interval) {
- error >>= 2;
- if (likely(error <= interval))
- adj = 1;
- else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else if (error < -interval) {
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else
- return;
-
- clock->mult += adj;
- clock->xtime_interval += interval;
- clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) <<
- (NTP_SCALE_SHIFT - clock->shift);
-}
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.


2009-07-21 19:51:05

by Daniel Walker

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Tue, 2009-07-21 at 21:17 +0200, Martin Schwidefsky wrote:
> plain text document attachment (clocksource-move.diff)
> From: Martin Schwidefsky <[email protected]>
>
> Move clock source related code from timekeeping.c to clocksource.c
> where they belong. The selected clocks source "clock" is now defined
> in clocksource.c and clocksource_init is added to set up the initial
> clock.

The problem is most (if not all) the code your moving is actually time
keeping code .. The reason it seems like clocksource code is cause John
wasn't very choosy about which structure he added variables too .. So
really this clean up needs to be in reverse, remove all the timekeeping
code from the clocksource code.

Daniel

2009-07-21 21:55:45

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Tue, 21 Jul 2009 12:50:51 -0700
Daniel Walker <[email protected]> wrote:

> On Tue, 2009-07-21 at 21:17 +0200, Martin Schwidefsky wrote:
> > plain text document attachment (clocksource-move.diff)
> > From: Martin Schwidefsky <[email protected]>
> >
> > Move clock source related code from timekeeping.c to clocksource.c
> > where they belong. The selected clocks source "clock" is now defined
> > in clocksource.c and clocksource_init is added to set up the initial
> > clock.
>
> The problem is most (if not all) the code your moving is actually time
> keeping code .. The reason it seems like clocksource code is cause John
> wasn't very choosy about which structure he added variables too .. So
> really this clean up needs to be in reverse, remove all the timekeeping
> code from the clocksource code.

You are refering to the following variables, aren't you?

cycle_t cycle_interval;
u64 xtime_interval;
u32 raw_interval;
u64 xtime_nsec;
s64 error;
struct timespec raw_time;

There is a dependency between these values and the clocksource. But they
are only needed for the current active clock, so we could move them out
of the clocksource data structure and make them static variables in
timekeeping.c. I guess it would improve the code separation for
update_wall_time as well. Hmm, worth a try.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2009-07-21 22:10:07

by john stultz

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Tue, 2009-07-21 at 12:50 -0700, Daniel Walker wrote:
> On Tue, 2009-07-21 at 21:17 +0200, Martin Schwidefsky wrote:
> > plain text document attachment (clocksource-move.diff)
> > From: Martin Schwidefsky <[email protected]>
> >
> > Move clock source related code from timekeeping.c to clocksource.c
> > where they belong. The selected clocks source "clock" is now defined
> > in clocksource.c and clocksource_init is added to set up the initial
> > clock.
>
> The problem is most (if not all) the code your moving is actually time
> keeping code .. The reason it seems like clocksource code is cause John
> wasn't very choosy about which structure he added variables too .. So
> really this clean up needs to be in reverse, remove all the timekeeping
> code from the clocksource code.

Not so much that I wasn't very choosy, but that I had to pick my battles
there. At the time, Roman claimed keeping the timekeeping values in the
clocksource (instead of global to timekeeping.c) actually produced
better code.

I do agree with Daniel's main point, that the patch mixes the layers I
tried to establish in the design.

Clocksource: Abstracts out a hardware counter.
NTP: Provides the reference time.
Timekeeping: Manages accumulating the clocksource, and combining input
from ntp's reference time to steer the hardware frequency.

Unfortunately, many timekeeping values got stuffed into the struct
clocksource. I've had plans to try to clean this up and utilize Patrick
Ohly's simpler clockcounter struct as a basis for a clocksource, nesting
the structures somewhat to look something like:


/* minimal structure only giving hardware info and access methods */
struct cyclecounter {
char *name;
cycle_t (*read)(const struct cyclecounter *cc);
cycle_t (*vread)(const struct cyclecounter *cc);
cycle_t mask;
u32 mult;
u32 shift;
};

/* more complicated structure holding timekeeping values */
struct timesource {
struct cyclecounter counter;
u32 corrected_mult;
cycle_t cycle_interval;
u64 xtime_interval;
u32 raw_interval;
cycle_t cycle_last;
u64 xtime_nsec;
s64 error; /* probably should be ntp_error */
...
}

However such a change would be quite a bit of churn to much of the
timekeeping code, and to only marginal benefit. So I've put it off.


Martin, I've not been able to review your changes in extreme detail, but
I'm curious what the motivation for the drastic code rearrangement was?

I see you pushing a fair amount of code down a level, for instance,
except for the locking, getmonotonicraw() basically gets pushed down to
clocksource_read_raw(). The ktime_get/ktime_get_ts/getnstimeofday do
reduce some duplicate code, but that could still be minimized without
pushing stuff down to the clocksource level.

Is there any streamlining performance benefit from the way you moved
things around?

thanks
-john

2009-07-22 07:25:27

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Tue, 21 Jul 2009 15:00:07 -0700
john stultz <[email protected]> wrote:

> On Tue, 2009-07-21 at 12:50 -0700, Daniel Walker wrote:
> > On Tue, 2009-07-21 at 21:17 +0200, Martin Schwidefsky wrote:
> > > plain text document attachment (clocksource-move.diff)
> > > From: Martin Schwidefsky <[email protected]>
> > >
> > > Move clock source related code from timekeeping.c to clocksource.c
> > > where they belong. The selected clocks source "clock" is now defined
> > > in clocksource.c and clocksource_init is added to set up the initial
> > > clock.
> >
> > The problem is most (if not all) the code your moving is actually time
> > keeping code .. The reason it seems like clocksource code is cause John
> > wasn't very choosy about which structure he added variables too .. So
> > really this clean up needs to be in reverse, remove all the timekeeping
> > code from the clocksource code.
>
> Not so much that I wasn't very choosy, but that I had to pick my battles
> there. At the time, Roman claimed keeping the timekeeping values in the
> clocksource (instead of global to timekeeping.c) actually produced
> better code.

There are only a couple of functions where the performance is
important, these are the ktime_get functions (and variants) and probably
the update_wall_time function.

> I do agree with Daniel's main point, that the patch mixes the layers I
> tried to establish in the design.
>
> Clocksource: Abstracts out a hardware counter.
> NTP: Provides the reference time.
> Timekeeping: Manages accumulating the clocksource, and combining input
> from ntp's reference time to steer the hardware frequency.

Imho what makes the code hard to understand is that the internals of
the clocksource have leaked into the timekeeping code. I'm getting at
the cycle, mult and shift values here. The code would be much easier to
understand if the clocksource would just return nanoseconds. The bad
thing here is that we would loose some bits of precision.

> Unfortunately, many timekeeping values got stuffed into the struct
> clocksource. I've had plans to try to clean this up and utilize Patrick
> Ohly's simpler clockcounter struct as a basis for a clocksource, nesting
> the structures somewhat to look something like:
>
>
> /* minimal structure only giving hardware info and access methods */
> struct cyclecounter {
> char *name;
> cycle_t (*read)(const struct cyclecounter *cc);
> cycle_t (*vread)(const struct cyclecounter *cc);
> cycle_t mask;
> u32 mult;
> u32 shift;
> };
>
> /* more complicated structure holding timekeeping values */
> struct timesource {
> struct cyclecounter counter;
> u32 corrected_mult;
> cycle_t cycle_interval;
> u64 xtime_interval;
> u32 raw_interval;
> cycle_t cycle_last;
> u64 xtime_nsec;
> s64 error; /* probably should be ntp_error */
> ...
> }
>
> However such a change would be quite a bit of churn to much of the
> timekeeping code, and to only marginal benefit. So I've put it off.

That would be an improvement, but there are still these pesky cycles in
the timesource.

> Martin, I've not been able to review your changes in extreme detail, but
> I'm curious what the motivation for the drastic code rearrangement was?

It started of with a minor performance optimization, I wanted to get
rid of the change_clocksource call every tick. When I looked at the
code to understand it I started to move things around.

> I see you pushing a fair amount of code down a level, for instance,
> except for the locking, getmonotonicraw() basically gets pushed down to
> clocksource_read_raw(). The ktime_get/ktime_get_ts/getnstimeofday do
> reduce some duplicate code, but that could still be minimized without
> pushing stuff down to the clocksource level.

The background here is that I want to isolate the use ofthe cycles, mult
and shift values to clocksource.[ch]

> Is there any streamlining performance benefit from the way you moved
> things around?

The biggest gain is the ktime_get optimization that is already in the
tip tree. The call to change_clocksource showed up in the instructions
trace for a cpu wake up and I started to wonder if this is necessary.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2009-07-22 17:45:40

by john stultz

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Wed, 2009-07-22 at 09:25 +0200, Martin Schwidefsky wrote:
> On Tue, 21 Jul 2009 15:00:07 -0700
> john stultz <[email protected]> wrote:
>
> > On Tue, 2009-07-21 at 12:50 -0700, Daniel Walker wrote:
> > > On Tue, 2009-07-21 at 21:17 +0200, Martin Schwidefsky wrote:
> > > > plain text document attachment (clocksource-move.diff)
> > > > From: Martin Schwidefsky <[email protected]>
> > > >
> > > > Move clock source related code from timekeeping.c to clocksource.c
> > > > where they belong. The selected clocks source "clock" is now defined
> > > > in clocksource.c and clocksource_init is added to set up the initial
> > > > clock.
> > >
> > > The problem is most (if not all) the code your moving is actually time
> > > keeping code .. The reason it seems like clocksource code is cause John
> > > wasn't very choosy about which structure he added variables too .. So
> > > really this clean up needs to be in reverse, remove all the timekeeping
> > > code from the clocksource code.
> >
> > Not so much that I wasn't very choosy, but that I had to pick my battles
> > there. At the time, Roman claimed keeping the timekeeping values in the
> > clocksource (instead of global to timekeeping.c) actually produced
> > better code.
>
> There are only a couple of functions where the performance is
> important, these are the ktime_get functions (and variants) and probably
> the update_wall_time function.
>
> > I do agree with Daniel's main point, that the patch mixes the layers I
> > tried to establish in the design.
> >
> > Clocksource: Abstracts out a hardware counter.
> > NTP: Provides the reference time.
> > Timekeeping: Manages accumulating the clocksource, and combining input
> > from ntp's reference time to steer the hardware frequency.
>
> Imho what makes the code hard to understand is that the internals of
> the clocksource have leaked into the timekeeping code. I'm getting at
> the cycle, mult and shift values here. The code would be much easier to
> understand if the clocksource would just return nanoseconds. The bad
> thing here is that we would loose some bits of precision.

While I completely agree the code is hard to understand, I really don't
think that pushing that down to clocksource.c will improve things.

As much as you'd prefer it not, I feel the timekeeping code has to deal
with cycles. The consistent translation and accumulation of clocksource
cycles into nanoseconds is what timekeeping.c is all about.

We already have interfaces that return nanoseconds, they're
gensttimeofday, ktime_get, ktime_get_ts.



> > Unfortunately, many timekeeping values got stuffed into the struct
> > clocksource. I've had plans to try to clean this up and utilize Patrick
> > Ohly's simpler clockcounter struct as a basis for a clocksource, nesting
> > the structures somewhat to look something like:
> >
> >
> > /* minimal structure only giving hardware info and access methods */
> > struct cyclecounter {
> > char *name;
> > cycle_t (*read)(const struct cyclecounter *cc);
> > cycle_t (*vread)(const struct cyclecounter *cc);
> > cycle_t mask;
> > u32 mult;
> > u32 shift;
> > };
> >
> > /* more complicated structure holding timekeeping values */
> > struct timesource {
> > struct cyclecounter counter;
> > u32 corrected_mult;
> > cycle_t cycle_interval;
> > u64 xtime_interval;
> > u32 raw_interval;
> > cycle_t cycle_last;
> > u64 xtime_nsec;
> > s64 error; /* probably should be ntp_error */
> > ...
> > }
> >
> > However such a change would be quite a bit of churn to much of the
> > timekeeping code, and to only marginal benefit. So I've put it off.
>
> That would be an improvement, but there are still these pesky cycles in
> the timesource.

Again, I think there has to be. Since some portion of the current time
is unaccumulated, it is inherently cycles based. The timekeeping core
has to decide when to accumulate those cycles into nanoseconds and store
them into xtime. In order to do that, the timekeeping code has to have
an idea of where the cycle_last value is. Further, for improved
precision, and ntp steering, we use the *_interval values to accumulate
in chunks.


> > Martin, I've not been able to review your changes in extreme detail, but
> > I'm curious what the motivation for the drastic code rearrangement was?
>
> It started of with a minor performance optimization, I wanted to get
> rid of the change_clocksource call every tick. When I looked at the
> code to understand it I started to move things around.
>
> > I see you pushing a fair amount of code down a level, for instance,
> > except for the locking, getmonotonicraw() basically gets pushed down to
> > clocksource_read_raw(). The ktime_get/ktime_get_ts/getnstimeofday do
> > reduce some duplicate code, but that could still be minimized without
> > pushing stuff down to the clocksource level.
>
> The background here is that I want to isolate the use ofthe cycles, mult
> and shift values to clocksource.[ch]

Again I do completely agree the code needs to be cleaned up.
Unfortunately there's still a split between the GENERIC_TIME and non
GENERIC_TIME arches that keeps us from making some cleanups right now.
I'm trying to get this all unified (see my arch_gettimeoffset patches),
but until we get all the arches moved over, there's some unfortunate
uglys we can't get rid of.


If I can find some cycles today, I'll try to take a rough swing at some
of the cleanup I mentioned earlier. Probably won't build, but will maybe
give you an idea of the direction I'm thinking about, and then you can
let me know where you feel its still too complex. Maybe then we can meet
in the middle?

thanks
-john

2009-07-23 00:29:12

by john stultz

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Wed, 2009-07-22 at 10:45 -0700, john stultz wrote:
> On Wed, 2009-07-22 at 09:25 +0200, Martin Schwidefsky wrote:
> > On Tue, 21 Jul 2009 15:00:07 -0700
> > john stultz <[email protected]> wrote:
> > > Unfortunately, many timekeeping values got stuffed into the struct
> > > clocksource. I've had plans to try to clean this up and utilize Patrick
> > > Ohly's simpler clockcounter struct as a basis for a clocksource, nesting
> > > the structures somewhat to look something like:
> > >
> > >
> > > /* minimal structure only giving hardware info and access methods */
> > > struct cyclecounter {
> > > char *name;
> > > cycle_t (*read)(const struct cyclecounter *cc);
> > > cycle_t (*vread)(const struct cyclecounter *cc);
> > > cycle_t mask;
> > > u32 mult;
> > > u32 shift;
> > > };
> > >
> > > /* more complicated structure holding timekeeping values */
> > > struct timesource {
> > > struct cyclecounter counter;
> > > u32 corrected_mult;
> > > cycle_t cycle_interval;
> > > u64 xtime_interval;
> > > u32 raw_interval;
> > > cycle_t cycle_last;
> > > u64 xtime_nsec;
> > > s64 error; /* probably should be ntp_error */
> > > ...
> > > }
> > >
> > > However such a change would be quite a bit of churn to much of the
> > > timekeeping code, and to only marginal benefit. So I've put it off.
> >
[snip]
> If I can find some cycles today, I'll try to take a rough swing at some
> of the cleanup I mentioned earlier. Probably won't build, but will maybe
> give you an idea of the direction I'm thinking about, and then you can
> let me know where you feel its still too complex. Maybe then we can meet
> in the middle?

Hey Martin,
So here's a really quick swipe at breaking apart the clocksource struct
into a clocksource only portion and a timekeeping portion.

Caveats:
1) This doesn't completely build. The core bits do, but there's still a
few left-over issues (see following caveats). Its just here to give you
an idea of what I'm thinking about. I'd of course break it up into more
manageable chunks before submitting it.

2) Structure names aren't too great right now. Not sure timeclock is
what I want to use, probably system_time or something. Will find/replace
before the next revision is sent out.

3) I still need to unify the clocksource and cyclecounter structures, as
they're basically redundant now.

4) I still need to fix the update_vsyscall code (shouldn't be hard, I
didn't want to run through arch code yet).

5) The TSC clocksource uses cycles_last to avoid very slight skew issues
(that otherwise would not be noticed). Not sure how to fix that if we're
pulling cycles_last (which is purely timekeeping state) out of the
clocksource. Will have to think of something.


Other cleanups still out there in the distant future:
1) Once all arches are converted to GENERIC_TIME, we can remove the
ifdefs, and cleanup a lot of the more complicated xtime struct
manipulation. It will cleanup update_wall_time() nicely.

2) I have a logarithmic accumulation patch to update_wall_time that will
remove the need for xtime_cache to be managed and updated. Just have to
spend some additional time making sure its bugfree.

3) Once all arches are converted to using read_persistent_clock(), then
the arch specific time initialization can be dropped. Removing the
majority of direct xtime structure accesses.

4) Then once the remaining direct wall_to_monotonic and xtime accessors
are moved to timekeeping.c we can make those both static and embed them
into the core timekeeping structure.


But let me know if this patch doesn't achieve most of the cleanup you
wanted to see.

thanks
-john


DOES NOT BUILD! FOR REVIEW PURPOSES ONLY!
Signed-off-by: John Stultz <[email protected]>
---
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index c56457c..3ad14b5 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -154,8 +154,6 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
* @flags: flags describing special properties
* @vread: vsyscall based read
* @resume: resume function for the clocksource, if necessary
- * @cycle_interval: Used internally by timekeeping core, please ignore.
- * @xtime_interval: Used internally by timekeeping core, please ignore.
*/
struct clocksource {
/*
@@ -169,7 +167,6 @@ struct clocksource {
void (*disable)(struct clocksource *cs);
cycle_t mask;
u32 mult;
- u32 mult_orig;
u32 shift;
unsigned long flags;
cycle_t (*vread)(void);
@@ -181,19 +178,6 @@ struct clocksource {
#define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0)
#endif

- /* timekeeping specific data, ignore */
- cycle_t cycle_interval;
- u64 xtime_interval;
- u32 raw_interval;
- /*
- * Second part is written at each timer interrupt
- * Keep it in a different cache line to dirty no
- * more than one cache line.
- */
- cycle_t cycle_last ____cacheline_aligned_in_smp;
- u64 xtime_nsec;
- s64 error;
- struct timespec raw_time;

#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
/* Watchdog related data, used by the framework */
@@ -202,7 +186,6 @@ struct clocksource {
#endif
};

-extern struct clocksource *clock; /* current clocksource */

/*
* Clock source flags bits::
@@ -267,99 +250,21 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
return (u32)tmp;
}

-/**
- * clocksource_read: - Access the clocksource's current cycle value
- * @cs: pointer to clocksource being read
- *
- * Uses the clocksource to return the current cycle_t value
- */
-static inline cycle_t clocksource_read(struct clocksource *cs)
-{
- return cs->read(cs);
-}

/**
- * clocksource_enable: - enable clocksource
- * @cs: pointer to clocksource
+ * clocksource_cyc2ns - converts clocksource cycles to nanoseconds
*
- * Enables the specified clocksource. The clocksource callback
- * function should start up the hardware and setup mult and field
- * members of struct clocksource to reflect hardware capabilities.
- */
-static inline int clocksource_enable(struct clocksource *cs)
-{
- int ret = 0;
-
- if (cs->enable)
- ret = cs->enable(cs);
-
- /* save mult_orig on enable */
- cs->mult_orig = cs->mult;
-
- return ret;
-}
-
-/**
- * clocksource_disable: - disable clocksource
- * @cs: pointer to clocksource
- *
- * Disables the specified clocksource. The clocksource callback
- * function should power down the now unused hardware block to
- * save power.
- */
-static inline void clocksource_disable(struct clocksource *cs)
-{
- if (cs->disable)
- cs->disable(cs);
-}
-
-/**
- * cyc2ns - converts clocksource cycles to nanoseconds
- * @cs: Pointer to clocksource
- * @cycles: Cycles
- *
- * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds.
+ * Converts cycles to nanoseconds, using the given mult and shift.
*
* XXX - This could use some mult_lxl_ll() asm optimization
*/
-static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
+static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
u64 ret = (u64)cycles;
- ret = (ret * cs->mult) >> cs->shift;
+ ret = (ret * mult) >> shift;
return ret;
}

-/**
- * clocksource_calculate_interval - Calculates a clocksource interval struct
- *
- * @c: Pointer to clocksource.
- * @length_nsec: Desired interval length in nanoseconds.
- *
- * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
- * pair and interval request.
- *
- * Unless you're the timekeeping code, you should not be using this!
- */
-static inline void clocksource_calculate_interval(struct clocksource *c,
- unsigned long length_nsec)
-{
- u64 tmp;
-
- /* Do the ns -> cycle conversion first, using original mult */
- tmp = length_nsec;
- tmp <<= c->shift;
- tmp += c->mult_orig/2;
- do_div(tmp, c->mult_orig);
-
- c->cycle_interval = (cycle_t)tmp;
- if (c->cycle_interval == 0)
- c->cycle_interval = 1;
-
- /* Go back from cycles -> shifted ns, this time use ntp adjused mult */
- c->xtime_interval = (u64)c->cycle_interval * c->mult;
- c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
-}
-

/* used to install a new clocksource */
extern int clocksource_register(struct clocksource*);
diff --git a/include/linux/time.h b/include/linux/time.h
index ea16c1a..65e94a6 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -147,6 +147,8 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
extern int timekeeping_valid_for_hres(void);
extern void update_wall_time(void);
extern void update_xtime_cache(u64 nsec);
+extern void timekeeping_leap_insert(int leapsecond);
+

struct tms;
extern void do_sys_times(struct tms *);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb8..13db0a8 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -182,7 +182,8 @@ static void clocksource_watchdog(unsigned long data)
resumed = test_and_clear_bit(0, &watchdog_resumed);

wdnow = watchdog->read(watchdog);
- wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+ wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
+ watchdog->mult, watchdog->shift);
watchdog_last = wdnow;

list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
@@ -209,7 +210,7 @@ static void clocksource_watchdog(unsigned long data)
cs->flags |= CLOCK_SOURCE_WATCHDOG;
cs->wd_last = csnow;
} else {
- cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
+ cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & cs->mask, cs->mult, cs->shift);
cs->wd_last = csnow;
/* Check the delta. Might remove from the list ! */
clocksource_ratewd(cs, cs_nsec - wd_nsec);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30..e26185a 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
.read = jiffies_read,
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
- .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
.shift = JIFFIES_SHIFT,
};

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc6437..5fabaf3 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- secs = xtime.tv_sec - time_reftime;
+ secs = get_seconds() - time_reftime;
if (unlikely(time_status & STA_FREQHOLD))
secs = 0;

- time_reftime = xtime.tv_sec;
+ time_reftime = get_seconds();

offset64 = offset;
freq_adj = (offset64 * secs) <<
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
case TIME_OK:
break;
case TIME_INS:
- xtime.tv_sec--;
- wall_to_monotonic.tv_sec++;
+ timekeeping_leap_insert(-1);
time_state = TIME_OOP;
printk(KERN_NOTICE
"Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
res = HRTIMER_RESTART;
break;
case TIME_DEL:
- xtime.tv_sec++;
+ timekeeping_leap_insert(1);
time_tai--;
- wall_to_monotonic.tv_sec--;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,8 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
time_state = TIME_OK;
break;
}
- update_vsyscall(&xtime, clock);
-
write_sequnlock(&xtime_lock);

return res;
@@ -371,7 +367,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
* reference time to current time.
*/
if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
- time_reftime = xtime.tv_sec;
+ time_reftime = get_seconds();

/* only set allowed bits */
time_status &= STA_RONLY;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9..f95f9e3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -44,47 +44,172 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
*/
struct timespec xtime __attribute__ ((aligned (16)));
struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-static unsigned long total_sleep_time; /* seconds */

-/* flag for if timekeeping is suspended */
-int __read_mostly timekeeping_suspended;

-static struct timespec xtime_cache __attribute__ ((aligned (16)));
+/* background stuff to document:
+ * This code is a little extra complicated as we deal with things
+ * in lots of similar but different units. We have to keep these
+ * different units as we try to do each step of managing time with
+ * the highest precision.
+ *
+ * cycles: these are simply counter cycles
+ * nsecs: these are simply nanoseconds
+ * Clock shifted nsecs: These are high precision nanoseconds,
+ * shifted by struct clocksource.shift.
+ * NTP shifted nsecs: These are high precision nanoseconds,
+ * shifted by NTP_SHIFT_SCALE
+ *
+ * One might ask, why not use one type of shifted nanosecond?
+ * The reason is, clock shifted nanoseconds are carefully managed
+ * so we don't overflow 64bits. We need higher then nanosecond
+ * precision, but NTP_SHIFT SCALE is too large and we might overflow.
+ * However, we want to keep the extra fine precision for error accounting
+ * that NTP_SHIFT_SCALE gives us, so we convert from clock shifted nsecs
+ * to NTP shifted nsecs when its safe.
+ *
+ */
+
+
+/* This structure stores all of the information used to generate or
+ * manage time.
+ */
+static struct timeclock {
+ struct clocksource * source; /* current clocksource */
+ u32 mult; /* ntp adjusted mult */
+
+ cycle_t cycle_interval; /* interval in cycles */
+ u64 xtime_interval; /* interval in clock shifted nsecs */
+ u32 raw_interval; /* raw interval in nsecs */
+
+ /*
+ * The following is written at each timer interrupt
+ * Keep it in a different cache line to dirty no
+ * more than one cache line.
+ */
+ cycle_t cycle_last ____cacheline_aligned_in_smp;
+ u64 xtime_nsec; /* clock shifted nsecs */
+ s64 error; /* ntp shifted nsecs */
+ struct timespec raw_time;
+ struct timespec xtime_cache;
+ unsigned long total_sleep_time; /* seconds */
+ int timekeeping_suspended;
+
+ /* XXX - TODO
+ * o pull in the xtime, wall_to_monotoinc, xtime_lock
+ */
+} clock;
+
+
+/* timeclock helper functions */
+static inline cycle_t timeclock_read(void)
+{
+ return clock.source->read(clock.source);
+}
+
+static inline u64 timeclock_cyc2ns(cycle_t cycles)
+{
+ /* noraml cyc2ns, use the NTP adjusted mult */
+ return clocksource_cyc2ns(cycles, clock.mult, clock.source->shift);
+}
+
+static inline u64 timeclock_cyc2ns_raw(cycle_t cycles)
+{
+ /* raw cyc2ns, use the unadjusted original clocksource mult */
+ return clocksource_cyc2ns(cycles, clock.source->mult,
+ clock.source->shift);
+}
+
+static inline u64 timeclock_getoffset(cycle_t now)
+{
+ cycle_t cycle_delta = (now - clock.cycle_last)&clock.source->mask;
+ s64 nsec = timeclock_cyc2ns(cycle_delta);
+ nsec += arch_gettimeoffset();
+ return nsec;
+}
+
+static inline u64 timeclock_getoffset_raw(cycle_t now)
+{
+ cycle_t cycle_delta = (now - clock.cycle_last)&clock.source->mask;
+ s64 nsec = timeclock_cyc2ns_raw(cycle_delta);
+ nsec += arch_gettimeoffset();
+ return nsec;
+}
+
+static inline void timeclock_calculate_interval(unsigned long length_nsec)
+{
+ u64 tmp;
+
+ /* Do the ns -> cycle conversion first, using original mult */
+ tmp = length_nsec;
+ tmp <<= clock.source->shift;
+ tmp += clock.source->mult/2;
+ do_div(tmp, clock.source->mult);
+
+ clock.cycle_interval = (cycle_t)tmp;
+ if (clock.cycle_interval == 0)
+ clock.cycle_interval = 1;
+
+ /* Go back from cycles -> shifted ns, this time use ntp adjused mult */
+ clock.xtime_interval = (u64)clock.cycle_interval * clock.mult;
+ clock.raw_interval = timeclock_cyc2ns_raw(clock.cycle_interval);
+}
+
+static inline long CLOCK2NTP_SHIFT(void)
+{
+ return NTP_SCALE_SHIFT - clock.source->shift;
+}
+
+
+/* must hold xtime_lock */
void update_xtime_cache(u64 nsec)
{
- xtime_cache = xtime;
- timespec_add_ns(&xtime_cache, nsec);
+ clock.xtime_cache = xtime;
+ timespec_add_ns(&clock.xtime_cache, nsec);
+}
+
+/* must hold xtime_lock */
+void timekeeping_leap_insert(int leapsecond)
+{
+ xtime.tv_sec += leapsecond;
+ wall_to_monotonic.tv_sec -= leapsecond;
+ update_vsyscall(&xtime, clock.source);
}

-struct clocksource *clock;

+static inline void xtime_nsec_add(s64 snsec)
+{
+ clock.xtime_nsec += snsec;
+ if (clock.xtime_nsec >=
+ (u64)NSEC_PER_SEC << clock.source->shift) {
+ clock.xtime_nsec -=
+ (u64)NSEC_PER_SEC << clock.source->shift;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+}

#ifdef CONFIG_GENERIC_TIME
/**
- * clocksource_forward_now - update clock to the current time
+ * timeclock_forward_now - update clock to the current time
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void clocksource_forward_now(void)
+static void timeclock_forward_now(void)
{
- cycle_t cycle_now, cycle_delta;
+ cycle_t cycle_now;
s64 nsec;

- cycle_now = clocksource_read(clock);
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- clock->cycle_last = cycle_now;
-
- nsec = cyc2ns(clock, cycle_delta);
-
- /* If arch requires, add in gettimeoffset() */
- nsec += arch_gettimeoffset();
+ cycle_now = timeclock_read();

+ nsec = timeclock_getoffset(cycle_now);
timespec_add_ns(&xtime, nsec);

- nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
- clock->raw_time.tv_nsec += nsec;
+ nsec = timeclock_getoffset_raw(cycle_now);
+ timespec_add_ns(&clock.raw_time, nsec);
+
+ clock.cycle_last = cycle_now;
}

/**
@@ -95,28 +220,15 @@ static void clocksource_forward_now(void)
*/
void getnstimeofday(struct timespec *ts)
{
- cycle_t cycle_now, cycle_delta;
unsigned long seq;
s64 nsecs;

- WARN_ON(timekeeping_suspended);
-
+ WARN_ON(clock.timekeeping_suspended);
do {
seq = read_seqbegin(&xtime_lock);

*ts = xtime;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
- nsecs = cyc2ns(clock, cycle_delta);
-
- /* If arch requires, add in gettimeoffset() */
- nsecs += arch_gettimeoffset();
+ nsecs = timeclock_getoffset(timeclock_read());

} while (read_seqretry(&xtime_lock, seq));

@@ -157,7 +269,7 @@ int do_settimeofday(struct timespec *tv)

write_seqlock_irqsave(&xtime_lock, flags);

- clocksource_forward_now();
+ timeclock_forward_now();

ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +279,10 @@ int do_settimeofday(struct timespec *tv)

update_xtime_cache(0);

- clock->error = 0;
+ clock.error = 0;
ntp_clear();

- update_vsyscall(&xtime, clock);
+ update_vsyscall(&xtime, clock.source);

write_sequnlock_irqrestore(&xtime_lock, flags);

@@ -193,36 +305,30 @@ static void change_clocksource(void)

new = clocksource_get_next();

- if (clock == new)
+ if (clock.source == new)
return;

- clocksource_forward_now();
+ timeclock_forward_now();

- if (clocksource_enable(new))
+ if (new->enable(new))
return;

- new->raw_time = clock->raw_time;
- old = clock;
- clock = new;
- clocksource_disable(old);
+ old = clock.source;
+ clock.source = new;
+ old->disable(old);

- clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+ /* XXX - ugh.. TSC clocksource uses cycle_last... sort this out*/
+ clock.cycle_last = 0;
+ clock.cycle_last = timeclock_read();
+ clock.mult = clock.source->mult;
+ clock.error = 0;
+ clock.xtime_nsec = 0;
+ timeclock_calculate_interval(NTP_INTERVAL_LENGTH);

tick_clock_notify();
-
- /*
- * We're holding xtime lock and waking up klogd would deadlock
- * us on enqueue. So no printing!
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
- */
}
#else
-static inline void clocksource_forward_now(void) { }
+static inline void timeclock_forward_now(void) { }
static inline void change_clocksource(void) { }
#endif

@@ -236,21 +342,12 @@ void getrawmonotonic(struct timespec *ts)
{
unsigned long seq;
s64 nsecs;
- cycle_t cycle_now, cycle_delta;

do {
seq = read_seqbegin(&xtime_lock);

- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
- nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
-
- *ts = clock->raw_time;
+ nsecs = timeclock_getoffset_raw(timeclock_read());
+ *ts = clock.raw_time;

} while (read_seqretry(&xtime_lock, seq));

@@ -260,6 +357,55 @@ EXPORT_SYMBOL(getrawmonotonic);


/**
+ * getboottime - Return the real time of system boot.
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ *
+ * This is based on the wall_to_monotonic offset and the total suspend
+ * time. Calls to settimeofday will affect the value returned (which
+ * basically means that however wrong your real time clock is at boot time,
+ * you get the right time here).
+ */
+void getboottime(struct timespec *ts)
+{
+ set_normalized_timespec(ts,
+ - (wall_to_monotonic.tv_sec + clock.total_sleep_time),
+ - wall_to_monotonic.tv_nsec);
+}
+
+/**
+ * monotonic_to_bootbased - Convert the monotonic time to boot based.
+ * @ts: pointer to the timespec to be converted
+ */
+void monotonic_to_bootbased(struct timespec *ts)
+{
+ ts->tv_sec += clock.total_sleep_time;
+}
+
+unsigned long get_seconds(void)
+{
+ return clock.xtime_cache.tv_sec;
+}
+EXPORT_SYMBOL(get_seconds);
+
+
+struct timespec current_kernel_time(void)
+{
+ struct timespec now;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ now = clock.xtime_cache;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return now;
+}
+EXPORT_SYMBOL(current_kernel_time);
+
+/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
int timekeeping_valid_for_hres(void)
@@ -270,7 +416,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqbegin(&xtime_lock);

- ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = clock.source->flags & CLOCK_SOURCE_VALID_FOR_HRES;

} while (read_seqretry(&xtime_lock, seq));

@@ -303,17 +449,18 @@ void __init timekeeping_init(void)

ntp_init();

- clock = clocksource_get_next();
- clocksource_enable(clock);
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clocksource_read(clock);
+ clock.source = clocksource_get_next();
+ clock.source->enable(clock.source);
+ timeclock_calculate_interval(NTP_INTERVAL_LENGTH);
+ clock.mult = clock.source->mult;
+ clock.cycle_last = timeclock_read();

xtime.tv_sec = sec;
xtime.tv_nsec = 0;
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
update_xtime_cache(0);
- total_sleep_time = 0;
+ clock.total_sleep_time = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
}

@@ -342,14 +489,15 @@ static int timekeeping_resume(struct sys_device *dev)

xtime.tv_sec += sleep_length;
wall_to_monotonic.tv_sec -= sleep_length;
- total_sleep_time += sleep_length;
+ clock.total_sleep_time += sleep_length;
}
update_xtime_cache(0);
/* re-base the last cycle value */
- clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
- timekeeping_suspended = 0;
+ /* XXX - TSC bug here */
+ clock.cycle_last = 0;
+ clock.cycle_last = timeclock_read();
+ clock.error = 0;
+ clock.timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);

touch_softlockup_watchdog();
@@ -369,8 +517,8 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
timekeeping_suspend_time = read_persistent_clock();

write_seqlock_irqsave(&xtime_lock, flags);
- clocksource_forward_now();
- timekeeping_suspended = 1;
+ timeclock_forward_now();
+ clock.timekeeping_suspended = 1;
write_sequnlock_irqrestore(&xtime_lock, flags);

clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -404,7 +552,7 @@ device_initcall(timekeeping_init_device);
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+static __always_inline int timeclock_bigadjust(s64 error, s64 *interval,
s64 *offset)
{
s64 tick_error, i;
@@ -420,7 +568,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* here. This is tuned so that an error of about 1 msec is adjusted
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
- error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = clock.error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
error2 = abs(error2);
for (look_ahead = 0; error2 > 0; look_ahead++)
error2 >>= 2;
@@ -429,8 +577,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
- tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
- tick_error -= clock->xtime_interval >> 1;
+ tick_error = tick_length >> (CLOCK2NTP_SHIFT() + 1);
+ tick_error -= clock.xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;

/* Finally calculate the adjustment shift value. */
@@ -455,18 +603,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* this is optimized for the most common adjustments of -1,0,1,
* for other values we can do a bit more work.
*/
-static void clocksource_adjust(s64 offset)
+static void timeclock_adjust(s64 offset)
{
- s64 error, interval = clock->cycle_interval;
+ s64 error, interval = clock.cycle_interval;
int adj;

- error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
+ error = clock.error >> (CLOCK2NTP_SHIFT() - 1);
if (error > interval) {
error >>= 2;
if (likely(error <= interval))
adj = 1;
else
- adj = clocksource_bigadjust(error, &interval, &offset);
+ adj = timeclock_bigadjust(error, &interval, &offset);
} else if (error < -interval) {
error >>= 2;
if (likely(error >= -interval)) {
@@ -474,15 +622,15 @@ static void clocksource_adjust(s64 offset)
interval = -interval;
offset = -offset;
} else
- adj = clocksource_bigadjust(error, &interval, &offset);
+ adj = timeclock_bigadjust(error, &interval, &offset);
} else
return;

- clock->mult += adj;
- clock->xtime_interval += interval;
- clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) <<
- (NTP_SCALE_SHIFT - clock->shift);
+ clock.mult += adj;
+ clock.xtime_interval += interval;
+ clock.xtime_nsec -= offset;
+ clock.error -= (interval - offset) << CLOCK2NTP_SHIFT();
+
}

/**
@@ -495,44 +643,36 @@ void update_wall_time(void)
cycle_t offset;

/* Make sure we're fully resumed: */
- if (unlikely(timekeeping_suspended))
+ if (unlikely(clock.timekeeping_suspended))
return;

#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ offset = (timeclock_read() - clock.cycle_last) & clock.source->mask;
#else
- offset = clock->cycle_interval;
+ offset = clock.cycle_interval;
#endif
- clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
+ /* XXX the following line can be dropped when
+ * everyone is convertd GENERIC_TIME.
+ */
+ clock.xtime_nsec = (s64)xtime.tv_nsec << clock.source->shift;

/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (offset >= clock.cycle_interval) {
/* accumulate one interval */
- offset -= clock->cycle_interval;
- clock->cycle_last += clock->cycle_interval;
-
- clock->xtime_nsec += clock->xtime_interval;
- if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
- clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
- xtime.tv_sec++;
- second_overflow();
- }
-
- clock->raw_time.tv_nsec += clock->raw_interval;
- if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
- clock->raw_time.tv_nsec -= NSEC_PER_SEC;
- clock->raw_time.tv_sec++;
- }
+ offset -= clock.cycle_interval;
+ clock.cycle_last += clock.cycle_interval;
+ xtime_nsec_add(clock.xtime_interval);
+ timespec_add_ns(&clock.raw_time, clock.raw_interval);

/* accumulate error between NTP and clock interval */
- clock->error += tick_length;
- clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
+ clock.error += tick_length;
+ clock.error -= clock.xtime_interval << CLOCK2NTP_SHIFT();
}

/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ timeclock_adjust(offset);

/*
* Since in the loop above, we accumulate any amount of time
@@ -549,72 +689,30 @@ void update_wall_time(void)
*
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
+ *
+ * XXX - once everyone is converted to GENERIC_TIME this
+ * can be dropped an we'll handle negative values properly
*/
- if (unlikely((s64)clock->xtime_nsec < 0)) {
- s64 neg = -(s64)clock->xtime_nsec;
- clock->xtime_nsec = 0;
- clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+ if (unlikely((s64)clock.xtime_nsec < 0)) {
+ s64 neg = -(s64)clock.xtime_nsec;
+ clock.xtime_nsec = 0;
+ clock.error += neg << CLOCK2NTP_SHIFT();
}

/* store full nanoseconds into xtime after rounding it up and
* add the remainder to the error difference.
+ *
+ * XXX - once everyone is converted to GENERIC_TIME this
+ * (the following three lines) can be dropped
*/
- xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
- clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
- clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
+ xtime.tv_nsec = ((s64)clock.xtime_nsec >> clock.source->shift) + 1;
+ clock.xtime_nsec -= (s64)xtime.tv_nsec << clock.source->shift;
+ clock.error += clock.xtime_nsec << CLOCK2NTP_SHIFT();

- update_xtime_cache(cyc2ns(clock, offset));
+ update_xtime_cache(timeclock_cyc2ns(offset));

/* check to see if there is a new clocksource to use */
change_clocksource();
- update_vsyscall(&xtime, clock);
-}
-
-/**
- * getboottime - Return the real time of system boot.
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
- *
- * This is based on the wall_to_monotonic offset and the total suspend
- * time. Calls to settimeofday will affect the value returned (which
- * basically means that however wrong your real time clock is at boot time,
- * you get the right time here).
- */
-void getboottime(struct timespec *ts)
-{
- set_normalized_timespec(ts,
- - (wall_to_monotonic.tv_sec + total_sleep_time),
- - wall_to_monotonic.tv_nsec);
+ update_vsyscall(&xtime, clock.source);
}

-/**
- * monotonic_to_bootbased - Convert the monotonic time to boot based.
- * @ts: pointer to the timespec to be converted
- */
-void monotonic_to_bootbased(struct timespec *ts)
-{
- ts->tv_sec += total_sleep_time;
-}
-
-unsigned long get_seconds(void)
-{
- return xtime_cache.tv_sec;
-}
-EXPORT_SYMBOL(get_seconds);
-
-
-struct timespec current_kernel_time(void)
-{
- struct timespec now;
- unsigned long seq;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- now = xtime_cache;
- } while (read_seqretry(&xtime_lock, seq));
-
- return now;
-}
-EXPORT_SYMBOL(current_kernel_time);


2009-07-23 07:23:43

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Wed, 22 Jul 2009 10:45:33 -0700
john stultz <[email protected]> wrote:

> On Wed, 2009-07-22 at 09:25 +0200, Martin Schwidefsky wrote:
> > On Tue, 21 Jul 2009 15:00:07 -0700
> > john stultz <[email protected]> wrote:
> > > I do agree with Daniel's main point, that the patch mixes the layers I
> > > tried to establish in the design.
> > >
> > > Clocksource: Abstracts out a hardware counter.
> > > NTP: Provides the reference time.
> > > Timekeeping: Manages accumulating the clocksource, and combining input
> > > from ntp's reference time to steer the hardware frequency.
> >
> > Imho what makes the code hard to understand is that the internals of
> > the clocksource have leaked into the timekeeping code. I'm getting at
> > the cycle, mult and shift values here. The code would be much easier to
> > understand if the clocksource would just return nanoseconds. The bad
> > thing here is that we would loose some bits of precision.
>
> While I completely agree the code is hard to understand, I really don't
> think that pushing that down to clocksource.c will improve things.
>
> As much as you'd prefer it not, I feel the timekeeping code has to deal
> with cycles. The consistent translation and accumulation of clocksource
> cycles into nanoseconds is what timekeeping.c is all about.
>
> We already have interfaces that return nanoseconds, they're
> gensttimeofday, ktime_get, ktime_get_ts.

After playing around with the idea move some fields from the struct
clocksource to a need private structure in timekeeping.c I now agree.
The new structure I have in mind currently looks like this:

/* Structure holding internal timekeeping values. */
struct timekeeper {
cycle_t cycle_interval;
u64 xtime_interval;
u32 raw_interval;
u64 xtime_nsec;
s64 ntp_error;
int xtime_shift;
int ntp_shift;
};

The raw_time stays in struct clocksource.

> > > Unfortunately, many timekeeping values got stuffed into the struct
> > > clocksource. I've had plans to try to clean this up and utilize Patrick
> > > Ohly's simpler clockcounter struct as a basis for a clocksource, nesting
> > > the structures somewhat to look something like:
> > >
> > >
> > > /* minimal structure only giving hardware info and access methods */
> > > struct cyclecounter {
> > > char *name;
> > > cycle_t (*read)(const struct cyclecounter *cc);
> > > cycle_t (*vread)(const struct cyclecounter *cc);
> > > cycle_t mask;
> > > u32 mult;
> > > u32 shift;
> > > };
> > >
> > > /* more complicated structure holding timekeeping values */
> > > struct timesource {
> > > struct cyclecounter counter;
> > > u32 corrected_mult;
> > > cycle_t cycle_interval;
> > > u64 xtime_interval;
> > > u32 raw_interval;
> > > cycle_t cycle_last;
> > > u64 xtime_nsec;
> > > s64 error; /* probably should be ntp_error */
> > > ...
> > > }
> > >
> > > However such a change would be quite a bit of churn to much of the
> > > timekeeping code, and to only marginal benefit. So I've put it off.
> >
> > That would be an improvement, but there are still these pesky cycles in
> > the timesource.
>
> Again, I think there has to be. Since some portion of the current time
> is unaccumulated, it is inherently cycles based. The timekeeping core
> has to decide when to accumulate those cycles into nanoseconds and store
> them into xtime. In order to do that, the timekeeping code has to have
> an idea of where the cycle_last value is. Further, for improved
> precision, and ntp steering, we use the *_interval values to accumulate
> in chunks.

Yes, I now agree.

> > > Martin, I've not been able to review your changes in extreme detail, but
> > > I'm curious what the motivation for the drastic code rearrangement was?
> >
> > It started of with a minor performance optimization, I wanted to get
> > rid of the change_clocksource call every tick. When I looked at the
> > code to understand it I started to move things around.
> >
> > > I see you pushing a fair amount of code down a level, for instance,
> > > except for the locking, getmonotonicraw() basically gets pushed down to
> > > clocksource_read_raw(). The ktime_get/ktime_get_ts/getnstimeofday do
> > > reduce some duplicate code, but that could still be minimized without
> > > pushing stuff down to the clocksource level.
> >
> > The background here is that I want to isolate the use ofthe cycles, mult
> > and shift values to clocksource.[ch]
>
> Again I do completely agree the code needs to be cleaned up.
> Unfortunately there's still a split between the GENERIC_TIME and non
> GENERIC_TIME arches that keeps us from making some cleanups right now.
> I'm trying to get this all unified (see my arch_gettimeoffset patches),
> but until we get all the arches moved over, there's some unfortunate
> uglys we can't get rid of.
>
>
> If I can find some cycles today, I'll try to take a rough swing at some
> of the cleanup I mentioned earlier. Probably won't build, but will maybe
> give you an idea of the direction I'm thinking about, and then you can
> let me know where you feel its still too complex. Maybe then we can meet
> in the middle?

I'm already in the middle of doing what you suggested. I'll send an
update soonish.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2009-07-23 07:53:40

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Wed, 22 Jul 2009 17:28:20 -0700
john stultz <[email protected]> wrote:

> Hey Martin,
> So here's a really quick swipe at breaking apart the clocksource struct
> into a clocksource only portion and a timekeeping portion.
>
> Caveats:
> 1) This doesn't completely build. The core bits do, but there's still a
> few left-over issues (see following caveats). Its just here to give you
> an idea of what I'm thinking about. I'd of course break it up into more
> manageable chunks before submitting it.
>
> 2) Structure names aren't too great right now. Not sure timeclock is
> what I want to use, probably system_time or something. Will find/replace
> before the next revision is sent out.
>
> 3) I still need to unify the clocksource and cyclecounter structures, as
> they're basically redundant now.
>
> 4) I still need to fix the update_vsyscall code (shouldn't be hard, I
> didn't want to run through arch code yet).
>
> 5) The TSC clocksource uses cycles_last to avoid very slight skew issues
> (that otherwise would not be noticed). Not sure how to fix that if we're
> pulling cycles_last (which is purely timekeeping state) out of the
> clocksource. Will have to think of something.
>
>
> Other cleanups still out there in the distant future:
> 1) Once all arches are converted to GENERIC_TIME, we can remove the
> ifdefs, and cleanup a lot of the more complicated xtime struct
> manipulation. It will cleanup update_wall_time() nicely.
>
> 2) I have a logarithmic accumulation patch to update_wall_time that will
> remove the need for xtime_cache to be managed and updated. Just have to
> spend some additional time making sure its bugfree.
>
> 3) Once all arches are converted to using read_persistent_clock(), then
> the arch specific time initialization can be dropped. Removing the
> majority of direct xtime structure accesses.
>
> 4) Then once the remaining direct wall_to_monotonic and xtime accessors
> are moved to timekeeping.c we can make those both static and embed them
> into the core timekeeping structure.
>
>
> But let me know if this patch doesn't achieve most of the cleanup you
> wanted to see.

Cool, I'll have a look. What I can see right away is that a lot of the
changes I tried yesterday are contained in your patch as well.
But your patch is more radical, a lot more is (re-)moved from the
struct clocksource. That mult_orig goes aways is good :-)

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2009-07-23 10:53:51

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

Hi John,

On Wed, 22 Jul 2009 17:28:20 -0700
john stultz <[email protected]> wrote:

> On Wed, 2009-07-22 at 10:45 -0700, john stultz wrote:
> Hey Martin,
> So here's a really quick swipe at breaking apart the clocksource struct
> into a clocksource only portion and a timekeeping portion.
>
> Caveats:
> 1) This doesn't completely build. The core bits do, but there's still a
> few left-over issues (see following caveats). Its just here to give you
> an idea of what I'm thinking about. I'd of course break it up into more
> manageable chunks before submitting it.

Mine does build but for s390 only - and even there only with a hack ;-)

> 2) Structure names aren't too great right now. Not sure timeclock is
> what I want to use, probably system_time or something. Will find/replace
> before the next revision is sent out.

I've picked the name struct timekeeper.

> 3) I still need to unify the clocksource and cyclecounter structures, as
> they're basically redundant now.

I'll leave that up to you.

> 4) I still need to fix the update_vsyscall code (shouldn't be hard, I
> didn't want to run through arch code yet).

Done in my version of the patch below.

> 5) The TSC clocksource uses cycles_last to avoid very slight skew issues
> (that otherwise would not be noticed). Not sure how to fix that if we're
> pulling cycles_last (which is purely timekeeping state) out of the
> clocksource. Will have to think of something.

That is an ugly one. A similar thing exists in the s390 backend where I
want to reset the timekeeping to precise values after the clocksource
switch from jiffies. The proper solution probably is to allow
architectures to override the default clocksource. The jiffies
clocksource doesn't make any sense on s390.

> Other cleanups still out there in the distant future:
> 1) Once all arches are converted to GENERIC_TIME, we can remove the
> ifdefs, and cleanup a lot of the more complicated xtime struct
> manipulation. It will cleanup update_wall_time() nicely.

Well yes, but that will take a few more arch patches. Until we get
there we have to live with the current state of things.

> 2) I have a logarithmic accumulation patch to update_wall_time that will
> remove the need for xtime_cache to be managed and updated. Just have to
> spend some additional time making sure its bugfree.

Interesting. Getting rid of xtime_cache would be a nice cleanup as well.

> 3) Once all arches are converted to using read_persistent_clock(), then
> the arch specific time initialization can be dropped. Removing the
> majority of direct xtime structure accesses.

Only if the read_persistent_clock allows for a better resolution than
seconds. With my cputime accounting hat on: seconds are not good
enough..

> 4) Then once the remaining direct wall_to_monotonic and xtime accessors
> are moved to timekeeping.c we can make those both static and embed them
> into the core timekeeping structure.

Both should not be accessed at a rate that makes it necessary to read
from the values directly. An accessor should be fine I think.

> But let me know if this patch doesn't achieve most of the cleanup you
> wanted to see.

We are getting there. I wonder if it is really necessary to pull
xtime_cache, raw_time, total_sleep_time and timekeeping_suspended into
the struct timeclock. I would prefer the semantics that the struct
timekeeper / timeclock contains the internal values of the timekeeping
code for the currently selected clock source. xtime is not clock
specific.

For reference here is the current stack of patches I have on my disk.
The stop_machine conversion to install a new clocksource is currently missing.

PRELIMINARY PATCHES, USE AT YOUR OWN RISK.
-------------------------------------------------------------------

Subject: [PATCH] introduce timekeeping_leap_insert

From: john stultz <[email protected]>

---
include/linux/time.h | 1 +
kernel/time/ntp.c | 7 ++-----
kernel/time/timekeeping.c | 7 +++++++
3 files changed, 10 insertions(+), 5 deletions(-)

Index: linux-2.6/include/linux/time.h
===================================================================
--- linux-2.6.orig/include/linux/time.h
+++ linux-2.6/include/linux/time.h
@@ -147,6 +147,7 @@ extern struct timespec timespec_trunc(st
extern int timekeeping_valid_for_hres(void);
extern void update_wall_time(void);
extern void update_xtime_cache(u64 nsec);
+extern void timekeeping_leap_insert(int leapsecond);

struct tms;
extern void do_sys_times(struct tms *);
Index: linux-2.6/kernel/time/ntp.c
===================================================================
--- linux-2.6.orig/kernel/time/ntp.c
+++ linux-2.6/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_sec
case TIME_OK:
break;
case TIME_INS:
- xtime.tv_sec--;
- wall_to_monotonic.tv_sec++;
+ timekeeping_leap_insert(-1);
time_state = TIME_OOP;
printk(KERN_NOTICE
"Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_sec
res = HRTIMER_RESTART;
break;
case TIME_DEL:
- xtime.tv_sec++;
+ timekeeping_leap_insert(1);
time_tai--;
- wall_to_monotonic.tv_sec--;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_sec
time_state = TIME_OK;
break;
}
- update_vsyscall(&xtime, clock);

write_sequnlock(&xtime_lock);

Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -58,6 +58,13 @@ void update_xtime_cache(u64 nsec)

struct clocksource *clock;

+/* must hold xtime_lock */
+void timekeeping_leap_insert(int leapsecond)
+{
+ xtime.tv_sec += leapsecond;
+ wall_to_monotonic.tv_sec -= leapsecond;
+ update_vsyscall(&xtime, clock);
+}

#ifdef CONFIG_GENERIC_TIME
/**

-------------------------------------------------------------------

Subject: [PATCH] remove clocksource inline functions

From: Martin Schwidefsky <[email protected]>

Remove clocksource_read, clocksource_enable and clocksource_disable
inline functions. No functional change.

Cc: Ingo Molnar <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: john stultz <[email protected]>
Cc: Daniel Walker <[email protected]>
Signed-off-by: Martin Schwidefsky <[email protected]>
---
include/linux/clocksource.h | 46 --------------------------------------------
kernel/time/timekeeping.c | 32 +++++++++++++++++-------------
2 files changed, 18 insertions(+), 60 deletions(-)

Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -79,7 +79,7 @@ static void clocksource_forward_now(void
cycle_t cycle_now, cycle_delta;
s64 nsec;

- cycle_now = clocksource_read(clock);
+ cycle_now = clock->read(clock);
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
clock->cycle_last = cycle_now;

@@ -114,7 +114,7 @@ void getnstimeofday(struct timespec *ts)
*ts = xtime;

/* read clocksource: */
- cycle_now = clocksource_read(clock);
+ cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -146,7 +146,7 @@ ktime_t ktime_get(void)
nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;

/* read clocksource: */
- cycle_now = clocksource_read(clock);
+ cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -186,7 +186,7 @@ void ktime_get_ts(struct timespec *ts)
tomono = wall_to_monotonic;

/* read clocksource: */
- cycle_now = clocksource_read(clock);
+ cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -274,16 +274,18 @@ static void change_clocksource(void)

clocksource_forward_now();

- if (clocksource_enable(new))
+ if (new->enable && ! new->enable(new))
return;
+ /* save mult_orig on enable */
+ new->mult_orig = new->mult;

new->raw_time = clock->raw_time;
old = clock;
clock = new;
- clocksource_disable(old);
+ if (old->disable)
+ old->disable(old);

- clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(clock);
+ clock->cycle_last = clock->read(clock);
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -373,7 +375,7 @@ void getrawmonotonic(struct timespec *ts
seq = read_seqbegin(&xtime_lock);

/* read clocksource: */
- cycle_now = clocksource_read(clock);
+ cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -435,9 +437,12 @@ void __init timekeeping_init(void)
ntp_init();

clock = clocksource_get_next();
- clocksource_enable(clock);
+ if (clock->enable)
+ clock->enable(clock);
+ /* save mult_orig on enable */
+ clock->mult_orig = clock->mult;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clocksource_read(clock);
+ clock->cycle_last = clock->read(clock);

xtime.tv_sec = sec;
xtime.tv_nsec = 0;
@@ -477,8 +482,7 @@ static int timekeeping_resume(struct sys
}
update_xtime_cache(0);
/* re-base the last cycle value */
- clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(clock);
+ clock->cycle_last = clock->read(clock);
clock->error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -630,7 +634,7 @@ void update_wall_time(void)
return;

#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#else
offset = clock->cycle_interval;
#endif
Index: linux-2.6/include/linux/clocksource.h
===================================================================
--- linux-2.6.orig/include/linux/clocksource.h
+++ linux-2.6/include/linux/clocksource.h
@@ -268,52 +268,6 @@ static inline u32 clocksource_hz2mult(u3
}

/**
- * clocksource_read: - Access the clocksource's current cycle value
- * @cs: pointer to clocksource being read
- *
- * Uses the clocksource to return the current cycle_t value
- */
-static inline cycle_t clocksource_read(struct clocksource *cs)
-{
- return cs->read(cs);
-}
-
-/**
- * clocksource_enable: - enable clocksource
- * @cs: pointer to clocksource
- *
- * Enables the specified clocksource. The clocksource callback
- * function should start up the hardware and setup mult and field
- * members of struct clocksource to reflect hardware capabilities.
- */
-static inline int clocksource_enable(struct clocksource *cs)
-{
- int ret = 0;
-
- if (cs->enable)
- ret = cs->enable(cs);
-
- /* save mult_orig on enable */
- cs->mult_orig = cs->mult;
-
- return ret;
-}
-
-/**
- * clocksource_disable: - disable clocksource
- * @cs: pointer to clocksource
- *
- * Disables the specified clocksource. The clocksource callback
- * function should power down the now unused hardware block to
- * save power.
- */
-static inline void clocksource_disable(struct clocksource *cs)
-{
- if (cs->disable)
- cs->disable(cs);
-}
-
-/**
* cyc2ns - converts clocksource cycles to nanoseconds
* @cs: Pointer to clocksource
* @cycles: Cycles

-------------------------------------------------------------------

Subject: [PATCH] cleanup clocksource selection

From: Martin Schwidefsky <[email protected]>

Introduce clocksource_dequeue & clocksource_update and move spinlock
calls. clocksource_update does nothing for GENERIC_TIME=n since
change_clocksource does nothing as well.

Cc: Ingo Molnar <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: john stultz <[email protected]>
Cc: Daniel Walker <[email protected]>
Signed-off-by: Martin Schwidefsky <[email protected]>
---
kernel/time/clocksource.c | 91 +++++++++++++++++++++++++++-------------------
1 file changed, 55 insertions(+), 36 deletions(-)

Index: linux-2.6/kernel/time/clocksource.c
===================================================================
--- linux-2.6.orig/kernel/time/clocksource.c
+++ linux-2.6/kernel/time/clocksource.c
@@ -348,21 +348,13 @@ struct clocksource *clocksource_get_next
*/
static struct clocksource *select_clocksource(void)
{
- struct clocksource *next;
-
if (list_empty(&clocksource_list))
return NULL;

if (clocksource_override)
- next = clocksource_override;
- else
- next = list_entry(clocksource_list.next, struct clocksource,
- list);
-
- if (next == curr_clocksource)
- return NULL;
+ return clocksource_override;

- return next;
+ return list_entry(clocksource_list.next, struct clocksource, list);
}

/*
@@ -371,13 +363,19 @@ static struct clocksource *select_clocks
static int clocksource_enqueue(struct clocksource *c)
{
struct list_head *tmp, *entry = &clocksource_list;
+ unsigned long flags;
+ int rc;

+ spin_lock_irqsave(&clocksource_lock, flags);
+ rc = 0;
list_for_each(tmp, &clocksource_list) {
struct clocksource *cs;

cs = list_entry(tmp, struct clocksource, list);
- if (cs == c)
- return -EBUSY;
+ if (cs == c) {
+ rc = -EBUSY;
+ goto out;
+ }
/* Keep track of the place, where to insert */
if (cs->rating >= c->rating)
entry = tmp;
@@ -387,10 +385,44 @@ static int clocksource_enqueue(struct cl
if (strlen(c->name) == strlen(override_name) &&
!strcmp(c->name, override_name))
clocksource_override = c;
+out:
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+ return rc;
+}

- return 0;
+/*
+ * Dequeue a clocksource
+ */
+static void clocksource_dequeue(struct clocksource *cs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ list_del(&cs->list);
+ if (clocksource_override == cs)
+ clocksource_override = NULL;
+ spin_unlock_irqrestore(&clocksource_lock, flags);
}

+#ifdef CONFIG_GENERIC_TIME
+/**
+ * clocksource_update - Check if a better clocksource is available
+ */
+static void clocksource_update(void)
+{
+ struct clocksource *new;
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ new = select_clocksource();
+ if (new)
+ next_clocksource = new;
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+#else /* CONFIG_GENERIC_TIME */
+static inline void clocksource_update(void) { }
+#endif /* CONFIG_GENERIC_TIME */
+
/**
* clocksource_register - Used to install new clocksources
* @t: clocksource to be registered
@@ -399,16 +431,13 @@ static int clocksource_enqueue(struct cl
*/
int clocksource_register(struct clocksource *c)
{
- unsigned long flags;
int ret;

- spin_lock_irqsave(&clocksource_lock, flags);
ret = clocksource_enqueue(c);
- if (!ret)
- next_clocksource = select_clocksource();
- spin_unlock_irqrestore(&clocksource_lock, flags);
- if (!ret)
+ if (!ret) {
+ clocksource_update();
clocksource_check_watchdog(c);
+ }
return ret;
}
EXPORT_SYMBOL(clocksource_register);
@@ -419,14 +448,10 @@ EXPORT_SYMBOL(clocksource_register);
*/
void clocksource_change_rating(struct clocksource *cs, int rating)
{
- unsigned long flags;
-
- spin_lock_irqsave(&clocksource_lock, flags);
- list_del(&cs->list);
+ clocksource_dequeue(cs);
cs->rating = rating;
clocksource_enqueue(cs);
- next_clocksource = select_clocksource();
- spin_unlock_irqrestore(&clocksource_lock, flags);
+ clocksource_update();
}

/**
@@ -434,14 +459,8 @@ void clocksource_change_rating(struct cl
*/
void clocksource_unregister(struct clocksource *cs)
{
- unsigned long flags;
-
- spin_lock_irqsave(&clocksource_lock, flags);
- list_del(&cs->list);
- if (clocksource_override == cs)
- clocksource_override = NULL;
- next_clocksource = select_clocksource();
- spin_unlock_irqrestore(&clocksource_lock, flags);
+ clocksource_dequeue(cs);
+ clocksource_update();
}

#ifdef CONFIG_SYSFS
@@ -522,13 +541,13 @@ static ssize_t sysfs_override_clocksourc
}

/* Reselect, when the override name has changed */
- if (ovr != clocksource_override) {
+ if (ovr != clocksource_override)
clocksource_override = ovr;
- next_clocksource = select_clocksource();
- }

spin_unlock_irq(&clocksource_lock);

+ clocksource_update();
+
return ret;
}

-------------------------------------------------------------------

Subject: [PATCH] introduce struct timekeeper

From: Martin Schwidefsky <[email protected]>

Add struct timekeeper to keep all the internal values timekeeping.c
needs in regard to the currently selected clock source. This moves
all timekeeping related values out of the struct clocksource.

Cc: Ingo Molnar <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: john stultz <[email protected]>
Cc: Daniel Walker <[email protected]>
Signed-off-by: Martin Schwidefsky <[email protected]>
---
arch/ia64/kernel/time.c | 5
arch/powerpc/kernel/time.c | 5
arch/s390/kernel/time.c | 9 -
arch/x86/kernel/vsyscall_64.c | 5
include/linux/clocksource.h | 51 ---------
kernel/time/timekeeping.c | 224 +++++++++++++++++++++++++++---------------
6 files changed, 161 insertions(+), 138 deletions(-)

Index: linux-2.6/include/linux/clocksource.h
===================================================================
--- linux-2.6.orig/include/linux/clocksource.h
+++ linux-2.6/include/linux/clocksource.h
@@ -181,20 +181,6 @@ struct clocksource {
#define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0)
#endif

- /* timekeeping specific data, ignore */
- cycle_t cycle_interval;
- u64 xtime_interval;
- u32 raw_interval;
- /*
- * Second part is written at each timer interrupt
- * Keep it in a different cache line to dirty no
- * more than one cache line.
- */
- cycle_t cycle_last ____cacheline_aligned_in_smp;
- u64 xtime_nsec;
- s64 error;
- struct timespec raw_time;
-
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
/* Watchdog related data, used by the framework */
struct list_head wd_list;
@@ -202,8 +188,6 @@ struct clocksource {
#endif
};

-extern struct clocksource *clock; /* current clocksource */
-
/*
* Clock source flags bits::
*/
@@ -283,37 +267,6 @@ static inline s64 cyc2ns(struct clocksou
return ret;
}

-/**
- * clocksource_calculate_interval - Calculates a clocksource interval struct
- *
- * @c: Pointer to clocksource.
- * @length_nsec: Desired interval length in nanoseconds.
- *
- * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
- * pair and interval request.
- *
- * Unless you're the timekeeping code, you should not be using this!
- */
-static inline void clocksource_calculate_interval(struct clocksource *c,
- unsigned long length_nsec)
-{
- u64 tmp;
-
- /* Do the ns -> cycle conversion first, using original mult */
- tmp = length_nsec;
- tmp <<= c->shift;
- tmp += c->mult_orig/2;
- do_div(tmp, c->mult_orig);
-
- c->cycle_interval = (cycle_t)tmp;
- if (c->cycle_interval == 0)
- c->cycle_interval = 1;
-
- /* Go back from cycles -> shifted ns, this time use ntp adjused mult */
- c->xtime_interval = (u64)c->cycle_interval * c->mult;
- c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
-}
-

/* used to install a new clocksource */
extern int clocksource_register(struct clocksource*);
@@ -324,10 +277,10 @@ extern void clocksource_change_rating(st
extern void clocksource_resume(void);

#ifdef CONFIG_GENERIC_TIME_VSYSCALL
-extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
+extern void update_vsyscall(struct timespec *ts, struct clocksource *c, cycle_t cycle_last);
extern void update_vsyscall_tz(void);
#else
-static inline void update_vsyscall(struct timespec *ts, struct clocksource *c)
+static inline void update_vsyscall(struct timespec *ts, struct clocksource *c, cycle_t cycle_last)
{
}

Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -19,6 +19,67 @@
#include <linux/time.h>
#include <linux/tick.h>

+/* Structure holding internal timekeeping values. */
+struct timekeeper {
+ struct clocksource *clock;
+ cycle_t cycle_interval;
+ u64 xtime_interval;
+ u32 raw_interval;
+ u64 xtime_nsec;
+ s64 ntp_error;
+ int xtime_shift;
+ int ntp_shift;
+
+ /*
+ * The following is written at each timer interrupt
+ * Keep it in a different cache line to dirty no
+ * more than one cache line.
+ */
+ cycle_t cycle_last ____cacheline_aligned_in_smp;
+};
+
+struct timekeeper timekeeper;
+
+/**
+ * timekeeper_setup_internals - Set up internals to use clocksource clock.
+ *
+ * @clock: Pointer to clocksource.
+ *
+ * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
+ * pair and interval request.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static void timekeeper_setup_internals(struct clocksource *clock)
+{
+ cycle_t interval;
+ u64 tmp;
+
+ timekeeper.clock = clock;
+ timekeeper.cycle_last = clock->read(clock);
+
+ /* Do the ns -> cycle conversion first, using original mult */
+ tmp = NTP_INTERVAL_LENGTH;
+ tmp <<= clock->shift;
+ tmp += clock->mult_orig/2;
+ do_div(tmp, clock->mult_orig);
+ if (tmp == 0)
+ tmp = 1;
+
+ interval = (cycle_t) tmp;
+ timekeeper.cycle_interval = interval;
+
+ /* Go back from cycles -> shifted ns, this time use ntp adjused mult */
+ timekeeper.xtime_interval = (u64) interval * clock->mult;
+ timekeeper.raw_interval =
+ ((u64) interval * clock->mult_orig) >> clock->shift;
+
+ timekeeper.xtime_shift = clock->shift;
+ timekeeper.ntp_shift = NTP_SCALE_SHIFT - clock->shift;
+
+ timekeeper.xtime_nsec = 0;
+ timekeeper.ntp_error = 0;
+}

/*
* This read-write spinlock protects us from races in SMP while
@@ -46,6 +107,11 @@ struct timespec xtime __attribute__ ((al
struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static unsigned long total_sleep_time; /* seconds */

+/*
+ * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
+ */
+struct timespec raw_time;
+
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;

@@ -56,32 +122,32 @@ void update_xtime_cache(u64 nsec)
timespec_add_ns(&xtime_cache, nsec);
}

-struct clocksource *clock;
-
/* must hold xtime_lock */
void timekeeping_leap_insert(int leapsecond)
{
xtime.tv_sec += leapsecond;
wall_to_monotonic.tv_sec -= leapsecond;
- update_vsyscall(&xtime, clock);
+ update_vsyscall(&xtime, timekeeper.clock, timekeeper.cycle_last);
}

#ifdef CONFIG_GENERIC_TIME
/**
- * clocksource_forward_now - update clock to the current time
+ * timekeeping_forward_now - update clock to the current time
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void clocksource_forward_now(void)
+static void timekeeping_forward_now(void)
{
cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;
s64 nsec;

+ clock = timekeeper.clock;
cycle_now = clock->read(clock);
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- clock->cycle_last = cycle_now;
+ cycle_delta = (cycle_now - timekeeper.cycle_last) & clock->mask;
+ timekeeper.cycle_last = cycle_now;

nsec = cyc2ns(clock, cycle_delta);

@@ -91,7 +157,7 @@ static void clocksource_forward_now(void
timespec_add_ns(&xtime, nsec);

nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
- clock->raw_time.tv_nsec += nsec;
+ timespec_add_ns(&raw_time, nsec);
}

/**
@@ -103,6 +169,7 @@ static void clocksource_forward_now(void
void getnstimeofday(struct timespec *ts)
{
cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;
unsigned long seq;
s64 nsecs;

@@ -114,10 +181,11 @@ void getnstimeofday(struct timespec *ts)
*ts = xtime;

/* read clocksource: */
+ clock = timekeeper.clock;
cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ cycle_delta = (cycle_now - timekeeper.cycle_last) & clock->mask;

/* convert to nanoseconds: */
nsecs = cyc2ns(clock, cycle_delta);
@@ -135,6 +203,7 @@ EXPORT_SYMBOL(getnstimeofday);
ktime_t ktime_get(void)
{
cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;
unsigned int seq;
s64 secs, nsecs;

@@ -146,10 +215,11 @@ ktime_t ktime_get(void)
nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;

/* read clocksource: */
+ clock = timekeeper.clock;
cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ cycle_delta = (cycle_now - timekeeper.cycle_last) & clock->mask;

/* convert to nanoseconds: */
nsecs += cyc2ns(clock, cycle_delta);
@@ -174,6 +244,7 @@ EXPORT_SYMBOL_GPL(ktime_get);
void ktime_get_ts(struct timespec *ts)
{
cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;
struct timespec tomono;
unsigned int seq;
s64 nsecs;
@@ -186,10 +257,11 @@ void ktime_get_ts(struct timespec *ts)
tomono = wall_to_monotonic;

/* read clocksource: */
+ clock = timekeeper.clock;
cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ cycle_delta = (cycle_now - timekeeper.cycle_last) & clock->mask;

/* convert to nanoseconds: */
nsecs = cyc2ns(clock, cycle_delta);
@@ -233,7 +305,7 @@ int do_settimeofday(struct timespec *tv)

write_seqlock_irqsave(&xtime_lock, flags);

- clocksource_forward_now();
+ timekeeping_forward_now();

ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -243,10 +315,10 @@ int do_settimeofday(struct timespec *tv)

update_xtime_cache(0);

- clock->error = 0;
+ timekeeper.ntp_error = 0;
ntp_clear();

- update_vsyscall(&xtime, clock);
+ update_vsyscall(&xtime, timekeeper.clock, timekeeper.cycle_last);

write_sequnlock_irqrestore(&xtime_lock, flags);

@@ -269,38 +341,25 @@ static void change_clocksource(void)

new = clocksource_get_next();

- if (clock == new)
+ if (timekeeper.clock == new)
return;

- clocksource_forward_now();
+ timekeeping_forward_now();

if (new->enable && ! new->enable(new))
return;
/* save mult_orig on enable */
new->mult_orig = new->mult;

- new->raw_time = clock->raw_time;
- old = clock;
- clock = new;
+ old = timekeeper.clock;
+ timekeeper_setup_internals(new);
if (old->disable)
old->disable(old);

- clock->cycle_last = clock->read(clock);
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-
tick_clock_notify();
-
- /*
- * We're holding xtime lock and waking up klogd would deadlock
- * us on enqueue. So no printing!
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
- */
}
#else /* GENERIC_TIME */
-static inline void clocksource_forward_now(void) { }
+static inline void timekeeping_forward_now(void) { }
static inline void change_clocksource(void) { }

/**
@@ -370,20 +429,22 @@ void getrawmonotonic(struct timespec *ts
unsigned long seq;
s64 nsecs;
cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;

do {
seq = read_seqbegin(&xtime_lock);

/* read clocksource: */
+ clock = timekeeper.clock;
cycle_now = clock->read(clock);

/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ cycle_delta = (cycle_now - timekeeper.cycle_last) & clock->mask;

/* convert to nanoseconds: */
nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;

- *ts = clock->raw_time;
+ *ts = raw_time;

} while (read_seqretry(&xtime_lock, seq));

@@ -403,7 +464,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqbegin(&xtime_lock);

- ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;

} while (read_seqretry(&xtime_lock, seq));

@@ -429,6 +490,7 @@ unsigned long __attribute__((weak)) read
*/
void __init timekeeping_init(void)
{
+ struct clocksource *clock;
unsigned long flags;
unsigned long sec = read_persistent_clock();

@@ -441,11 +503,13 @@ void __init timekeeping_init(void)
clock->enable(clock);
/* save mult_orig on enable */
clock->mult_orig = clock->mult;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clock->read(clock);
+
+ timekeeper_setup_internals(clock);

xtime.tv_sec = sec;
xtime.tv_nsec = 0;
+ raw_time.tv_sec = 0;
+ raw_time.tv_nsec = 0;
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
update_xtime_cache(0);
@@ -482,8 +546,8 @@ static int timekeeping_resume(struct sys
}
update_xtime_cache(0);
/* re-base the last cycle value */
- clock->cycle_last = clock->read(clock);
- clock->error = 0;
+ timekeeper.cycle_last = timekeeper.clock->read(timekeeper.clock);
+ timekeeper.ntp_error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);

@@ -504,7 +568,7 @@ static int timekeeping_suspend(struct sy
timekeeping_suspend_time = read_persistent_clock();

write_seqlock_irqsave(&xtime_lock, flags);
- clocksource_forward_now();
+ timekeeping_forward_now();
timekeeping_suspended = 1;
write_sequnlock_irqrestore(&xtime_lock, flags);

@@ -539,7 +603,7 @@ device_initcall(timekeeping_init_device)
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
s64 *offset)
{
s64 tick_error, i;
@@ -555,7 +619,7 @@ static __always_inline int clocksource_b
* here. This is tuned so that an error of about 1 msec is adjusted
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
- error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
error2 = abs(error2);
for (look_ahead = 0; error2 > 0; look_ahead++)
error2 >>= 2;
@@ -564,8 +628,8 @@ static __always_inline int clocksource_b
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
- tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
- tick_error -= clock->xtime_interval >> 1;
+ tick_error = tick_length >> (timekeeper.ntp_shift + 1);
+ tick_error -= timekeeper.xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;

/* Finally calculate the adjustment shift value. */
@@ -590,18 +654,18 @@ static __always_inline int clocksource_b
* this is optimized for the most common adjustments of -1,0,1,
* for other values we can do a bit more work.
*/
-static void clocksource_adjust(s64 offset)
+static void timekeeping_adjust(s64 offset)
{
- s64 error, interval = clock->cycle_interval;
+ s64 error, interval = timekeeper.cycle_interval;
int adj;

- error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
+ error = timekeeper.ntp_error >> (timekeeper.ntp_shift - 1);
if (error > interval) {
error >>= 2;
if (likely(error <= interval))
adj = 1;
else
- adj = clocksource_bigadjust(error, &interval, &offset);
+ adj = timekeeping_bigadjust(error, &interval, &offset);
} else if (error < -interval) {
error >>= 2;
if (likely(error >= -interval)) {
@@ -609,15 +673,14 @@ static void clocksource_adjust(s64 offse
interval = -interval;
offset = -offset;
} else
- adj = clocksource_bigadjust(error, &interval, &offset);
+ adj = timekeeping_bigadjust(error, &interval, &offset);
} else
return;

- clock->mult += adj;
- clock->xtime_interval += interval;
- clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) <<
- (NTP_SCALE_SHIFT - clock->shift);
+ timekeeper.clock->mult += adj;
+ timekeeper.xtime_interval += interval;
+ timekeeper.xtime_nsec -= offset;
+ timekeeper.ntp_error -= (interval - offset) << timekeeper.ntp_shift;
}

/**
@@ -627,53 +690,56 @@ static void clocksource_adjust(s64 offse
*/
void update_wall_time(void)
{
+ struct clocksource *clock;
cycle_t offset;

/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;

+ clock = timekeeper.clock;
#ifdef CONFIG_GENERIC_TIME
- offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
+ offset = (clock->read(clock) - timekeeper.cycle_last) & clock->mask;
#else
- offset = clock->cycle_interval;
+ offset = timekeeper.cycle_interval;
#endif
- clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
+ timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.xtime_shift;

/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (offset >= timekeeper.cycle_interval) {
/* accumulate one interval */
- offset -= clock->cycle_interval;
- clock->cycle_last += clock->cycle_interval;
+ offset -= timekeeper.cycle_interval;
+ timekeeper.cycle_last += timekeeper.cycle_interval;

- clock->xtime_nsec += clock->xtime_interval;
- if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
- clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+ timekeeper.xtime_nsec += timekeeper.xtime_interval;
+ if (timekeeper.xtime_nsec >= (u64)NSEC_PER_SEC << timekeeper.xtime_shift) {
+ timekeeper.xtime_nsec -= (u64)NSEC_PER_SEC << timekeeper.xtime_shift;
xtime.tv_sec++;
second_overflow();
}

- clock->raw_time.tv_nsec += clock->raw_interval;
- if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
- clock->raw_time.tv_nsec -= NSEC_PER_SEC;
- clock->raw_time.tv_sec++;
+ raw_time.tv_nsec += timekeeper.raw_interval;
+ if (raw_time.tv_nsec >= NSEC_PER_SEC) {
+ raw_time.tv_nsec -= NSEC_PER_SEC;
+ raw_time.tv_sec++;
}

/* accumulate error between NTP and clock interval */
- clock->error += tick_length;
- clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
+ timekeeper.ntp_error += tick_length;
+ timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ timekeeper.ntp_shift;
}

/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ timekeeping_adjust(offset);

/*
* Since in the loop above, we accumulate any amount of time
* in xtime_nsec over a second into xtime.tv_sec, its possible for
* xtime_nsec to be fairly small after the loop. Further, if we're
- * slightly speeding the clocksource up in clocksource_adjust(),
+ * slightly speeding the clocksource up in timekeeping_adjust(),
* its possible the required corrective factor to xtime_nsec could
* cause it to underflow.
*
@@ -685,24 +751,24 @@ void update_wall_time(void)
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)clock->xtime_nsec < 0)) {
- s64 neg = -(s64)clock->xtime_nsec;
- clock->xtime_nsec = 0;
- clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+ if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
+ s64 neg = -(s64)timekeeper.xtime_nsec;
+ timekeeper.xtime_nsec = 0;
+ timekeeper.ntp_error += neg << timekeeper.ntp_shift;
}

/* store full nanoseconds into xtime after rounding it up and
* add the remainder to the error difference.
*/
- xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
- clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
- clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
+ xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> timekeeper.xtime_shift) + 1;
+ timekeeper.xtime_nsec -= (s64)xtime.tv_nsec << timekeeper.xtime_shift;
+ timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_shift;

update_xtime_cache(cyc2ns(clock, offset));

/* check to see if there is a new clocksource to use */
change_clocksource();
- update_vsyscall(&xtime, clock);
+ update_vsyscall(&xtime, clock, timekeeper.cycle_last);
}

/**
Index: linux-2.6/arch/ia64/kernel/time.c
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/time.c
+++ linux-2.6/arch/ia64/kernel/time.c
@@ -473,7 +473,8 @@ void update_vsyscall_tz(void)
{
}

-void update_vsyscall(struct timespec *wall, struct clocksource *c)
+void update_vsyscall(struct timespec *wall, struct clocksource *c,#
+ cycle_t cycle_last)
{
unsigned long flags;

@@ -484,7 +485,7 @@ void update_vsyscall(struct timespec *wa
fsyscall_gtod_data.clk_mult = c->mult;
fsyscall_gtod_data.clk_shift = c->shift;
fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
- fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
+ fsyscall_gtod_data.clk_cycle_last = cycle_last;

/* copy kernel time structures */
fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec;
Index: linux-2.6/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/time.c
+++ linux-2.6/arch/powerpc/kernel/time.c
@@ -802,7 +802,8 @@ static cycle_t timebase_read(struct cloc
return (cycle_t)get_tb();
}

-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+ cycle_t cycle_last)
{
u64 t2x, stamp_xsec;

@@ -819,7 +820,7 @@ void update_vsyscall(struct timespec *wa
stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
do_div(stamp_xsec, 1000000000);
stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
- update_gtod(clock->cycle_last, stamp_xsec, t2x);
+ update_gtod(cycle_last, stamp_xsec, t2x);
}

void update_vsyscall_tz(void)
Index: linux-2.6/arch/s390/kernel/time.c
===================================================================
--- linux-2.6.orig/arch/s390/kernel/time.c
+++ linux-2.6/arch/s390/kernel/time.c
@@ -206,7 +206,8 @@ static struct clocksource clocksource_to
};


-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+ cycle_t cycle_last)
{
if (clock != &clocksource_tod)
return;
@@ -214,7 +215,7 @@ void update_vsyscall(struct timespec *wa
/* Make userspace gettimeofday spin until we're done. */
++vdso_data->tb_update_count;
smp_wmb();
- vdso_data->xtime_tod_stamp = clock->cycle_last;
+ vdso_data->xtime_tod_stamp = cycle_last;
vdso_data->xtime_clock_sec = xtime.tv_sec;
vdso_data->xtime_clock_nsec = xtime.tv_nsec;
vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
@@ -275,8 +276,8 @@ void __init time_init(void)
write_seqlock_irqsave(&xtime_lock, flags);
now = get_clock();
tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime);
- clocksource_tod.cycle_last = now;
- clocksource_tod.raw_time = xtime;
+// clocksource_tod.cycle_last = now;
+// clocksource_tod.raw_time = xtime;
tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts);
set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec);
write_sequnlock_irqrestore(&xtime_lock, flags);
Index: linux-2.6/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vsyscall_64.c
+++ linux-2.6/arch/x86/kernel/vsyscall_64.c
@@ -73,14 +73,15 @@ void update_vsyscall_tz(void)
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+ cycle_t cycle_last)
{
unsigned long flags;

write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
/* copy vsyscall data */
vsyscall_gtod_data.clock.vread = clock->vread;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
+ vsyscall_gtod_data.clock.cycle_last = cycle_last;
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = clock->mult;
vsyscall_gtod_data.clock.shift = clock->shift;

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2009-07-25 00:09:37

by john stultz

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Thu, 2009-07-23 at 12:52 +0200, Martin Schwidefsky wrote:
> On Wed, 22 Jul 2009 17:28:20 -0700
> john stultz <[email protected]> wrote:
> > 2) Structure names aren't too great right now. Not sure timeclock is
> > what I want to use, probably system_time or something. Will find/replace
> > before the next revision is sent out.
>
> I've picked the name struct timekeeper.

Nice. I like this name.


> > 5) The TSC clocksource uses cycles_last to avoid very slight skew issues
> > (that otherwise would not be noticed). Not sure how to fix that if we're
> > pulling cycles_last (which is purely timekeeping state) out of the
> > clocksource. Will have to think of something.
>
> That is an ugly one. A similar thing exists in the s390 backend where I
> want to reset the timekeeping to precise values after the clocksource
> switch from jiffies. The proper solution probably is to allow
> architectures to override the default clocksource. The jiffies
> clocksource doesn't make any sense on s390.

Yea. Still not sure what a good fix is here. We need some way for the
TSC to check if its slightly behind another cpu. Alternatively we could
create a flag (SLIGHTLY_UNSYNCED or something) where we then do the
quick comparison in the timekeeping code instead of the clocksource read
implementation? It would really only be useful for the TSC and would
clutter the code up somewhat, so I'm not sure I really like it, but
maybe that would work.

> > 3) Once all arches are converted to using read_persistent_clock(), then
> > the arch specific time initialization can be dropped. Removing the
> > majority of direct xtime structure accesses.
>
> Only if the read_persistent_clock allows for a better resolution than
> seconds. With my cputime accounting hat on: seconds are not good
> enough..

cputime accounting? read_persistent_clock is only used for time
initialization. But yea, it could be extended to return a timespec.

> > 4) Then once the remaining direct wall_to_monotonic and xtime accessors
> > are moved to timekeeping.c we can make those both static and embed them
> > into the core timekeeping structure.
>
> Both should not be accessed at a rate that makes it necessary to read
> from the values directly. An accessor should be fine I think.

Yea its just a matter of cleaning things up.

> > But let me know if this patch doesn't achieve most of the cleanup you
> > wanted to see.
>
> We are getting there. I wonder if it is really necessary to pull
> xtime_cache, raw_time, total_sleep_time and timekeeping_suspended into
> the struct timeclock. I would prefer the semantics that the struct
> timekeeper / timeclock contains the internal values of the timekeeping
> code for the currently selected clock source. xtime is not clock
> specific.

Its not necessary. Although I do like the idea of pulling all the values
protected by the xtime_lock into one structure so its clear what we're
protecting. As it is now, jiffies, ntp state and a whole host of other
data is covered by it, so it would be a ways off until all that logic
could be dethreaded.

The other reason for putting those values into the timekeeping struct is
that they are actually fairly tightly connected with the clocksource
state. There is some redundancy: xtime_nsec stores the highres remainder
of xtime.tv_nsec, xtime_cache is also mostly duplicate, so hopefully we
can combine those once xtime is made static.

> For reference here is the current stack of patches I have on my disk.
> The stop_machine conversion to install a new clocksource is currently missing.
>
> PRELIMINARY PATCHES, USE AT YOUR OWN RISK.
> -------------------------------------------------------------------
>
> Subject: [PATCH] introduce timekeeping_leap_insert
>
> From: john stultz <[email protected]>
>
> ---
> include/linux/time.h | 1 +
> kernel/time/ntp.c | 7 ++-----
> kernel/time/timekeeping.c | 7 +++++++
> 3 files changed, 10 insertions(+), 5 deletions(-)
>
[snip]

I think this one is pretty easy and should be able to be submitted on
the sooner side.




> -------------------------------------------------------------------
>
> Subject: [PATCH] remove clocksource inline functions
>
> From: Martin Schwidefsky <[email protected]>
>
> Remove clocksource_read, clocksource_enable and clocksource_disable
> inline functions. No functional change.
>
> Cc: Ingo Molnar <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: john stultz <[email protected]>
> Cc: Daniel Walker <[email protected]>
> Signed-off-by: Martin Schwidefsky <[email protected]>
> ---
> include/linux/clocksource.h | 46 --------------------------------------------
> kernel/time/timekeeping.c | 32 +++++++++++++++++-------------
> 2 files changed, 18 insertions(+), 60 deletions(-)

[snip]

Again, the outstanding mult_orig fix (I need to pester Thomas or Andrew
to push that before 2.6.31 comes out) will collide with this, but
overall it looks ok.


> -------------------------------------------------------------------
>
> Subject: [PATCH] cleanup clocksource selection
>
> From: Martin Schwidefsky <[email protected]>
>
> Introduce clocksource_dequeue & clocksource_update and move spinlock
> calls. clocksource_update does nothing for GENERIC_TIME=n since
> change_clocksource does nothing as well.
>
> Cc: Ingo Molnar <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: john stultz <[email protected]>
> Cc: Daniel Walker <[email protected]>
> Signed-off-by: Martin Schwidefsky <[email protected]>

[snip]

Haven't had enough time to closely review or test this, but it looks
like an agreeable change.




> -------------------------------------------------------------------
>
> Subject: [PATCH] introduce struct timekeeper
>
> From: Martin Schwidefsky <[email protected]>
>
> Add struct timekeeper to keep all the internal values timekeeping.c
> needs in regard to the currently selected clock source. This moves
> all timekeeping related values out of the struct clocksource.
>
> Cc: Ingo Molnar <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: john stultz <[email protected]>
> Cc: Daniel Walker <[email protected]>
> Signed-off-by: Martin Schwidefsky <[email protected]>
> ---
> arch/ia64/kernel/time.c | 5
> arch/powerpc/kernel/time.c | 5
> arch/s390/kernel/time.c | 9 -
> arch/x86/kernel/vsyscall_64.c | 5
> include/linux/clocksource.h | 51 ---------
> kernel/time/timekeeping.c | 224 +++++++++++++++++++++++++++---------------
> 6 files changed, 161 insertions(+), 138 deletions(-)
>

[snip]

> +/* Structure holding internal timekeeping values. */
> +struct timekeeper {
> + struct clocksource *clock;
> + cycle_t cycle_interval;
> + u64 xtime_interval;
> + u32 raw_interval;
> + u64 xtime_nsec;
> + s64 ntp_error;
> + int xtime_shift;
> + int ntp_shift;
> +
> + /*
> + * The following is written at each timer interrupt
> + * Keep it in a different cache line to dirty no
> + * more than one cache line.
> + */
> + cycle_t cycle_last ____cacheline_aligned_in_smp;
> +};


The cacheline aligned bit probably needs to cover almost everything here
(not clock, not the shift values), as it will all be updated each tick.


[snip]
> @@ -564,8 +628,8 @@ static __always_inline int clocksource_b
> * Now calculate the error in (1 << look_ahead) ticks, but first
> * remove the single look ahead already included in the error.
> */
> - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
> - tick_error -= clock->xtime_interval >> 1;
> + tick_error = tick_length >> (timekeeper.ntp_shift + 1);
> + tick_error -= timekeeper.xtime_interval >> 1;
> error = ((error - tick_error) >> look_ahead) + tick_error;
>
> /* Finally calculate the adjustment shift value. */
> @@ -590,18 +654,18 @@ static __always_inline int clocksource_b
> * this is optimized for the most common adjustments of -1,0,1,
> * for other values we can do a bit more work.
> */
> -static void clocksource_adjust(s64 offset)
> +static void timekeeping_adjust(s64 offset)
> {
> - s64 error, interval = clock->cycle_interval;
> + s64 error, interval = timekeeper.cycle_interval;
> int adj;
>
> - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
> + error = timekeeper.ntp_error >> (timekeeper.ntp_shift - 1);
> if (error > interval) {
> error >>= 2;
> if (likely(error <= interval))
> adj = 1;
> else
> - adj = clocksource_bigadjust(error, &interval, &offset);
> + adj = timekeeping_bigadjust(error, &interval, &offset);
> } else if (error < -interval) {
> error >>= 2;
> if (likely(error >= -interval)) {
> @@ -609,15 +673,14 @@ static void clocksource_adjust(s64 offse
> interval = -interval;
> offset = -offset;
> } else
> - adj = clocksource_bigadjust(error, &interval, &offset);
> + adj = timekeeping_bigadjust(error, &interval, &offset);
> } else
> return;
>
> - clock->mult += adj;
> - clock->xtime_interval += interval;
> - clock->xtime_nsec -= offset;
> - clock->error -= (interval - offset) <<
> - (NTP_SCALE_SHIFT - clock->shift);
> + timekeeper.clock->mult += adj;
> + timekeeper.xtime_interval += interval;
> + timekeeper.xtime_nsec -= offset;
> + timekeeper.ntp_error -= (interval - offset) << timekeeper.ntp_shift;
> }
>
> /**
> @@ -627,53 +690,56 @@ static void clocksource_adjust(s64 offse
> */
> void update_wall_time(void)
> {
> + struct clocksource *clock;
> cycle_t offset;
>
> /* Make sure we're fully resumed: */
> if (unlikely(timekeeping_suspended))
> return;
>
> + clock = timekeeper.clock;
> #ifdef CONFIG_GENERIC_TIME
> - offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
> + offset = (clock->read(clock) - timekeeper.cycle_last) & clock->mask;
> #else
> - offset = clock->cycle_interval;
> + offset = timekeeper.cycle_interval;
> #endif
> - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
> + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.xtime_shift;
>
> /* normally this loop will run just once, however in the
> * case of lost or late ticks, it will accumulate correctly.
> */
> - while (offset >= clock->cycle_interval) {
> + while (offset >= timekeeper.cycle_interval) {
> /* accumulate one interval */
> - offset -= clock->cycle_interval;
> - clock->cycle_last += clock->cycle_interval;
> + offset -= timekeeper.cycle_interval;
> + timekeeper.cycle_last += timekeeper.cycle_interval;
>
> - clock->xtime_nsec += clock->xtime_interval;
> - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
> - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
> + timekeeper.xtime_nsec += timekeeper.xtime_interval;
> + if (timekeeper.xtime_nsec >= (u64)NSEC_PER_SEC << timekeeper.xtime_shift) {
> + timekeeper.xtime_nsec -= (u64)NSEC_PER_SEC << timekeeper.xtime_shift;
> xtime.tv_sec++;
> second_overflow();
> }
>
> - clock->raw_time.tv_nsec += clock->raw_interval;
> - if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
> - clock->raw_time.tv_nsec -= NSEC_PER_SEC;
> - clock->raw_time.tv_sec++;
> + raw_time.tv_nsec += timekeeper.raw_interval;
> + if (raw_time.tv_nsec >= NSEC_PER_SEC) {
> + raw_time.tv_nsec -= NSEC_PER_SEC;
> + raw_time.tv_sec++;
> }
>
> /* accumulate error between NTP and clock interval */
> - clock->error += tick_length;
> - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
> + timekeeper.ntp_error += tick_length;
> + timekeeper.ntp_error -= timekeeper.xtime_interval <<
> + timekeeper.ntp_shift;
> }


Just to be super careful, I'd probably separate the introduction of the
timekeeper code and the ntp_shift changes into separate patches. Just to
keep the amount of change to very complex code down to a more easily
review-able amount.


So yea, so I think we're pretty much seeing the same vision here!
There's still the outstanding cycles_last access by the TSC clocksource,
so let me know what you think about my idea for that I mentioned
earlier.

Looks good. If I can get some more time soon I'll try to put all of
these patches together for some testing and see what else I can add.

thanks again!
-john

2009-07-27 11:56:04

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC][patch 1/5] move clock source related code to clocksource.c

On Fri, 24 Jul 2009 17:08:47 -0700
john stultz <[email protected]> wrote:

> On Thu, 2009-07-23 at 12:52 +0200, Martin Schwidefsky wrote:
> > On Wed, 22 Jul 2009 17:28:20 -0700
> > john stultz <[email protected]> wrote:
> > > 2) Structure names aren't too great right now. Not sure timeclock is
> > > what I want to use, probably system_time or something. Will find/replace
> > > before the next revision is sent out.
> >
> > I've picked the name struct timekeeper.
>
> Nice. I like this name.

Agreed then.

> > > 5) The TSC clocksource uses cycles_last to avoid very slight skew issues
> > > (that otherwise would not be noticed). Not sure how to fix that if we're
> > > pulling cycles_last (which is purely timekeeping state) out of the
> > > clocksource. Will have to think of something.
> >
> > That is an ugly one. A similar thing exists in the s390 backend where I
> > want to reset the timekeeping to precise values after the clocksource
> > switch from jiffies. The proper solution probably is to allow
> > architectures to override the default clocksource. The jiffies
> > clocksource doesn't make any sense on s390.
>
> Yea. Still not sure what a good fix is here. We need some way for the
> TSC to check if its slightly behind another cpu. Alternatively we could
> create a flag (SLIGHTLY_UNSYNCED or something) where we then do the
> quick comparison in the timekeeping code instead of the clocksource read
> implementation? It would really only be useful for the TSC and would
> clutter the code up somewhat, so I'm not sure I really like it, but
> maybe that would work.

For my next version of the patch series I will leave the cycle_last in
the struct clocksource. Small, manageable steps, if we find a way to
move it then we can do that later on as well.

> > > 3) Once all arches are converted to using read_persistent_clock(), then
> > > the arch specific time initialization can be dropped. Removing the
> > > majority of direct xtime structure accesses.
> >
> > Only if the read_persistent_clock allows for a better resolution than
> > seconds. With my cputime accounting hat on: seconds are not good
> > enough..
>
> cputime accounting? read_persistent_clock is only used for time
> initialization. But yea, it could be extended to return a timespec.

Exactly!

> > > 4) Then once the remaining direct wall_to_monotonic and xtime accessors
> > > are moved to timekeeping.c we can make those both static and embed them
> > > into the core timekeeping structure.
> >
> > Both should not be accessed at a rate that makes it necessary to read
> > from the values directly. An accessor should be fine I think.
>
> Yea its just a matter of cleaning things up.

Nod.

> > > But let me know if this patch doesn't achieve most of the cleanup you
> > > wanted to see.
> >
> > We are getting there. I wonder if it is really necessary to pull
> > xtime_cache, raw_time, total_sleep_time and timekeeping_suspended into
> > the struct timeclock. I would prefer the semantics that the struct
> > timekeeper / timeclock contains the internal values of the timekeeping
> > code for the currently selected clock source. xtime is not clock
> > specific.
>
> Its not necessary. Although I do like the idea of pulling all the values
> protected by the xtime_lock into one structure so its clear what we're
> protecting. As it is now, jiffies, ntp state and a whole host of other
> data is covered by it, so it would be a ways off until all that logic
> could be dethreaded.

Yea, I guess that makes sense. Gives us something to do in the near
future. For now I do what is not too complicated to implement.

> The other reason for putting those values into the timekeeping struct is
> that they are actually fairly tightly connected with the clocksource
> state. There is some redundancy: xtime_nsec stores the highres remainder
> of xtime.tv_nsec, xtime_cache is also mostly duplicate, so hopefully we
> can combine those once xtime is made static.
>
> > For reference here is the current stack of patches I have on my disk.
> > The stop_machine conversion to install a new clocksource is currently missing.
> >
> > PRELIMINARY PATCHES, USE AT YOUR OWN RISK.
> > -------------------------------------------------------------------
> >
> > Subject: [PATCH] introduce timekeeping_leap_insert
> >
> > From: john stultz <[email protected]>
> >
> > ---
> > include/linux/time.h | 1 +
> > kernel/time/ntp.c | 7 ++-----
> > kernel/time/timekeeping.c | 7 +++++++
> > 3 files changed, 10 insertions(+), 5 deletions(-)
> >
> [snip]
>
> I think this one is pretty easy and should be able to be submitted on
> the sooner side.

A good question is who will upstream all this, will you take care of
it ?

> > -------------------------------------------------------------------
> >
> > Subject: [PATCH] remove clocksource inline functions
> >
> > From: Martin Schwidefsky <[email protected]>
> >
> > Remove clocksource_read, clocksource_enable and clocksource_disable
> > inline functions. No functional change.
> >
> > Cc: Ingo Molnar <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > Cc: john stultz <[email protected]>
> > Cc: Daniel Walker <[email protected]>
> > Signed-off-by: Martin Schwidefsky <[email protected]>
> > ---
> > include/linux/clocksource.h | 46 --------------------------------------------
> > kernel/time/timekeeping.c | 32 +++++++++++++++++-------------
> > 2 files changed, 18 insertions(+), 60 deletions(-)
>
> [snip]
>
> Again, the outstanding mult_orig fix (I need to pester Thomas or Andrew
> to push that before 2.6.31 comes out) will collide with this, but
> overall it looks ok.

The conflict with mult_orig is a small issue, I'll work around it. In
the end mult_orig will be gone anyway.

> > -------------------------------------------------------------------
> >
> > Subject: [PATCH] cleanup clocksource selection
> >
> > From: Martin Schwidefsky <[email protected]>
> >
> > Introduce clocksource_dequeue & clocksource_update and move spinlock
> > calls. clocksource_update does nothing for GENERIC_TIME=n since
> > change_clocksource does nothing as well.
> >
> > Cc: Ingo Molnar <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > Cc: john stultz <[email protected]>
> > Cc: Daniel Walker <[email protected]>
> > Signed-off-by: Martin Schwidefsky <[email protected]>
>
> [snip]
>
> Haven't had enough time to closely review or test this, but it looks
> like an agreeable change.

Don't know yet, the change to clocksource for the stop_machine rework
will be bigger than I though. The clocksource watchdog code changes the
rating of the TSC clock from interrupt context. I need to get out of
the interrupt context if I want to use stop_machine for the clocksource
update. Not nice..

> > +/* Structure holding internal timekeeping values. */
> > +struct timekeeper {
> > + struct clocksource *clock;
> > + cycle_t cycle_interval;
> > + u64 xtime_interval;
> > + u32 raw_interval;
> > + u64 xtime_nsec;
> > + s64 ntp_error;
> > + int xtime_shift;
> > + int ntp_shift;
> > +
> > + /*
> > + * The following is written at each timer interrupt
> > + * Keep it in a different cache line to dirty no
> > + * more than one cache line.
> > + */
> > + cycle_t cycle_last ____cacheline_aligned_in_smp;
> > +};
>
>
> The cacheline aligned bit probably needs to cover almost everything here
> (not clock, not the shift values), as it will all be updated each tick.

cycle_last is back in the clocksource structure for now.

> [snip]
> > @@ -564,8 +628,8 @@ static __always_inline int clocksource_b
> > * Now calculate the error in (1 << look_ahead) ticks, but first
> > * remove the single look ahead already included in the error.
> > */
> > - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
> > - tick_error -= clock->xtime_interval >> 1;
> > + tick_error = tick_length >> (timekeeper.ntp_shift + 1);
> > + tick_error -= timekeeper.xtime_interval >> 1;
> > error = ((error - tick_error) >> look_ahead) + tick_error;
> >
> > /* Finally calculate the adjustment shift value. */
> > @@ -590,18 +654,18 @@ static __always_inline int clocksource_b
> > * this is optimized for the most common adjustments of -1,0,1,
> > * for other values we can do a bit more work.
> > */
> > -static void clocksource_adjust(s64 offset)
> > +static void timekeeping_adjust(s64 offset)
> > {
> > - s64 error, interval = clock->cycle_interval;
> > + s64 error, interval = timekeeper.cycle_interval;
> > int adj;
> >
> > - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
> > + error = timekeeper.ntp_error >> (timekeeper.ntp_shift - 1);
> > if (error > interval) {
> > error >>= 2;
> > if (likely(error <= interval))
> > adj = 1;
> > else
> > - adj = clocksource_bigadjust(error, &interval, &offset);
> > + adj = timekeeping_bigadjust(error, &interval, &offset);
> > } else if (error < -interval) {
> > error >>= 2;
> > if (likely(error >= -interval)) {
> > @@ -609,15 +673,14 @@ static void clocksource_adjust(s64 offse
> > interval = -interval;
> > offset = -offset;
> > } else
> > - adj = clocksource_bigadjust(error, &interval, &offset);
> > + adj = timekeeping_bigadjust(error, &interval, &offset);
> > } else
> > return;
> >
> > - clock->mult += adj;
> > - clock->xtime_interval += interval;
> > - clock->xtime_nsec -= offset;
> > - clock->error -= (interval - offset) <<
> > - (NTP_SCALE_SHIFT - clock->shift);
> > + timekeeper.clock->mult += adj;
> > + timekeeper.xtime_interval += interval;
> > + timekeeper.xtime_nsec -= offset;
> > + timekeeper.ntp_error -= (interval - offset) << timekeeper.ntp_shift;
> > }
> >
> > /**
> > @@ -627,53 +690,56 @@ static void clocksource_adjust(s64 offse
> > */
> > void update_wall_time(void)
> > {
> > + struct clocksource *clock;
> > cycle_t offset;
> >
> > /* Make sure we're fully resumed: */
> > if (unlikely(timekeeping_suspended))
> > return;
> >
> > + clock = timekeeper.clock;
> > #ifdef CONFIG_GENERIC_TIME
> > - offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
> > + offset = (clock->read(clock) - timekeeper.cycle_last) & clock->mask;
> > #else
> > - offset = clock->cycle_interval;
> > + offset = timekeeper.cycle_interval;
> > #endif
> > - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
> > + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.xtime_shift;
> >
> > /* normally this loop will run just once, however in the
> > * case of lost or late ticks, it will accumulate correctly.
> > */
> > - while (offset >= clock->cycle_interval) {
> > + while (offset >= timekeeper.cycle_interval) {
> > /* accumulate one interval */
> > - offset -= clock->cycle_interval;
> > - clock->cycle_last += clock->cycle_interval;
> > + offset -= timekeeper.cycle_interval;
> > + timekeeper.cycle_last += timekeeper.cycle_interval;
> >
> > - clock->xtime_nsec += clock->xtime_interval;
> > - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
> > - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
> > + timekeeper.xtime_nsec += timekeeper.xtime_interval;
> > + if (timekeeper.xtime_nsec >= (u64)NSEC_PER_SEC << timekeeper.xtime_shift) {
> > + timekeeper.xtime_nsec -= (u64)NSEC_PER_SEC << timekeeper.xtime_shift;
> > xtime.tv_sec++;
> > second_overflow();
> > }
> >
> > - clock->raw_time.tv_nsec += clock->raw_interval;
> > - if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
> > - clock->raw_time.tv_nsec -= NSEC_PER_SEC;
> > - clock->raw_time.tv_sec++;
> > + raw_time.tv_nsec += timekeeper.raw_interval;
> > + if (raw_time.tv_nsec >= NSEC_PER_SEC) {
> > + raw_time.tv_nsec -= NSEC_PER_SEC;
> > + raw_time.tv_sec++;
> > }
> >
> > /* accumulate error between NTP and clock interval */
> > - clock->error += tick_length;
> > - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
> > + timekeeper.ntp_error += tick_length;
> > + timekeeper.ntp_error -= timekeeper.xtime_interval <<
> > + timekeeper.ntp_shift;
> > }
>
>
> Just to be super careful, I'd probably separate the introduction of the
> timekeeper code and the ntp_shift changes into separate patches. Just to
> keep the amount of change to very complex code down to a more easily
> review-able amount.

Done, I now have multiple patches for the cleanup. I will resend soon.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.