Accumulating one tick at a time works well unless we're using NOHZ. Then
it can be an issue, since we may have to run through the loop a few
thousand times, which can increase the latency caused by the timer interrupt.
The current solution was to accumulate in half-second intervals with
NOHZ. This kept the number of loops down; however, it did slightly change
how we make NTP adjustments. While not an issue for NTPd users, as NTPd
makes adjustments over a longer period of time, other adjtimex() users
have noticed the half-second granularity with which we can apply
frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ it either gets
no correction, or a 50us correction.
Now, there will always be some granularity error for applying frequency
corrections. However, users sensitive to this error have seen a
50-500x increase with NOHZ compared to running without NOHZ.
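To make the numbers above concrete: with the old NTP_INTERVAL_FREQ of 2,
accumulation happens in NSEC_PER_SEC/2 = 500ms chunks, so a slew can only
be charged in half-second units. A quick back-of-the-envelope check (a
standalone sketch, not kernel code; the constants come from the example
above):

#include <stdio.h>

int main(void)
{
	double ppm = 100e-6;         /* 100 parts per million */
	double requested = 20e-3;    /* slew requested for 20ms */
	double granularity = 0.5;    /* NOHZ accumulation interval, 500ms */

	/* ideal: 100ppm applied over 20ms steers the clock by 2us */
	printf("ideal correction: %.1f us\n", ppm * requested * 1e6);
	/* with half-second granularity the slew runs for 0s or 0.5s */
	printf("rounded down:     %.1f us\n", 0.0);
	printf("rounded up:       %.1f us\n", ppm * granularity * 1e6);
	return 0;
}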
So I figured I'd try an approach other than simply increasing the
interval. My approach is to consume the time interval logarithmically.
This reduces the number of times through the loop, keeping
latency down, while still preserving the original granularity error for
adjtimex() changes.
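As a minimal illustration of the idea (a standalone sketch with invented
names and constants, not the kernel implementation): consume the offset
in power-of-two multiples of the base interval, so a few thousand pending
intervals need only a handful of passes.

#include <stdio.h>

typedef unsigned long long cycle_t;

/* Consume one 2^shift-sized chunk of intervals, if the offset covers it */
static cycle_t accumulate_chunk(cycle_t offset, cycle_t interval, int shift)
{
	if (offset < (interval << shift))
		return offset;	/* chunk too big; caller will try a smaller one */
	return offset - (interval << shift);
}

int main(void)
{
	cycle_t interval = 1000;	/* cycles per "tick" (invented) */
	cycle_t offset = 3500000;	/* 3500 pending ticks */
	int shift = 0, steps = 0;

	/* largest power-of-two multiple of interval that fits in offset */
	while ((interval << (shift + 1)) <= offset)
		shift++;

	while (offset >= interval) {
		offset = accumulate_chunk(offset, interval, shift);
		shift--;
		steps++;
	}
	printf("%d passes instead of 3500; %llu cycles left over\n",
	       steps, offset);
	return 0;
}

Each pass either consumes a chunk or halves the chunk size, which is
where the O(log) bound comes from.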
Further, this change allows us to remove the xtime_cache code (patch to
follow), as xtime is always within one tick of the current time, instead
of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in the
Red Hat MRG releases for a while without issue, but I've reworked this
version to be even more careful about avoiding possible overflows if the
shift value gets too large.
Since this is not the most trivial code, and it's slightly different than
what's been tested for a while, it would be good to get this into some
trees for testing. Be it -tip or -mm, either would work. If there are no
problems it could be a 2.6.33 or 2.6.34 item.
Any comments or feedback would be appreciated!
Signed-off-by: John Stultz <[email protected]>
diff --git a/include/linux/timex.h b/include/linux/timex.h
index e6967d1..0c0ef7d 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -261,11 +261,7 @@ static inline int ntp_synced(void)
#define NTP_SCALE_SHIFT 32
-#ifdef CONFIG_NO_HZ
-#define NTP_INTERVAL_FREQ (2)
-#else
#define NTP_INTERVAL_FREQ (HZ)
-#endif
#define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46f..4cc5656 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset)
timekeeper.ntp_error_shift;
}
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for an O(log)
+ * accumulation loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+
+ /* If the offset is smaller than a shifted interval, do nothing */
+ if (offset < timekeeper.cycle_interval << shift)
+ return offset;
+
+ /* accumulate one shifted interval */
+ offset -= timekeeper.cycle_interval << shift;
+ timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+
+ timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+ while (timekeeper.xtime_nsec >= nsecps) {
+ timekeeper.xtime_nsec -= nsecps;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ /* accumulate into raw time */
+ raw_time.tv_nsec += timekeeper.raw_interval << shift;
+ while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+ raw_time.tv_nsec -= NSEC_PER_SEC;
+ raw_time.tv_sec++;
+ }
+
+ /* accumulate error between NTP and clock interval */
+ timekeeper.ntp_error += tick_length << shift;
+ timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ (timekeeper.ntp_error_shift + shift);
+
+ return offset;
+}
+
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -731,6 +776,7 @@ void update_wall_time(void)
struct clocksource *clock;
cycle_t offset;
u64 nsecs;
+ int shift = 0, maxshift;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -744,33 +790,22 @@ void update_wall_time(void)
#endif
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
- /* normally this loop will run just once, however in the
- * case of lost or late ticks, it will accumulate correctly.
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
*/
+ shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+ shift = min(shift, maxshift);
while (offset >= timekeeper.cycle_interval) {
- u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
-
- /* accumulate one interval */
- offset -= timekeeper.cycle_interval;
- clock->cycle_last += timekeeper.cycle_interval;
-
- timekeeper.xtime_nsec += timekeeper.xtime_interval;
- if (timekeeper.xtime_nsec >= nsecps) {
- timekeeper.xtime_nsec -= nsecps;
- xtime.tv_sec++;
- second_overflow();
- }
-
- raw_time.tv_nsec += timekeeper.raw_interval;
- if (raw_time.tv_nsec >= NSEC_PER_SEC) {
- raw_time.tv_nsec -= NSEC_PER_SEC;
- raw_time.tv_sec++;
- }
-
- /* accumulate error between NTP and clock interval */
- timekeeper.ntp_error += tick_length;
- timekeeper.ntp_error -= timekeeper.xtime_interval <<
- timekeeper.ntp_error_shift;
+ offset = logarithmic_accumulation(offset, shift);
+ shift--;
}
/* correct the clock when NTP error is too big */
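As a sanity check on the shift/maxshift computation above, here is a
standalone sketch with invented values (the kernel's ilog2() is stood in
for by a trivial loop, and a 64-bit tick_length is assumed):

#include <stdio.h>

/* Trivial stand-in for the kernel's ilog2(): index of the top set bit */
static int ilog2_u64(unsigned long long v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

int main(void)
{
	unsigned long long offset = 3500000;		/* invented */
	unsigned long long cycle_interval = 1000;	/* invented */
	unsigned long long tick_length = 1ULL << 32;	/* assumed value */
	int shift, maxshift;

	shift = ilog2_u64(offset) - ilog2_u64(cycle_interval);
	if (shift < 0)
		shift = 0;
	/* one less than the shift at which tick_length << shift overflows */
	maxshift = (64 - (ilog2_u64(tick_length) + 1)) - 1;
	if (shift > maxshift)
		shift = maxshift;

	printf("shift=%d maxshift=%d\n", shift, maxshift);
	return 0;
}

With these values the initial shift comes out to 12 (3500 intervals
pending), comfortably under the overflow bound of 30; the first loop
iteration may find the 2^12 chunk too large and simply fall through to
2^11, which is what the early return in logarithmic_accumulation() is for.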
With the prior logarithmic time accumulation patch, xtime will now
always be within one "tick" of the current time, instead of possibly
half a second off.
This removes the need for the xtime_cache value, which always stored the
time at the last interrupt, so this patch cleans that up, removing the
xtime_cache-related code.
This is a bit simpler, but still could use some wider testing.
Any comments or feedback would be appreciated!
thanks
-john
Signed-off-by: John Stultz <[email protected]>
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469..2ef4fe2 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
write_seqlock_irq(&xtime_lock);
wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
- update_xtime_cache(0);
write_sequnlock_irq(&xtime_lock);
clock_was_set();
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4cc5656..a1513d0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,13 +164,6 @@ struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-static struct timespec xtime_cache __attribute__ ((aligned (16)));
-void update_xtime_cache(u64 nsec)
-{
- xtime_cache = xtime;
- timespec_add_ns(&xtime_cache, nsec);
-}
-
/* must hold xtime_lock */
void timekeeping_leap_insert(int leapsecond)
{
@@ -331,8 +324,6 @@ int do_settimeofday(struct timespec *tv)
xtime = *tv;
- update_xtime_cache(0);
-
timekeeper.ntp_error = 0;
ntp_clear();
@@ -547,7 +538,6 @@ void __init timekeeping_init(void)
}
set_normalized_timespec(&wall_to_monotonic,
-boot.tv_sec, -boot.tv_nsec);
- update_xtime_cache(0);
total_sleep_time.tv_sec = 0;
total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -581,7 +571,6 @@ static int timekeeping_resume(struct sys_device *dev)
wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
total_sleep_time = timespec_add_safe(total_sleep_time, ts);
}
- update_xtime_cache(0);
/* re-base the last cycle value */
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
timekeeper.ntp_error = 0;
@@ -721,7 +710,6 @@ static void timekeeping_adjust(s64 offset)
timekeeper.ntp_error_shift;
}
-
/**
* logarithmic_accumulation - shifted accumulation of cycles
*
@@ -765,7 +753,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
return offset;
}
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -775,7 +762,6 @@ void update_wall_time(void)
{
struct clocksource *clock;
cycle_t offset;
- u64 nsecs;
int shift = 0, maxshift;
/* Make sure we're fully resumed: */
@@ -841,9 +827,6 @@ void update_wall_time(void)
timekeeper.ntp_error += timekeeper.xtime_nsec <<
timekeeper.ntp_error_shift;
- nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
- update_xtime_cache(nsecs);
-
/* check to see if there is a new clocksource to use */
update_vsyscall(&xtime, timekeeper.clock);
}
@@ -880,13 +863,13 @@ void monotonic_to_bootbased(struct timespec *ts)
unsigned long get_seconds(void)
{
- return xtime_cache.tv_sec;
+ return xtime.tv_sec;
}
EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- return xtime_cache;
+ return xtime;
}
struct timespec current_kernel_time(void)
@@ -896,8 +879,7 @@ struct timespec current_kernel_time(void)
do {
seq = read_seqbegin(&xtime_lock);
-
- now = xtime_cache;
+ now = xtime;
} while (read_seqretry(&xtime_lock, seq));
return now;
@@ -911,8 +893,7 @@ struct timespec get_monotonic_coarse(void)
do {
seq = read_seqbegin(&xtime_lock);
-
- now = xtime_cache;
+ now = xtime;
mono = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
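One thing worth noting about the readers above: current_kernel_time() and
get_monotonic_coarse() rely on the xtime_lock seqlock so that xtime (now
read directly, rather than via xtime_cache) is sampled consistently
against concurrent updates. A toy userspace model of that read/retry
pattern (illustrative only; the kernel's seqlock does more):

#include <stdatomic.h>
#include <stdio.h>

/* Toy sequence counter mirroring read_seqbegin()/read_seqretry() */
static _Atomic unsigned int seq;
static long xtime_sec, xtime_nsec;

static void write_time(long s, long ns)
{
	atomic_fetch_add(&seq, 1);	/* odd: write in progress */
	xtime_sec = s;
	xtime_nsec = ns;
	atomic_fetch_add(&seq, 1);	/* even: stable again */
}

static void read_time(long *s, long *ns)
{
	unsigned int start;

	do {
		start = atomic_load(&seq);
		*s = xtime_sec;
		*ns = xtime_nsec;
		/* retry if a write was in flight or completed meanwhile */
	} while ((start & 1) || atomic_load(&seq) != start);
}

int main(void)
{
	long s, ns;

	write_time(1254500000, 123456789);
	read_time(&s, &ns);
	printf("%ld.%09ld\n", s, ns);
	return 0;
}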
On Fri, 2 Oct 2009, john stultz wrote:
> [ ... full patch description and diff quoted above; trimmed ... ]
There are several (6) trailing whitespace errors that checkpatch exposes,
but other than that:
Signed-off-by: John Kacur <[email protected]>
Commit-ID: 8a45af8eb809a0d3512877296bd128e73fcee379
Gitweb: http://git.kernel.org/tip/8a45af8eb809a0d3512877296bd128e73fcee379
Author: john stultz <[email protected]>
AuthorDate: Fri, 2 Oct 2009 16:17:53 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Sun, 4 Oct 2009 19:31:39 +0200
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase the latency caused by the
timer interrupt.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down; however, it did
slightly change how we make NTP adjustments. While not an issue
for NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ it either
gets no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However, users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try an approach other than simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the
loop, keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the Red Hat MRG releases for a while without issue, but I've
reworked this version to be even more careful about avoiding
possible overflows if the shift value gets too large.
Signed-off-by: John Stultz <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Cc: John Kacur <[email protected]>
Cc: Clark Williams <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Andrew Morton <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/timex.h | 4 --
kernel/time/timekeeping.c | 85 +++++++++++++++++++++++++++++++-------------
2 files changed, 60 insertions(+), 29 deletions(-)
diff --git a/include/linux/timex.h b/include/linux/timex.h
index e6967d1..0c0ef7d 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -261,11 +261,7 @@ static inline int ntp_synced(void)
#define NTP_SCALE_SHIFT 32
-#ifdef CONFIG_NO_HZ
-#define NTP_INTERVAL_FREQ (2)
-#else
#define NTP_INTERVAL_FREQ (HZ)
-#endif
#define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46f..5fdd78e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset)
timekeeper.ntp_error_shift;
}
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for an O(log)
+ * accumulation loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+
+ /* If the offset is smaller than a shifted interval, do nothing */
+ if (offset < timekeeper.cycle_interval << shift)
+ return offset;
+
+ /* Accumulate one shifted interval */
+ offset -= timekeeper.cycle_interval << shift;
+ timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+
+ timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+ while (timekeeper.xtime_nsec >= nsecps) {
+ timekeeper.xtime_nsec -= nsecps;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ /* Accumulate into raw time */
+ raw_time.tv_nsec += timekeeper.raw_interval << shift;
+ while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+ raw_time.tv_nsec -= NSEC_PER_SEC;
+ raw_time.tv_sec++;
+ }
+
+ /* Accumulate error between NTP and clock interval */
+ timekeeper.ntp_error += tick_length << shift;
+ timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ (timekeeper.ntp_error_shift + shift);
+
+ return offset;
+}
+
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -731,6 +776,7 @@ void update_wall_time(void)
struct clocksource *clock;
cycle_t offset;
u64 nsecs;
+ int shift = 0, maxshift;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -744,33 +790,22 @@ void update_wall_time(void)
#endif
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
- /* normally this loop will run just once, however in the
- * case of lost or late ticks, it will accumulate correctly.
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
*/
+ shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+ shift = min(shift, maxshift);
while (offset >= timekeeper.cycle_interval) {
- u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
-
- /* accumulate one interval */
- offset -= timekeeper.cycle_interval;
- clock->cycle_last += timekeeper.cycle_interval;
-
- timekeeper.xtime_nsec += timekeeper.xtime_interval;
- if (timekeeper.xtime_nsec >= nsecps) {
- timekeeper.xtime_nsec -= nsecps;
- xtime.tv_sec++;
- second_overflow();
- }
-
- raw_time.tv_nsec += timekeeper.raw_interval;
- if (raw_time.tv_nsec >= NSEC_PER_SEC) {
- raw_time.tv_nsec -= NSEC_PER_SEC;
- raw_time.tv_sec++;
- }
-
- /* accumulate error between NTP and clock interval */
- timekeeper.ntp_error += tick_length;
- timekeeper.ntp_error -= timekeeper.xtime_interval <<
- timekeeper.ntp_error_shift;
+ offset = logarithmic_accumulation(offset, shift);
+ shift--;
}
/* correct the clock when NTP error is too big */
Commit-ID: 5335c1c371aa32fcef97c4979f04b065772194ab
Gitweb: http://git.kernel.org/tip/5335c1c371aa32fcef97c4979f04b065772194ab
Author: john stultz <[email protected]>
AuthorDate: Fri, 2 Oct 2009 16:24:15 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Sun, 4 Oct 2009 19:31:39 +0200
time: Remove xtime_cache
With the prior logarithmic time accumulation patch, xtime will now
always be within one "tick" of the current time, instead of
possibly half a second off.
This removes the need for the xtime_cache value, which always
stored the time at the last interrupt, so this patch cleans that
up, removing the xtime_cache-related code.
This is a bit simpler, but still could use some wider testing.
Signed-off-by: John Stultz <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Cc: John Kacur <[email protected]>
Cc: Clark Williams <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Andrew Morton <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/time.c | 1 -
kernel/time/timekeeping.c | 27 ++++-----------------------
2 files changed, 4 insertions(+), 24 deletions(-)
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469..2ef4fe2 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
write_seqlock_irq(&xtime_lock);
wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
- update_xtime_cache(0);
write_sequnlock_irq(&xtime_lock);
clock_was_set();
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fdd78e..96b3f0d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,13 +164,6 @@ struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-static struct timespec xtime_cache __attribute__ ((aligned (16)));
-void update_xtime_cache(u64 nsec)
-{
- xtime_cache = xtime;
- timespec_add_ns(&xtime_cache, nsec);
-}
-
/* must hold xtime_lock */
void timekeeping_leap_insert(int leapsecond)
{
@@ -331,8 +324,6 @@ int do_settimeofday(struct timespec *tv)
xtime = *tv;
- update_xtime_cache(0);
-
timekeeper.ntp_error = 0;
ntp_clear();
@@ -547,7 +538,6 @@ void __init timekeeping_init(void)
}
set_normalized_timespec(&wall_to_monotonic,
-boot.tv_sec, -boot.tv_nsec);
- update_xtime_cache(0);
total_sleep_time.tv_sec = 0;
total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -581,7 +571,6 @@ static int timekeeping_resume(struct sys_device *dev)
wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
total_sleep_time = timespec_add_safe(total_sleep_time, ts);
}
- update_xtime_cache(0);
/* re-base the last cycle value */
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
timekeeper.ntp_error = 0;
@@ -721,7 +710,6 @@ static void timekeeping_adjust(s64 offset)
timekeeper.ntp_error_shift;
}
-
/**
* logarithmic_accumulation - shifted accumulation of cycles
*
@@ -765,7 +753,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
return offset;
}
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -775,7 +762,6 @@ void update_wall_time(void)
{
struct clocksource *clock;
cycle_t offset;
- u64 nsecs;
int shift = 0, maxshift;
/* Make sure we're fully resumed: */
@@ -841,9 +827,6 @@ void update_wall_time(void)
timekeeper.ntp_error += timekeeper.xtime_nsec <<
timekeeper.ntp_error_shift;
- nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
- update_xtime_cache(nsecs);
-
/* check to see if there is a new clocksource to use */
update_vsyscall(&xtime, timekeeper.clock);
}
@@ -880,13 +863,13 @@ void monotonic_to_bootbased(struct timespec *ts)
unsigned long get_seconds(void)
{
- return xtime_cache.tv_sec;
+ return xtime.tv_sec;
}
EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- return xtime_cache;
+ return xtime;
}
struct timespec current_kernel_time(void)
@@ -896,8 +879,7 @@ struct timespec current_kernel_time(void)
do {
seq = read_seqbegin(&xtime_lock);
-
- now = xtime_cache;
+ now = xtime;
} while (read_seqretry(&xtime_lock, seq));
return now;
@@ -911,8 +893,7 @@ struct timespec get_monotonic_coarse(void)
do {
seq = read_seqbegin(&xtime_lock);
-
- now = xtime_cache;
+ now = xtime;
mono = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
* John Kacur <[email protected]> wrote:
>
> On Fri, 2 Oct 2009, john stultz wrote:
> > [ ... full patch description and diff quoted above; trimmed ... ]
>
> There are several (6) trailing whitespace errors that checkpatch exposes,
> but other than that:
Yep, I fixed those already (also the few inconsistent comment
capitalizations).
The two patches held up fine in -tip testing so far.
> Signed-off-by: John Kacur <[email protected]>
Thanks - I changed that to Reviewed-by (SoB is for being part of the
patch-flow) and added it to the two commits.
Ingo
Commit-ID: a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601
Gitweb: http://git.kernel.org/tip/a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601
Author: john stultz <[email protected]>
AuthorDate: Fri, 2 Oct 2009 16:17:53 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 5 Oct 2009 13:51:48 +0200
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase the latency caused by the
timer interrupt.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down; however, it did
slightly change how we make NTP adjustments. While not an issue
for NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ it either
gets no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However, users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try an approach other than simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the
loop, keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the Red Hat MRG releases for a while without issue, but I've
reworked this version to be even more careful about avoiding
possible overflows if the shift value gets too large.
Signed-off-by: John Stultz <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Reviewed-by: John Kacur <[email protected]>
Cc: Clark Williams <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Andrew Morton <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/timex.h | 4 --
kernel/time/timekeeping.c | 85 +++++++++++++++++++++++++++++++-------------
2 files changed, 60 insertions(+), 29 deletions(-)
diff --git a/include/linux/timex.h b/include/linux/timex.h
index e6967d1..0c0ef7d 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -261,11 +261,7 @@ static inline int ntp_synced(void)
#define NTP_SCALE_SHIFT 32
-#ifdef CONFIG_NO_HZ
-#define NTP_INTERVAL_FREQ (2)
-#else
#define NTP_INTERVAL_FREQ (HZ)
-#endif
#define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46f..5fdd78e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset)
timekeeper.ntp_error_shift;
}
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for an O(log)
+ * accumulation loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+
+ /* If the offset is smaller than a shifted interval, do nothing */
+ if (offset < timekeeper.cycle_interval << shift)
+ return offset;
+
+ /* Accumulate one shifted interval */
+ offset -= timekeeper.cycle_interval << shift;
+ timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+
+ timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+ while (timekeeper.xtime_nsec >= nsecps) {
+ timekeeper.xtime_nsec -= nsecps;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ /* Accumulate into raw time */
+ raw_time.tv_nsec += timekeeper.raw_interval << shift;
+ while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+ raw_time.tv_nsec -= NSEC_PER_SEC;
+ raw_time.tv_sec++;
+ }
+
+ /* Accumulate error between NTP and clock interval */
+ timekeeper.ntp_error += tick_length << shift;
+ timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ (timekeeper.ntp_error_shift + shift);
+
+ return offset;
+}
+
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -731,6 +776,7 @@ void update_wall_time(void)
struct clocksource *clock;
cycle_t offset;
u64 nsecs;
+ int shift = 0, maxshift;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -744,33 +790,22 @@ void update_wall_time(void)
#endif
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
- /* normally this loop will run just once, however in the
- * case of lost or late ticks, it will accumulate correctly.
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
*/
+ shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+ shift = min(shift, maxshift);
while (offset >= timekeeper.cycle_interval) {
- u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
-
- /* accumulate one interval */
- offset -= timekeeper.cycle_interval;
- clock->cycle_last += timekeeper.cycle_interval;
-
- timekeeper.xtime_nsec += timekeeper.xtime_interval;
- if (timekeeper.xtime_nsec >= nsecps) {
- timekeeper.xtime_nsec -= nsecps;
- xtime.tv_sec++;
- second_overflow();
- }
-
- raw_time.tv_nsec += timekeeper.raw_interval;
- if (raw_time.tv_nsec >= NSEC_PER_SEC) {
- raw_time.tv_nsec -= NSEC_PER_SEC;
- raw_time.tv_sec++;
- }
-
- /* accumulate error between NTP and clock interval */
- timekeeper.ntp_error += tick_length;
- timekeeper.ntp_error -= timekeeper.xtime_interval <<
- timekeeper.ntp_error_shift;
+ offset = logarithmic_accumulation(offset, shift);
+ shift--;
}
/* correct the clock when NTP error is too big */
Commit-ID: 7bc7d637452383d56ba4368d4336b0dde1bb476d
Gitweb: http://git.kernel.org/tip/7bc7d637452383d56ba4368d4336b0dde1bb476d
Author: john stultz <[email protected]>
AuthorDate: Fri, 2 Oct 2009 16:24:15 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 5 Oct 2009 13:52:02 +0200
time: Remove xtime_cache
With the prior logarithmic time accumulation patch, xtime will now
always be within one "tick" of the current time, instead of
possibly half a second off.
This removes the need for the xtime_cache value, which always
stored the time at the last interrupt, so this patch cleans that
up, removing the xtime_cache-related code.
This is a bit simpler, but still could use some wider testing.
Signed-off-by: John Stultz <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Reviewed-by: John Kacur <[email protected]>
Cc: Clark Williams <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Andrew Morton <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/time.c | 1 -
kernel/time/timekeeping.c | 27 ++++-----------------------
2 files changed, 4 insertions(+), 24 deletions(-)
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469..2ef4fe2 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
write_seqlock_irq(&xtime_lock);
wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
- update_xtime_cache(0);
write_sequnlock_irq(&xtime_lock);
clock_was_set();
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fdd78e..96b3f0d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,13 +164,6 @@ struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-static struct timespec xtime_cache __attribute__ ((aligned (16)));
-void update_xtime_cache(u64 nsec)
-{
- xtime_cache = xtime;
- timespec_add_ns(&xtime_cache, nsec);
-}
-
/* must hold xtime_lock */
void timekeeping_leap_insert(int leapsecond)
{
@@ -331,8 +324,6 @@ int do_settimeofday(struct timespec *tv)
xtime = *tv;
- update_xtime_cache(0);
-
timekeeper.ntp_error = 0;
ntp_clear();
@@ -547,7 +538,6 @@ void __init timekeeping_init(void)
}
set_normalized_timespec(&wall_to_monotonic,
-boot.tv_sec, -boot.tv_nsec);
- update_xtime_cache(0);
total_sleep_time.tv_sec = 0;
total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -581,7 +571,6 @@ static int timekeeping_resume(struct sys_device *dev)
wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
total_sleep_time = timespec_add_safe(total_sleep_time, ts);
}
- update_xtime_cache(0);
/* re-base the last cycle value */
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
timekeeper.ntp_error = 0;
@@ -721,7 +710,6 @@ static void timekeeping_adjust(s64 offset)
timekeeper.ntp_error_shift;
}
-
/**
* logarithmic_accumulation - shifted accumulation of cycles
*
@@ -765,7 +753,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
return offset;
}
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -775,7 +762,6 @@ void update_wall_time(void)
{
struct clocksource *clock;
cycle_t offset;
- u64 nsecs;
int shift = 0, maxshift;
/* Make sure we're fully resumed: */
@@ -841,9 +827,6 @@ void update_wall_time(void)
timekeeper.ntp_error += timekeeper.xtime_nsec <<
timekeeper.ntp_error_shift;
- nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
- update_xtime_cache(nsecs);
-
/* check to see if there is a new clocksource to use */
update_vsyscall(&xtime, timekeeper.clock);
}
@@ -880,13 +863,13 @@ void monotonic_to_bootbased(struct timespec *ts)
unsigned long get_seconds(void)
{
- return xtime_cache.tv_sec;
+ return xtime.tv_sec;
}
EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- return xtime_cache;
+ return xtime;
}
struct timespec current_kernel_time(void)
@@ -896,8 +879,7 @@ struct timespec current_kernel_time(void)
do {
seq = read_seqbegin(&xtime_lock);
-
- now = xtime_cache;
+ now = xtime;
} while (read_seqretry(&xtime_lock, seq));
return now;
@@ -911,8 +893,7 @@ struct timespec get_monotonic_coarse(void)
do {
seq = read_seqbegin(&xtime_lock);
-
- now = xtime_cache;
+ now = xtime;
mono = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
On Mon, 5 Oct 2009 12:43:37 +0200 (CEST) John Kacur wrote:
>
> On Fri, 2 Oct 2009, john stultz wrote:
> > [ ... patch description and start of the diff quoted above; trimmed ... ]
> > +
> > +/**
"/**" means "beginning of kernel-doc", so those function parameters
need to be described here also...
> > + * logarithmic_accumulation - shifted accumulation of cycles
> > + *
> > + * This function accumulates a shifted interval of cycles into
> > + * a shifted interval of nanoseconds. Allows for an O(log)
> > + * accumulation loop.
> > + *
> > + * Returns the unconsumed cycles.
> > + */
> > [ ... remainder of the patch quoted above; trimmed ... ]
>
> There are several (6) trailing whitespace errors that checkpatch exposes,
> but other than that:
> Signed-off-by: John Kacur <[email protected]>
---
~Randy
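For what it's worth, a kernel-doc header along the lines Randy suggests
might look like this (a sketch; the parameter descriptions are invented,
not taken from a committed fix):

/**
 * logarithmic_accumulation - shifted accumulation of cycles
 * @offset:	cycles not yet accumulated into the timekeeper
 * @shift:	log2 of the number of cycle_intervals to accumulate at once
 *
 * This function accumulates a shifted interval of cycles into
 * a shifted interval of nanoseconds, allowing for an O(log)
 * accumulation loop.
 *
 * Returns the unconsumed cycles.
 */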
@John Stultz
I backported your patch to 2.6.31.2-rt13, could you please look it over
and see if it looks okay to you?
@Thomas
Could you please consider queuing this up for -rt14?
Since John submitted it upstream, we will be able to drop it again in the
future.
Thanks
From 8090f669e58901c1b0c5e8bac4160eaaf7990f4d Mon Sep 17 00:00:00 2001
From: tip-bot for john stultz <[email protected]>
Date: Mon, 5 Oct 2009 11:54:38 +0000
Subject: [PATCH] time: Implement logarithmic time accumulation
Commit-ID: a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601
Gitweb: http://git.kernel.org/tip/a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601
Author: john stultz <[email protected]>
AuthorDate: Fri, 2 Oct 2009 16:17:53 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 5 Oct 2009 13:51:48 +0200
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase the latency caused by the
timer interrupt.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down; however, it did
slightly change how we make NTP adjustments. While not an issue
for NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ it either
gets no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However, users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try an approach other than simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the
loop, keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the Red Hat MRG releases for a while without issue, but I've
reworked this version to be even more careful about avoiding
possible overflows if the shift value gets too large.
Signed-off-by: John Stultz <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Reviewed-by: John Kacur <[email protected]>
Cc: Clark Williams <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Andrew Morton <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: John Kacur <[email protected]>
---
include/linux/timex.h | 4 --
kernel/time/timekeeping.c | 83 +++++++++++++++++++++++++++++++++------------
2 files changed, 61 insertions(+), 26 deletions(-)
diff --git a/include/linux/timex.h b/include/linux/timex.h
index e6967d1..0c0ef7d 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -261,11 +261,7 @@ static inline int ntp_synced(void)
#define NTP_SCALE_SHIFT 32
-#ifdef CONFIG_NO_HZ
-#define NTP_INTERVAL_FREQ (2)
-#else
#define NTP_INTERVAL_FREQ (HZ)
-#endif
#define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9d1bac7..4630874 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -608,6 +608,51 @@ static void clocksource_adjust(s64 offset)
(NTP_SCALE_SHIFT - clock->shift);
}
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for an O(log)
+ * accumulation loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << clock->shift;
+
+ /* If the offset is smaller than a shifted interval, do nothing */
+ if (offset < clock->cycle_interval << shift)
+ return offset;
+
+ /* Accumulate one shifted interval */
+ offset -= clock->cycle_interval << shift;
+ clock->cycle_last += clock->cycle_interval << shift;
+
+ clock->xtime_nsec += clock->xtime_interval << shift;
+ while (clock->xtime_nsec >= nsecps) {
+ clock->xtime_nsec -= nsecps;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ /* Accumulate into raw time */
+ clock->raw_time.tv_nsec += clock->raw_interval << shift;
+ while (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+ clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+ clock->raw_time.tv_sec++;
+ }
+
+ /* Accumulate error between NTP and clock interval */
+ clock->error += tick_length << shift;
+ clock->error -= clock->xtime_interval <<
+ (NTP_SCALE_SHIFT - clock->shift + shift);
+
+ return offset;
+}
+
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -616,6 +661,8 @@ static void clocksource_adjust(s64 offset)
void update_wall_time(void)
{
cycle_t offset;
+ u64 nsecs;
+ int shift = 0, maxshift;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -628,30 +675,22 @@ void update_wall_time(void)
#endif
clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
- /* normally this loop will run just once, however in the
- * case of lost or late ticks, it will accumulate correctly.
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
*/
+ shift = ilog2(offset) - ilog2(clock->cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+ shift = min(shift, maxshift);
while (offset >= clock->cycle_interval) {
- /* accumulate one interval */
- offset -= clock->cycle_interval;
- clock->cycle_last += clock->cycle_interval;
-
- clock->xtime_nsec += clock->xtime_interval;
- if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
- clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
- xtime.tv_sec++;
- second_overflow();
- }
-
- clock->raw_time.tv_nsec += clock->raw_interval;
- if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
- clock->raw_time.tv_nsec -= NSEC_PER_SEC;
- clock->raw_time.tv_sec++;
- }
-
- /* accumulate error between NTP and clock interval */
- clock->error += tick_length;
- clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
+ offset = logarithmic_accumulation(offset, shift);
+ shift--;
}
/* correct the clock when NTP error is too big */
--
1.6.0.6
@John Stultz
I also backported this patch to 2.6.31.2-rt13; could you please look it
over and see if it looks okay to you?
@Thomas
Same as the previous patch, please consider queueing this for -rt14; we
can drop it in the future, since John Stultz has submitted it upstream.
Thanks
From 868ee3f7346a90ae3b529ac24996f059cc322a82 Mon Sep 17 00:00:00 2001
From: tip-bot for john stultz <[email protected]>
Date: Mon, 5 Oct 2009 11:54:53 +0000
Subject: [PATCH] time: Remove xtime_cache
Commit-ID: 7bc7d637452383d56ba4368d4336b0dde1bb476d
Gitweb: http://git.kernel.org/tip/7bc7d637452383d56ba4368d4336b0dde1bb476d
Author: john stultz <[email protected]>
AuthorDate: Fri, 2 Oct 2009 16:24:15 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 5 Oct 2009 13:52:02 +0200
time: Remove xtime_cache
With the prior logarithmic time accumulation patch, xtime will now
always be within one "tick" of the current time, instead of
possibly half a second off.
This removes the need for the xtime_cache value, which always
stored the time at the last interrupt, so this patch cleans that up
by removing the xtime_cache-related code.
This is a bit simpler, but still could use some wider testing.
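To show the pattern the read side uses after this change, here is a
rough userspace analogue of the current_kernel_time() reader in the
diff below. The seqcount is only a stand-in for the kernel's
xtime_lock, and memory ordering is glossed over for brevity:

#include <stdio.h>
#include <stdatomic.h>
#include <time.h>

static struct timespec xtime;   /* stand-in for the kernel's xtime */
static atomic_uint xtime_seq;   /* stand-in for xtime_lock's seqcount */

/* a writer bumps the count (odd), updates xtime, bumps it again (even) */

static struct timespec read_time(void)
{
        struct timespec now;
        unsigned int seq;

        do {
                /* wait for an even (quiescent) sequence value */
                do {
                        seq = atomic_load(&xtime_seq);
                } while (seq & 1);

                now = xtime;    /* copy the protected value directly */

                /* retry if a writer ran while we were copying */
        } while (atomic_load(&xtime_seq) != seq);

        return now;
}

int main(void)
{
        struct timespec now = read_time();

        printf("%ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);
        return 0;
}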
Signed-off-by: John Stultz <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Reviewed-by: John Kacur <[email protected]>
Cc: Clark Williams <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Andrew Morton <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: John Kacur <[email protected]>
---
kernel/time.c | 1 -
kernel/time/timekeeping.c | 20 ++------------------
2 files changed, 2 insertions(+), 19 deletions(-)
diff --git a/kernel/time.c b/kernel/time.c
index 35d1aaa..01944b5 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
write_atomic_seqlock_irq(&xtime_lock);
wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
- update_xtime_cache(0);
write_atomic_sequnlock_irq(&xtime_lock);
clock_was_set();
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4630874..4a0920d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -48,16 +48,8 @@ static unsigned long total_sleep_time; /* seconds */
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-static struct timespec xtime_cache __attribute__ ((aligned (16)));
-void update_xtime_cache(u64 nsec)
-{
- xtime_cache = xtime;
- timespec_add_ns(&xtime_cache, nsec);
-}
-
struct clocksource *clock;
-
#ifdef CONFIG_GENERIC_TIME
/**
* clocksource_forward_now - update clock to the current time
@@ -233,7 +225,6 @@ int do_settimeofday(struct timespec *tv)
xtime = *tv;
- update_xtime_cache(0);
clock->error = 0;
ntp_clear();
@@ -435,7 +426,6 @@ void __init timekeeping_init(void)
xtime.tv_nsec = 0;
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
- update_xtime_cache(0);
total_sleep_time = 0;
write_atomic_sequnlock_irqrestore(&xtime_lock, flags);
}
@@ -467,7 +457,6 @@ static int timekeeping_resume(struct sys_device *dev)
wall_to_monotonic.tv_sec -= sleep_length;
total_sleep_time += sleep_length;
}
- update_xtime_cache(0);
/* re-base the last cycle value */
clock->cycle_last = 0;
clock->cycle_last = clocksource_read(clock);
@@ -608,7 +597,6 @@ static void clocksource_adjust(s64 offset)
(NTP_SCALE_SHIFT - clock->shift);
}
-
/**
* logarithmic_accumulation - shifted accumulation of cycles
*
@@ -652,7 +640,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
return offset;
}
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -661,7 +648,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
void update_wall_time(void)
{
cycle_t offset;
- u64 nsecs;
int shift = 0, maxshift;
/* Make sure we're fully resumed: */
@@ -725,8 +711,6 @@ void update_wall_time(void)
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
- update_xtime_cache(cyc2ns(clock, offset));
-
/* check to see if there is a new clocksource to use */
change_clocksource();
update_vsyscall(&xtime, clock);
@@ -761,7 +745,7 @@ void monotonic_to_bootbased(struct timespec *ts)
unsigned long get_seconds(void)
{
- return xtime_cache.tv_sec;
+ return xtime.tv_sec;
}
EXPORT_SYMBOL(get_seconds);
@@ -774,7 +758,7 @@ struct timespec current_kernel_time(void)
do {
seq = read_atomic_seqbegin(&xtime_lock);
- now = xtime_cache;
+ now = xtime;
} while (read_atomic_seqretry(&xtime_lock, seq));
return now;
--
1.6.0.6