TSC is either synchronized by design or not reliable enough
to be used for anything, let alone timekeeping.
Signed-off-by: Jiri Bohac <[email protected]>
Index: linux-2.6.20-rc5/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux-2.6.20-rc5.orig/arch/x86_64/kernel/smpboot.c
+++ linux-2.6.20-rc5/arch/x86_64/kernel/smpboot.c
@@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info
print_cpu_info(c);
}
-/*
- * New Funky TSC sync algorithm borrowed from IA64.
- * Main advantage is that it doesn't reset the TSCs fully and
- * in general looks more robust and it works better than my earlier
- * attempts. I believe it was written by David Mosberger. Some minor
- * adjustments for x86-64 by me -AK
- *
- * Original comment reproduced below.
- *
- * Synchronize TSC of the current (slave) CPU with the TSC of the
- * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
- * eliminate the possibility of unaccounted-for errors (such as
- * getting a machine check in the middle of a calibration step). The
- * basic idea is for the slave to ask the master what itc value it has
- * and to read its own itc before and after the master responds. Each
- * iteration gives us three timestamps:
- *
- * slave master
- *
- * t0 ---\
- * ---\
- * --->
- * tm
- * /---
- * /---
- * t1 <---
- *
- *
- * The goal is to adjust the slave's TSC such that tm falls exactly
- * half-way between t0 and t1. If we achieve this, the clocks are
- * synchronized provided the interconnect between the slave and the
- * master is symmetric. Even if the interconnect were asymmetric, we
- * would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us
- * synchronize the TSC to within one or two cycles. However, we can
- * only *guarantee* that the synchronization is accurate to within a
- * round-trip time, which is typically in the range of several hundred
- * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
- * are usually almost perfectly synchronized, but we shouldn't assume
- * that the accuracy is much better than half a micro second or so.
- *
- * [there are other errors like the latency of RDTSC and of the
- * WRMSR. These can also account to hundreds of cycles. So it's
- * probably worse. It claims 153 cycles error on a dual Opteron,
- * but I suspect the numbers are actually somewhat worse -AK]
- */
-
-#define MASTER 0
-#define SLAVE (SMP_CACHE_BYTES/8)
-
-/* Intentionally don't use cpu_relax() while TSC synchronization
- because we don't want to go into funky power save modi or cause
- hypervisors to schedule us away. Going to sleep would likely affect
- latency and low latency is the primary objective here. -AK */
-#define no_cpu_relax() barrier()
-
-static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static volatile __cpuinitdata unsigned long go[SLAVE + 1];
-static int notscsync __cpuinitdata;
-
-#undef DEBUG_TSC_SYNC
-
-#define NUM_ROUNDS 64 /* magic value */
-#define NUM_ITERS 5 /* likewise */
-
-/* Callback on boot CPU */
-static __cpuinit void sync_master(void *arg)
-{
- unsigned long flags, i;
-
- go[MASTER] = 0;
-
- local_irq_save(flags);
- {
- for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
- while (!go[MASTER])
- no_cpu_relax();
- go[MASTER] = 0;
- rdtscll(go[SLAVE]);
- }
- }
- local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our tsc differs from the tsc
- * on the master (time-keeper) CPU. A positive number indicates our
- * tsc is ahead of the master, negative that it is behind.
- */
-static inline long
-get_delta(long *rt, long *master)
-{
- unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
- unsigned long tcenter, t0, t1, tm;
- int i;
-
- for (i = 0; i < NUM_ITERS; ++i) {
- rdtscll(t0);
- go[MASTER] = 1;
- while (!(tm = go[SLAVE]))
- no_cpu_relax();
- go[SLAVE] = 0;
- rdtscll(t1);
-
- if (t1 - t0 < best_t1 - best_t0)
- best_t0 = t0, best_t1 = t1, best_tm = tm;
- }
-
- *rt = best_t1 - best_t0;
- *master = best_tm - best_t0;
-
- /* average best_t0 and best_t1 without overflow: */
- tcenter = (best_t0/2 + best_t1/2);
- if (best_t0 % 2 + best_t1 % 2 == 2)
- ++tcenter;
- return tcenter - best_tm;
-}
-
-static __cpuinit void sync_tsc(unsigned int master)
-{
- int i, done = 0;
- long delta, adj, adjust_latency = 0;
- unsigned long flags, rt, master_time_stamp, bound;
-#ifdef DEBUG_TSC_SYNC
- static struct syncdebug {
- long rt; /* roundtrip time */
- long master; /* master's timestamp */
- long diff; /* difference between midpoint and master's timestamp */
- long lat; /* estimate of tsc adjustment latency */
- } t[NUM_ROUNDS] __cpuinitdata;
-#endif
-
- printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
- smp_processor_id(), master);
-
- go[MASTER] = 1;
-
- /* It is dangerous to broadcast IPI as cpus are coming up,
- * as they may not be ready to accept them. So since
- * we only need to send the ipi to the boot cpu direct
- * the message, and avoid the race.
- */
- smp_call_function_single(master, sync_master, NULL, 1, 0);
-
- while (go[MASTER]) /* wait for master to be ready */
- no_cpu_relax();
-
- spin_lock_irqsave(&tsc_sync_lock, flags);
- {
- for (i = 0; i < NUM_ROUNDS; ++i) {
- delta = get_delta(&rt, &master_time_stamp);
- if (delta == 0) {
- done = 1; /* let's lock on to this... */
- bound = rt;
- }
-
- if (!done) {
- unsigned long t;
- if (i > 0) {
- adjust_latency += -delta;
- adj = -delta + adjust_latency/4;
- } else
- adj = -delta;
-
- rdtscll(t);
- wrmsrl(MSR_IA32_TSC, t + adj);
- }
-#ifdef DEBUG_TSC_SYNC
- t[i].rt = rt;
- t[i].master = master_time_stamp;
- t[i].diff = delta;
- t[i].lat = adjust_latency/4;
-#endif
- }
- }
- spin_unlock_irqrestore(&tsc_sync_lock, flags);
-
-#ifdef DEBUG_TSC_SYNC
- for (i = 0; i < NUM_ROUNDS; ++i)
- printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
- t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
- printk(KERN_INFO
- "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
- "maxerr %lu cycles)\n",
- smp_processor_id(), master, delta, rt);
-}
-
-static void __cpuinit tsc_sync_wait(void)
-{
- /*
- * When the CPU has synchronized TSCs assume the BIOS
- * or the hardware already synced. Otherwise we could
- * mess up a possible perfect synchronization with a
- * not-quite-perfect algorithm.
- */
- if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
- return;
- sync_tsc(0);
-}
-
-static __init int notscsync_setup(char *s)
-{
- notscsync = 1;
- return 1;
-}
-__setup("notscsync", notscsync_setup);
-
static atomic_t init_deasserted __cpuinitdata;
/*
@@ -565,14 +354,6 @@ void __cpuinit start_secondary(void)
*/
set_cpu_sibling_map(smp_processor_id());
- /*
- * Wait for TSC sync to not schedule things before.
- * We still process interrupts, which could see an inconsistent
- * time in that window unfortunately.
- * Do this here because TSC sync has global unprotected state.
- */
- tsc_sync_wait();
-
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
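For readers who skip the diff: the measurement at the heart of the deleted
get_delta() is just midpoint arithmetic. A standalone restatement with
made-up cycle counts (not the kernel code itself):

#include <stdio.h>

int main(void)
{
        /* made-up cycle counts: t0/t1 are the slave's TSC before/after the
           master's reply, tm is the master's TSC */
        unsigned long t0 = 1000070, tm = 1000400, t1 = 1000170;

        /* average t0 and t1 without risking overflow of t0 + t1 */
        unsigned long tcenter = t0 / 2 + t1 / 2;
        if (t0 % 2 + t1 % 2 == 2)
                ++tcenter;

        /* positive: the slave's TSC is ahead of the master's; negative: behind */
        long delta = (long)tcenter - (long)tm;

        printf("roundtrip = %lu cycles, delta = %ld cycles\n", t1 - t0, delta);
        return 0;
}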
--
On Thursday 01 February 2007 10:59, [email protected] wrote:
> TSC is either synchronized by design or not reliable enough
> to be used for anything, let alone timekeeping.
In my tree this is already done better by a patch from Ingo.
Check if they look synchronized and don't use TSC if they are not.
-Andi
On Thu, Feb 01, 2007 at 12:14:23PM +0100, Andi Kleen wrote:
> On Thursday 01 February 2007 10:59, [email protected] wrote:
> > TSC is either synchronized by design or not reliable enough
> > to be used for anything, let alone timekeeping.
>
> In my tree this is already done better by a patch from Ingo.
> Check if they look synchronized and don't use TSC if they are not.
The whole purpose of this patchset is to make use of TSC even if
it's not synchronized.
Synchronizing it will not make anything better in any way -- the
implementation just does not care whether TSCs are synchronized.
That's why I think the synchronization code is not needed.
--
Jiri Bohac <[email protected]>
SUSE Labs, SUSE CZ
On Thu, Feb 01, 2007 at 02:17:15PM +0100, Jiri Bohac wrote:
> On Thu, Feb 01, 2007 at 12:14:23PM +0100, Andi Kleen wrote:
> > On Thursday 01 February 2007 10:59, [email protected] wrote:
> > > TSC is either synchronized by design or not reliable enough
> > > to be used for anything, let alone timekeeping.
> >
> > In my tree this is already done better by a patch from Ingo.
> > Check if they look synchronized and don't use TSC if they are not.
>
> The whole purpose of this patchset is to make use of TSC even if
> it's not synchronized.
>
> Synchronizing it will not make anything better in any way -- the
> implementation just does not care whether TSCs are synchronized.
> That's why I think the synchronization code is not needed.
It might even make sense to desynchronize the TSCs on such (AMD)
machines on purpose, so that applications that rely on TSC break
immediately and not after some time when the error becomes too large.
--
Vojtech Pavlik
Director SuSE Labs
Andi Kleen wrote:
> On Thursday 01 February 2007 10:59, [email protected] wrote:
>> TSC is either synchronized by design or not reliable enough
>> to be used for anything, let alone timekeeping.
>
> In my tree this is already done better by a patch from Ingo.
> Check if they look synchronized and don't use TSC if they are not.
Is it going to notice dynamically when one cpu ramps down/up
during runtime, and loses sync?
M.
On Thursday 01 February 2007 16:16, Vojtech Pavlik wrote:
> It might even make sense to desynchronize the TSCs on such (AMD)
> machines on purpose, so that applications that rely on TSC break
> immediately and not after some time when the error becomes too large.
They won't, because they're normally single-threaded (and most people
still use single-core systems anyway).
I've threatened to just disable RDTSC for ring 3 before, but it'll likely
never happen because too many programs use it.
-Andi
On Thursday 01 February 2007 14:17, Jiri Bohac wrote:
> On Thu, Feb 01, 2007 at 12:14:23PM +0100, Andi Kleen wrote:
> > On Thursday 01 February 2007 10:59, [email protected] wrote:
> > > TSC is either synchronized by design or not reliable enough
> > > to be used for anything, let alone timekeeping.
> >
> > In my tree this is already done better by a patch from Ingo.
> > Check if they look synchronized and don't use TSC if they are not.
>
> The whole purpose of this patchset is to make use of TSC even if
> it's not synchronized.
It's still useful as a double check for platforms (like Intel single node)
which are supposed to be synchronized.
> Synchronizing it will not make anything better in any way -- the
> implementation just does not care whether TSCs are synchronized.
> That's why I think the synchronization code is not needed.
It doesn't actively synchronize it, just checks if they look synchronized.
-Andi
[email protected] wrote:
> TSC is either synchronized by design or not reliable enough
> to be used for anything, let alone timekeeping.
This refers to eliminating the offset between multiple synchronized TSCs.
-hpa
On Fri, 2 Feb 2007, Andi Kleen wrote:
> I've threatened to just disable RDTSC for ring 3 before, but it'll likely
> never happen because too many programs use it.
Those programs are aware that they are fiddling around with low level
material but with this patchset we are going to have a non
monotonic time subsystem? Programs expect gettimeofday() etc. to be
monotonic. It's going to be a big surprise if that is not working anymore.
On Mon, 2007-02-12 at 16:34 -0800, Christoph Lameter wrote:
> On Fri, 2 Feb 2007, Andi Kleen wrote:
>
> > I've threatened to just disable RDTSC for ring 3 before, but it'll likely
> > never happen because too many programs use it.
>
> Those programs are aware that they are fiddling around with low level
> material but with this patchset we are going to have a non
> monotonic time subsystem?
No, quite the opposite. gettimeofday() currently is NOT monotonic,
unfortunately. With this patch series it actually has a better chance of
becoming that...
--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org
On Tuesday 13 February 2007 07:40, Arjan van de Ven wrote:
> On Mon, 2007-02-12 at 16:34 -0800, Christoph Lameter wrote:
> > On Fri, 2 Feb 2007, Andi Kleen wrote:
> >
> > > I've threatened to just disable RDTSC for ring 3 before, but it'll likely
> > > never happen because too many programs use it.
> >
> > Those programs are aware that they are fiddling around with low level
> > material but with this patchset we are going to have a non
> > monotonic time subsystem?
>
> No, quite the opposite. gettimeofday() currently is NOT monotonic,
> unfortunately.
Anytime it is non-monotonic, that's a bug. We've had bugs
like this before, but recently we're doing reasonably well. Of course
there can always be improvements, but in general I don't agree
with your statement, sorry. You can usually rely on it being monotonic,
short of the known limitations (e.g. don't run ntpd).
Usually these weren't really classical bugs, but more "hardware does
unexpected things under us". x86 hardware is a moving target, unfortunately.
> With this patchseries it actually has a better chance of
> becoming that...
The ntpd problem is fundamental; nothing will change that. However,
it doesn't seem to be a big one in practice.
-Andi
On Tue, 2007-02-13 at 09:28 +0100, Andi Kleen wrote:
> On Tuesday 13 February 2007 07:40, Arjan van de Ven wrote:
> > On Mon, 2007-02-12 at 16:34 -0800, Christoph Lameter wrote:
> > > On Fri, 2 Feb 2007, Andi Kleen wrote:
> > >
> > > > I've threatened to just disable RDTSC for ring 3 before, but it'll likely
> > > > never happen because too many programs use it.
> > >
> > > Those programs are aware that they are fiddling around with low level
> > > material but with this patchset we are going to have a non
> > > monotonic time subsystem?
> >
> > No, quite the opposite. gettimeofday() currently is NOT monotonic,
> > unfortunately.
>
> Anytime it is non-monotonic, that's a bug. We've had bugs
> like this before, but recently we're doing reasonably well. Of course
> there can always be improvements, but in general I don't agree
> with your statement, sorry. You can usually rely on it being monotonic,
> short of the known limitations (e.g. don't run ntpd).
Oh, I agree it should be monotonic, but I remember an argument I had with
you several weeks ago where you were basically saying the opposite ;)
I'm happy to see gtod become more monotonic/reliable any way we can.
--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org
On Tue, 13 Feb 2007, Arjan van de Ven wrote:
> No, quite the opposite. gettimeofday() currently is NOT monotonic,
> unfortunately. With this patch series it actually has a better chance of
> becoming that...
It is monotonic on IA64 at least and we have found that subtle application
bugs occur if it is not. IA64 (and other arches using time interpolation)
can ensure the monotonicity of time sources. Are you sure about this? I
wonder why the new time of day subsystem does not have that?
On Tuesday 13 February 2007 18:09, Christoph Lameter wrote:
> On Tue, 13 Feb 2007, Arjan van de Ven wrote:
>
> > No, quite the opposite. gettimeofday() currently is NOT monotonic,
> > unfortunately. With this patch series it actually has a better chance of
> > becoming that...
>
> It is monotonic on IA64 at least and we have found that subtle application
> bugs occur if it is not. IA64 (and other arches using time interpolation)
> can ensure the monotonicity of time sources. Are you sure about this? I
> wonder why the new time of day subsystem does not have that?
Just to avoid spreading misinformation: modulo some new broken hardware
(which we always try to work around when found) i386/x86-64 gettimeofday
is monotonic. AFAIK on the currently known hardware it should be generally
ok.
However ntpd can always screw you up, but that's inherent in the design.
Safer in general is to use clock_gettime(CLOCK_MONOTONIC, ...) which guarantees
no interference from ntpd
-Andi
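For application writers following along, the suggested call looks like this
(a minimal userspace sketch; CLOCK_MONOTONIC is never stepped by
settimeofday()/ntpdate, though NTP may still slew its rate; link with -lrt
on older glibc):

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec rt, mono;

        clock_gettime(CLOCK_REALTIME, &rt);    /* wall clock: can jump if someone steps the time */
        clock_gettime(CLOCK_MONOTONIC, &mono); /* monotonic: only slewed, never stepped backwards */

        printf("CLOCK_REALTIME:  %ld.%09ld\n", (long)rt.tv_sec, rt.tv_nsec);
        printf("CLOCK_MONOTONIC: %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
        return 0;
}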
On Tue, Feb 13, 2007 at 06:20:14PM +0100, Andi Kleen wrote:
> On Tuesday 13 February 2007 18:09, Christoph Lameter wrote:
> > On Tue, 13 Feb 2007, Arjan van de Ven wrote:
> >
> > > No, quite the opposite. gettimeofday() currently is NOT monotonic,
> > > unfortunately. With this patch series it actually has a better chance of
> > > becoming that...
> >
> > It is monotonic on IA64 at least and we have found that subtle application
> > bugs occur if it is not. IA64 (and other arches using time interpolation)
> > can ensure the monotonicity of time sources. Are you sure about this? I
> > wonder why the new time of day subsystem does not have that?
>
> Just to avoid spreading misinformation: modulo some new broken hardware
> (which we always try to work around when found) i386/x86-64 gettimeofday
> is monotonic. AFAIK on the currently known hardware it should be generally
> ok.
>
> However ntpd can always screw you up, but that's inherent in the design.
It's not inherent to ntpd's design, but the current (which may have been
fixed since I looked last) implementation of the NTP PLL in the kernel.
The interaction with ntpd can be fixed and I've done it in the past
once, although the fix wasn't all that nice.
> Safer in general is to use clock_gettime(CLOCK_MONOTONIC, ...) which
> guarantees no interference from ntpd
--
Vojtech Pavlik
Director SuSE Labs
Hi,
On Tue, Feb 13, 2007 at 11:18:48PM +0100, Vojtech Pavlik wrote:
> It's not inherent to ntpd's design, but the current (which may have been
> fixed since I looked last) implementation of the NTP PLL in the kernel.
>
> The interaction with ntpd can be fixed and I've done it in the past
> once, although the fix wasn't all that nice.
Yep, it can slowly move towards the correct time, but ntpdate (or more
generally settimeofday) remains a fundamental issue (and I prefer time
skews to be fixed ASAP, not slowly).
If the admin is good, he knows that if he ever runs the db while the
clock isn't perfectly synchronized with the atomic clock, he risks
screwing up his whole dataset, as the apps won't even handle time going
backwards after a reboot.
I think there should be a limit to how much an app can expect from
gtod before generating failures. Certainly it's always better to write
apps that are robust against a non-monotonic gtod, because eventually
it _can_ happen (either that or remove the stod syscall ;).
As for ntpdate at boot and ntpd at runtime, not running them isn't
really an option on a server IMHO; think of the liability if system
time runs out of sync by a minute and you need to know exactly when
something bad has happened.
On Tue, 13 Feb 2007, Vojtech Pavlik wrote:
> The interaction with ntpd can be fixed and I've done it in the past
> once, although the fix wasn't all that nice.
It can be and was fixed by gradually moving time instead of jumping to
the new time. E.g., the time interpolator on ia64 gradually adapts the
intervals so that synchronization is obtained.
Andi Kleen writes:
> Just to avoid spreading misinformation: modulo some new broken hardware
> (which we always try to work around when found) i386/x86-64 gettimeofday
> is monotonic. AFAIK on the currently known hardware it should be generally
> ok.
>
> However ntpd can always screw you up, but that's inherent in the design.
On powerpc we manage to keep gettimeofday monotonic even when ntpd is
adjusting the clock. We have 3 parameters used to convert a value
from the timebase register to the time of day, and these parameters
are adjusted if necessary at the beginning of each tick, based on the
value returned by current_tick_length(). The point is that
current_tick_length() tells you at the *beginning* of each tick how
much time will be added on to xtime at the *end* of that tick, and
that makes it possible to aim the interpolation to hit the same value
as xtime at the end of each tick.
Clearly if you make a discrete jump backwards with settimeofday or
adjtime, it's impossible to keep gettimeofday monotonic, but apart
from that it's monotonic on powerpc.
At least, that's the way it's supposed to work. I hope the recent
timekeeping changes haven't broken it. :)
Paul.
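A rough sketch of the aiming scheme Paul describes; the names here
(struct tb_params, retarget_interpolation, tb_per_tick) are illustrative
placeholders, not the actual powerpc code, and it assumes a fixed number
of timebase counts per tick:

struct tb_params {
        unsigned long long tb_last; /* timebase value at the start of this tick */
        unsigned long long ns_base; /* interpolated time (ns) at that point */
        unsigned long long scale;   /* ns per timebase count, scaled by 2^32 */
};

/* Called once at the beginning of each tick. */
static void retarget_interpolation(struct tb_params *p,
                                   unsigned long long tb_now,
                                   unsigned long long tick_length_ns, /* current_tick_length() */
                                   unsigned long long tb_per_tick)
{
        /* Fold the time elapsed so far into the base, so the readout
           stays continuous across the adjustment. */
        p->ns_base += ((tb_now - p->tb_last) * p->scale) >> 32;
        p->tb_last = tb_now;

        /* Pick the scale so that, tb_per_tick counts from now, the
           interpolation lands exactly on ns_base + tick_length_ns,
           i.e. on the value xtime will have at the *end* of this tick. */
        p->scale = (tick_length_ns << 32) / tb_per_tick;
}

/* gettimeofday-style readout between ticks. */
static unsigned long long interpolate_ns(const struct tb_params *p,
                                         unsigned long long tb_now)
{
        return p->ns_base + (((tb_now - p->tb_last) * p->scale) >> 32);
}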
On Wed, 2007-02-14 at 11:18 +1100, Paul Mackerras wrote:
> Andi Kleen writes:
>
> > Just to avoid spreading misinformation: modulo some new broken hardware
> > (which we always try to work around when found) i386/x86-64 gettimeofday
> > is monotonic. AFAIK on the currently known hardware it should be generally
> > ok.
> >
> > However ntpd can always screw you up, but that's inherent in the design.
>
> On powerpc we manage to keep gettimeofday monotonic even when ntpd is
> adjusting the clock. We have 3 parameters used to convert a value
> from the timebase register to the time of day, and these parameters
> are adjusted if necessary at the beginning of each tick, based on the
> value returned by current_tick_length(). The point is that
> current_tick_length() tells you at the *beginning* of each tick how
> much time will be added on to xtime at the *end* of that tick, and
> that makes it possible to aim the interpolation to hit the same value
> as xtime at the end of each tick.
>
> Clearly if you make a discrete jump backwards with settimeofday or
> adjtime, it's impossible to keep gettimeofday monotonic, but apart
> from that it's monotonic on powerpc.
>
> At least, that's the way it's supposed to work. I hope the recent
> timekeeping changes haven't broken it. :)
No. Just to even further clarify (and since everyone is speaking up),
the generic timekeeping does a similar scaling adjustment of the
clocksource frequency for NTP adjustments made via sys_adjtimex().
I believe Andi was just referring to ntpd calling settimeofday(), which
will cause clock_gettime(CLOCK_REALTIME,...)/gettimeofday() to possibly
jump backwards. This behavior of NTP is of course configurable (see the
-x option, or the "tinker step 0" option combined w/ "disable kernel" in
ntp.conf).
-john
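For reference, the kernel-side frequency discipline John mentions can be
inspected from userspace via adjtimex(2); a minimal read-only query, as a
sketch:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { .modes = 0 };   /* modes = 0: query only, change nothing */
        int state = adjtimex(&tx);

        if (state == -1) {
                perror("adjtimex");
                return 1;
        }
        /* tx.freq is the frequency correction in ppm with a 16-bit fraction */
        printf("clock state %d, freq %.3f ppm, status 0x%x\n",
               state, tx.freq / 65536.0, tx.status);
        return 0;
}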
On Tue, Feb 13, 2007 at 11:38:33PM +0100, Andrea Arcangeli wrote:
> Hi,
>
> On Tue, Feb 13, 2007 at 11:18:48PM +0100, Vojtech Pavlik wrote:
> > It's not inherent to ntpd's design, but the current (which may have been
> > fixed since I looked last) implementation of the NTP PLL in the kernel.
> >
> > The interaction with ntpd can be fixed and I've done it in the past
> > once, although the fix wasn't all that nice.
>
> Yep, it can slowly move towards the correct time, but ntpdate (or more
> generally settimeofday) remains a fundamental issue (and I prefer time
> skews to be fixed ASAP, not slowly).
Skipping forward is trivial. For going backward, you can stop time (or
make it go forward very slowly). Still, the output will be strictly
monotonic (but not more than that).
For small changes, you simply change your estimate of the base clock
frequency to be different from what the specs say. Tuning that in a PLL
will get you in sync with true atomic GMT.
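A purely illustrative userspace restatement of that idea: if the underlying
wall clock ever steps backwards, hold the reported value and creep it forward
until the wall clock catches up again. Not kernel code, not thread-safe; link
with -lrt on older glibc.

#include <time.h>

static long long last_ns; /* last value handed out */

static long long wall_ns(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

long long monotonicized_ns(void)
{
        long long now = wall_ns();

        if (now > last_ns)
                last_ns = now; /* normal case: follow the wall clock */
        else
                last_ns += 1;  /* clock went backwards: creep forward instead */

        return last_ns;
}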
--
Vojtech Pavlik
Director SuSE Labs