2003-01-16 16:36:25

by Jim Houston

Subject: [PATCH] improved boot time TSC synchronization


Hi Everyone,

The current TSC synchronization may have an error on the order of
1 microsecond on a quad-processor system. Not a big deal, but annoying
if you are trying to figure out what order things happen based on
TSC time stamps.

Since the IA64 folk already solved this problem, I did a quick hack
based on the itc sync code. The current code tries to synchronize all
of the processors with a single event. The master processor clears
its TSC and changes a flag. The slave processors watch for the flag
to change and clear their TSCs. On the bus this ends up being a
write which invalidates the other processors' caches and a set of
read operations as all of the processors reload the cache line.
It is this scramble to re-read the cache line that introduces the error.
Following the IA64 model, the new code synchronizes the TSCs of pairs
of processors. It uses a feedback loop to cancel out the errors.
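
The heart of one measurement round, condensed from the get_delta()
loop in the patch below (the best-of-NUM_ITERS filtering is elided),
looks like this:

	unsigned long t0, t1, tm, tcenter, last_tm;
	long delta;

	rdtscl(t0);			/* slave reads its TSC before asking */
	go[MASTER] = round;		/* ask the master for its TSC */
	while ((tm = go[SLAVE]) == last_tm)
		rep_nop();		/* master stores its TSC in go[SLAVE] */
	rdtscl(t1);			/* slave reads its TSC after the reply */

	/* midpoint of t0..t1, computed without 32-bit overflow */
	tcenter = t0/2 + t1/2 + (t0 % 2 + t1 % 2)/2;
	delta = tcenter - tm;		/* > 0 means the slave is ahead */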

It should produce perfect synchronization for systems where the memory
access times are symmetrical. I'm still looking for a way to validate
the synchronization which doesn't involve synchronization through memory.

Of course, there are systems with wandering TSCs. This patch doesn't do
anything to keep the TSCs synchronized after the boot-time sync.

The patch was tested against 2.5.58.

Jim Houston - Concurrent Computer Corp.


--- linux-2.5.58.orig/arch/i386/kernel/smpboot.c Thu Jan 16 10:52:09 2003
+++ linux-2.5.58/arch/i386/kernel/smpboot.c Thu Jan 16 10:55:52 2003
@@ -169,167 +169,231 @@
}

/*
- * TSC synchronization.
- *
- * We first check wether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
+ * TSC synchronization based on ia64 itc synchronization code. Synchronize
+ * pairs of processors rather than trying to synchronize all of the processors
+ * with a single event. When several processors are all waiting for an
+ * event they don't all see it at the same time. The write will cause
+ * an invalidate on each processor's cache and then they all scramble to
+ * re-read that cache line.
+ *
+ * Writing the TSC resets the upper 32-bits, so we need to be careful
+ * that all of the cpus can be synchronized before we overflow the
+ * 32-bit count.
*/

-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
+#define MASTER 0
+#define SLAVE (SMP_CACHE_BYTES/sizeof(long))

-#define NR_LOOPS 5
+#define NUM_ROUNDS 64 /* magic value */
+#define NUM_ITERS 5 /* likewise */

-/*
- * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit
- * multiplication. Not terribly optimized but we need it at boot time only
- * anyway.
- *
- * result == a / b
- * == (a1 + a2*(2^32)) / b
- * == a1/b + a2*(2^32/b)
- * == a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b
- * ^---- (this multiplication can overflow)
- */
+static volatile unsigned long go[2*SLAVE] __cacheline_aligned;
+static volatile int current_slave = -1;
+static volatile int tsc_sync_complete = 0;
+static volatile int tsc_adj_latency = 0;
+static unsigned int max_rt = 0;
+static unsigned int max_delta = 0;
+
+#define DEBUG_TSC_SYNC 0
+#if DEBUG_TSC_SYNC
+struct tsc_sync_debug {
+ long rt; /* roundtrip time */
+ long master; /* master's timestamp */
+ long diff; /* difference between midpoint and master's timestamp */
+ long lat; /* estimate of tsc adjustment latency */
+} tsc_sync_debug[NUM_ROUNDS*NR_CPUS];
+#endif

-static unsigned long long __init div64 (unsigned long long a, unsigned long b0)
+void
+sync_master(void)
{
- unsigned int a1, a2;
- unsigned long long res;
-
- a1 = ((unsigned int*)&a)[0];
- a2 = ((unsigned int*)&a)[1];
+ unsigned long n, tsc, last_go_master;

- res = a1/b0 +
- (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) +
- a2 / b0 +
- (a2 * (0xffffffff % b0)) / b0;
-
- return res;
+ last_go_master = 0;
+ while (1) {
+ while ((n = go[MASTER]) == last_go_master)
+ rep_nop();
+ if (n == ~0)
+ break;
+ rdtscl(tsc);
+ if (unlikely(!tsc))
+ tsc = 1;
+ go[SLAVE] = tsc;
+ last_go_master = n;
+ }
}

-static void __init synchronize_tsc_bp (void)
+/*
+ * Return the number of cycles by which our TSC differs from the TSC on
+ * the master (time-keeper) CPU. A positive number indicates our TSC is
+ * ahead of the master, negative that it is behind.
+ */
+static inline long
+get_delta (long *rt, long *master)
{
- int i;
- unsigned long long t0;
- unsigned long long sum, avg;
- long long delta;
- unsigned long one_usec;
- int buggy = 0;
+ unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+ unsigned long tcenter, t0, t1, tm, last_go_slave;
+ long i;
+
+ last_go_slave = go[SLAVE];
+ for (i = 0; i < NUM_ITERS; ++i) {
+ rdtscl(t0);
+ go[MASTER] = i+1;
+ while ((tm = go[SLAVE]) == last_go_slave)
+ rep_nop();
+ rdtscl(t1);
+
+ if (t1 - t0 < best_t1 - best_t0)
+ best_t0 = t0, best_t1 = t1, best_tm = tm;
+ last_go_slave = tm;
+ }

- printk("checking TSC synchronization across %u CPUs: ", num_booting_cpus());
+ *rt = best_t1 - best_t0;
+ *master = best_tm - best_t0;

- one_usec = cpu_khz/1000;
+ /* average best_t0 and best_t1 without overflow: */
+ tcenter = (best_t0/2 + best_t1/2);
+ if (best_t0 % 2 + best_t1 % 2 == 2)
+ ++tcenter;
+ return tcenter - best_tm;
+}

- atomic_set(&tsc_start_flag, 1);
- wmb();
+/*
+ * Synchronize TSC of the current (slave) CPU with the TSC of the MASTER CPU
+ * (normally the time-keeper CPU). We use a closed loop to eliminate the
+ * possibility of unaccounted-for errors (such as getting a machine check in
+ * the middle of a calibration step). The basic idea is for the slave to ask
+ * the master what TSC value it has and to read its own TSC before and after
+ * the master responds. Each iteration gives us three
+ * timestamps:
+ *
+ *	slave		master
+ *
+ *	t0 ---\
+ *	       ---\
+ *	           --->
+ *	                tm
+ *	           /---
+ *	       /---
+ *	t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly half-way
+ * between t0 and t1. If we achieve this, the clocks are synchronized provided
+ * the interconnect between the slave and the master is symmetric. Even if the
+ * interconnect were asymmetric, we would still know that the synchronization
+ * error is smaller than the roundtrip latency (t1 - t0).
+ *
+ * When the interconnect is quiet and symmetric, this lets us synchronize the
+ * TSC to within one or two cycles. However, we can only *guarantee* that the
+ * synchronization is accurate to within a round-trip time, which is typically
+ * in the range of several hundred cycles (e.g., ~500 cycles). In practice,
+ * this means that the TSCs are usually almost perfectly synchronized, but we
+ * shouldn't assume that the accuracy is much better than half a microsecond
+ * or so.
+ */

+static void __init
+synchronize_tsc_ap (void)
+{
+ long i, delta, adj, adjust_latency, n_rounds;
+ unsigned long rt, master_time_stamp, tsc;
+#if DEBUG_TSC_SYNC
+ struct tsc_sync_debug *t =
+ &tsc_sync_debug[smp_processor_id() * NUM_ROUNDS];
+#endif
+
/*
- * We loop a few times to get a primed instruction cache,
- * then the last pass is more or less synchronized and
- * the BP and APs set their cycle counters to zero all at
- * once. This reduces the chance of having random offsets
- * between the processors, and guarantees that the maximum
- * delay between the cycle counters is never bigger than
- * the latency of information-passing (cachelines) between
- * two CPUs.
+ * Wait for our turn to synchronize with the boot processor.
*/
- for (i = 0; i < NR_LOOPS; i++) {
- /*
- * all APs synchronize but they loop on '== num_cpus'
- */
- while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_stop, 0);
- wmb();
- /*
- * this lets the APs save their current TSC:
- */
- atomic_inc(&tsc_count_start);
-
- rdtscll(tsc_values[smp_processor_id()]);
- /*
- * We clear the TSC in the last loop:
- */
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
-
- /*
- * Wait for all APs to leave the synchronization point:
- */
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_start, 0);
- wmb();
- atomic_inc(&tsc_count_stop);
+ while (current_slave != smp_processor_id())
+ rep_nop();
+ adjust_latency = tsc_adj_latency;
+
+ go[SLAVE] = 0;
+ go[MASTER] = 0;
+ write_tsc(0,0);
+ for (i = 0; i < NUM_ROUNDS; ++i) {
+ delta = get_delta(&rt, &master_time_stamp);
+ if (delta == 0)
+ break;
+
+ if (i > 0)
+ adjust_latency += -delta;
+ adj = -delta + adjust_latency/8;
+ rdtscl(tsc);
+ write_tsc(tsc + adj, 0);
+#if DEBUG_TSC_SYNC
+ t[i].rt = rt;
+ t[i].master = master_time_stamp;
+ t[i].diff = delta;
+ t[i].lat = adjust_latency/8;
+#endif
}
+ n_rounds = i;
+ go[MASTER] = ~0;

- sum = 0;
- for (i = 0; i < NR_CPUS; i++) {
- if (test_bit(i, &cpu_callout_map)) {
- t0 = tsc_values[i];
- sum += t0;
- }
- }
- avg = div64(sum, num_booting_cpus());
-
- sum = 0;
- for (i = 0; i < NR_CPUS; i++) {
- if (!test_bit(i, &cpu_callout_map))
- continue;
- delta = tsc_values[i] - avg;
- if (delta < 0)
- delta = -delta;
- /*
- * We report bigger than 2 microseconds clock differences.
- */
- if (delta > 2*one_usec) {
- long realdelta;
- if (!buggy) {
- buggy = 1;
- printk("\n");
- }
- realdelta = div64(delta, one_usec);
- if (tsc_values[i] < avg)
- realdelta = -realdelta;
-
- printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
- }
-
- sum += delta;
- }
- if (!buggy)
- printk("passed.\n");
- ;
+#if (DEBUG_TSC_SYNC == 2)
+ for (i = 0; i < n_rounds; ++i)
+ printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+ t[i].rt, t[i].master, t[i].diff, t[i].lat);
+
+ printk("CPU %d: synchronized TSC (last diff %ld cycles, maxerr %lu cycles)\n",
+ smp_processor_id(), delta, rt);
+
+ printk("It took %d rounds\n", n_rounds);
+#endif
+ if (rt > max_rt)
+ max_rt = rt;
+ if (delta < 0)
+ delta = -delta;
+ if (delta > max_delta)
+ max_delta = delta;
+ tsc_adj_latency = adjust_latency;
+ current_slave = -1;
+ while (!tsc_sync_complete)
+ rep_nop();
}

-static void __init synchronize_tsc_ap (void)
-{
- int i;
+/*
+ * The boot processor sets its own TSC to zero and then gives each
+ * slave processor the chance to synchronize itself.
+ */

- /*
- * Not every cpu is online at the time
- * this gets called, so we first wait for the BP to
- * finish SMP initialization:
- */
- while (!atomic_read(&tsc_start_flag)) mb();
+static void __init synchronize_tsc_bp (void)
+{
+ unsigned int tsc_low, tsc_high, error;
+ int cpu;

- for (i = 0; i < NR_LOOPS; i++) {
- atomic_inc(&tsc_count_start);
- while (atomic_read(&tsc_count_start) != num_booting_cpus())
- mb();
-
- rdtscll(tsc_values[smp_processor_id()]);
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
+ printk("start TSC synchronization\n");
+ write_tsc(0, 0);

- atomic_inc(&tsc_count_stop);
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (!(cpu_callin_map & (1<<cpu)))
+ continue;
+ if (cpu == smp_processor_id())
+ continue;
+ go[MASTER] = 0;
+ current_slave = cpu;
+ sync_master();
+ while (current_slave != -1)
+ rep_nop();
+ }
+ rdtsc(tsc_low, tsc_high);
+ if (tsc_high)
+ printk("TSC overflowed during synchronization\n");
+ else
+ printk("TSC synchronization complete max_delta=%d cycles\n",
+ max_delta);
+ if (max_rt < 4293) {
+ error = (max_rt * 1000000)/cpu_khz;
+ printk("TSC sync round-trip time %d.%03d microseconds\n",
+ error/1000, error%1000);
+ } else {
+ printk("TSC sync round-trip time %d cycles\n", max_rt);
}
+ tsc_sync_complete = 1;
}
-#undef NR_LOOPS

extern void calibrate_delay(void);


2003-01-16 17:45:43

by David Mosberger

Subject: Re: [PATCH] improved boot time TSC synchronization

>>>>> On Thu, 16 Jan 2003 11:44:55 -0500, Jim Houston <[email protected]> said:

Jim> The current TSC synchronization may have an error on the order
Jim> of 1 microsecond on a quad processor system. Not a big deal
Jim> but annoying if you are trying to figure out what order things
Jim> happen based on TSC time stamps.

Jim> Since the IA64 folk already solved this problem, I did a quick
Jim> hack based on the itc sync code.

Cool. I'm glad to see the code is proving useful on other architectures.
I was hoping that would be the case.

BTW: The algorithm is documented in Section 8.5.2 of my book (see
http://www.lia64.org/book/). If there is enough interest, I'd be
willing to try to talk the publisher into releasing that section as a
PDF (or perhaps HTML).

--david

2003-01-16 21:25:00

by Jamie Lokier

Subject: Re: [PATCH] improved boot time TSC synchronization

Jim Houston wrote:
> + *	t0 ---\
> + *	       ---\
> + *	           --->
> + *	                tm
> + *	           /---
> + *	       /---
> + *	t1 <---
> + *
> + *
> + * The goal is to adjust the slave's TSC such that tm falls exactly half-way
> + * between t0 and t1.

It looks like not only can you synchronise with a certain accuracy,
you can determine an upper bound on that accuracy (assuming the
underlying CPU clocks are locked).

Maybe that figure could be put into /proc/cpuinfo?

As well as being an interesting value, it may be useful for programs
to know the effective accuracy of `rdtsc'.
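
Something like the following would do; this is only a sketch, and it
assumes max_rt (which the patch keeps static in smpboot.c) were
exported for the /proc code to read:

	/* hypothetical addition to show_cpuinfo() in
	 * arch/i386/kernel/cpu/proc.c */
	extern unsigned int max_rt;	/* worst-case sync round trip */

	seq_printf(m, "tsc sync rt\t: %u cycles\n", max_rt);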

-- Jamie

2003-01-17 00:26:07

by Jim Houston

Subject: Re: [PATCH] improved boot time TSC synchronization

Jamie Lokier wrote:
>
> It looks like not only can you synchronise with a certain accuracy,
> you can determine an upper bound on that accuracy (assuming the
> underlying CPU clocks are locked).
>
> Maybe that figure could be put into /proc/cpuinfo?
>
> As well as being an interesting value, it may be useful for programs
> to know the effective accuracy of `rdtsc'.
>
> -- Jamie

Hi Jamie,

Yeah, I'd be glad to add the round-trip time to cpuinfo. I see
this as a bogomips-like metric. It tells you how quickly you
can move cache lines from chip to chip.

The patch currently prints the round-trip time and the max_delta.
On a quad P4 Xeon, I got round-trip times in the 0.7 microsecond
range, which is disappointing. The max_delta was almost always
zero cycles, meaning that the feedback loop thinks the TSC values
are perfectly synchronized.

Jim Houston - Concurrent Computer Corp.

2003-01-17 00:52:00

by Jamie Lokier

Subject: Re: [PATCH] improved boot time TSC synchronization

Jim Houston wrote:
> The patch currently prints the round-trip time and the max_delta.
> On a Quad P4 Xeon, I got round-trip times in the 0.7 microsecond
> range which is disappointing. The max_delta was almost always
> zero cycles meaning that the feedback loop thinks that the TSC values
> are perfectly synchronized.

Is it reasonable to repeat the test over a duration of 10^6 cycles (or
more) such that you could detect any drift after synchronisation, as
well as variation _during_ that time interval?

I'm thinking of those spread spectrum clocks, which I gather are done
by frequency modulating the clock. It may be possible to detect:

(a) whether multiple CPUs with spread spectrum clocks are
actually locked to each other, or if the modulation
of each is independent

(b) whether multiple CPUs are drifting w.r.t. each other
because of independent clock sources

Although drift tends to be small, it should be possible to determine
"these clocks drifted by <1ppm during the test interval", which is a
pretty good indication of whether it is safe to use the TSC for
gettimeofday() or not.
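
In code the check might look something like this (purely a sketch;
get_delta() is the measurement routine from your patch, and the
interval length is arbitrary):

	long d0, d1, rt, master, drift_ppm;
	unsigned long start, now;

	d0 = get_delta(&rt, &master);	/* offset at the start */
	rdtscl(start);
	do
		rdtscl(now);
	while (now - start < 10000000);	/* spin for ~10^7 cycles */
	d1 = get_delta(&rt, &master);	/* offset at the end */

	/* change in offset, in parts per million of the interval */
	drift_ppm = ((d1 - d0) * 1000000L) / (long) (now - start);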

-- Jamie

2003-01-17 03:26:25

by Jim Houston

Subject: Re: [PATCH] improved boot time TSC synchronization

Jamie Lokier wrote:
>
> Is it reasonable to repeat the test over a duration of 10^6 cycles (or
> more) such that you could detect any drift after synchronisation, as
> well as variation _during_ that time interval?
>
> I'm thinking of those spread spectrum clocks, which I gather are done
> by frequency modulating the clock. It may be possible to detect:
>
> (a) whether multiple CPUs with spread spectrum clocks are
> actually locked to each other, or if the modulation
> of each is independent
>
> (b) whether multiple CPUs are drifting w.r.t. each other
> because of independent clock sources
>
> Although drift tends to be small, it should be possible to determine
> "these clocks drifted by <1ppm during the test interval", which is a
> pretty good indication of whether it is safe to use the TSC for
> gettimeofday() or not.
>
> -- Jamie

Hi Jamie,

These are problems I haven't run into yet. All of the systems
I have stay nicely locked once they are in sync. It might be fun
to try this experiment if someone who has a system with this
problem volunteered to test.

For systems where the CPU frequency may vary, I like the idea of
still using the TSC but running a software phase-locked loop to
synchronize it to another timer. I believe that Linus suggested
this as well. At least he suggested an NTP-like approach.
The code necessary to do this could detect properly synchronized
TSCs and avoid most of the work.
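
Roughly, the loop I have in mind would look like this. Nothing like
it is in the patch and all of the names here are made up; it is just
the shape of an NTP-style feedback:

	/* hypothetical: run periodically on each cpu, comparing
	 * TSC-derived time against a stable reference timer
	 * (PIT or ACPI PM timer) */
	error = tsc_nsecs() - reference_nsecs();	/* phase error */
	freq_adj -= error / FREQ_DAMPING;	/* slow frequency correction */
	tsc_offset -= error / PHASE_DAMPING;	/* fast phase correction */
	tsc_scale = base_scale + freq_adj;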

Jim Houston - Concurrent Computer Corp.

2003-01-21 01:12:19

by Jim Houston

Subject: Re: [PATCH] improved boot time TSC synchronization


Stephen Hemminger wrote:
> I want to incorporate and try out this patch, but it doesn't apply
> cleanly against 2.5.58. Could you send a new one to me against 2.5.59

Hi Stephen,

I'm sorry about the last patch. This one should apply cleanly
to linux-2.5.59.

Jim Houston - Concurrent Computer Corp.

--- linux-2.5.59.orig/arch/i386/kernel/smpboot.c Mon Jan 20 20:02:02 2003
+++ linux-2.5.59/arch/i386/kernel/smpboot.c Mon Jan 20 20:02:38 2003
@@ -169,169 +169,231 @@
}

/*
- * TSC synchronization.
- *
- * We first check wether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
+ * TSC synchronization based on ia64 itc synchronization code. Synchronize
+ * pairs of processors rather than trying to synchronize all of the processors
+ * with a single event. When several processors are all waiting for an
+ * event they don't all see it at the same time. The write will cause
+ * an invalidate on each processor's cache and then they all scramble to
+ * re-read that cache line.
+ *
+ * Writing the TSC resets the upper 32-bits, so we need to be careful
+ * that all of the cpus can be synchronized before we overflow the
+ * 32-bit count.
*/

-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
-
-#define NR_LOOPS 5
+#define MASTER 0
+#define SLAVE (SMP_CACHE_BYTES/sizeof(long))

-extern unsigned long fast_gettimeoffset_quotient;
+#define NUM_ROUNDS 64 /* magic value */
+#define NUM_ITERS 5 /* likewise */

-/*
- * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit
- * multiplication. Not terribly optimized but we need it at boot time only
- * anyway.
- *
- * result == a / b
- * == (a1 + a2*(2^32)) / b
- * == a1/b + a2*(2^32/b)
- * == a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b
- * ^---- (this multiplication can overflow)
- */
+static volatile unsigned long go[2*SLAVE] __cacheline_aligned;
+static volatile int current_slave = -1;
+static volatile int tsc_sync_complete = 0;
+static volatile int tsc_adj_latency = 0;
+static unsigned int max_rt = 0;
+static unsigned int max_delta = 0;
+
+#define DEBUG_TSC_SYNC 0
+#if DEBUG_TSC_SYNC
+struct tsc_sync_debug {
+ long rt; /* roundtrip time */
+ long master; /* master's timestamp */
+ long diff; /* difference between midpoint and master's timestamp */
+ long lat; /* estimate of tsc adjustment latency */
+} tsc_sync_debug[NUM_ROUNDS*NR_CPUS];
+#endif

-static unsigned long long __init div64 (unsigned long long a, unsigned long b0)
+void
+sync_master(void)
{
- unsigned int a1, a2;
- unsigned long long res;
+ unsigned long n, tsc, last_go_master;

- a1 = ((unsigned int*)&a)[0];
- a2 = ((unsigned int*)&a)[1];
-
- res = a1/b0 +
- (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) +
- a2 / b0 +
- (a2 * (0xffffffff % b0)) / b0;
-
- return res;
+ last_go_master = 0;
+ while (1) {
+ while ((n = go[MASTER]) == last_go_master)
+ rep_nop();
+ if (n == ~0)
+ break;
+ rdtscl(tsc);
+ if (unlikely(!tsc))
+ tsc = 1;
+ go[SLAVE] = tsc;
+ last_go_master = n;
+ }
}

-static void __init synchronize_tsc_bp (void)
+/*
+ * Return the number of cycles by which our TSC differs from the TSC on
+ * the master (time-keeper) CPU. A positive number indicates our TSC is
+ * ahead of the master, negative that it is behind.
+ */
+static inline long
+get_delta (long *rt, long *master)
{
- int i;
- unsigned long long t0;
- unsigned long long sum, avg;
- long long delta;
- unsigned long one_usec;
- int buggy = 0;
+ unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+ unsigned long tcenter, t0, t1, tm, last_go_slave;
+ long i;
+
+ last_go_slave = go[SLAVE];
+ for (i = 0; i < NUM_ITERS; ++i) {
+ rdtscl(t0);
+ go[MASTER] = i+1;
+ while ((tm = go[SLAVE]) == last_go_slave)
+ rep_nop();
+ rdtscl(t1);
+
+ if (t1 - t0 < best_t1 - best_t0)
+ best_t0 = t0, best_t1 = t1, best_tm = tm;
+ last_go_slave = tm;
+ }

- printk("checking TSC synchronization across %u CPUs: ", num_booting_cpus());
+ *rt = best_t1 - best_t0;
+ *master = best_tm - best_t0;

- one_usec = ((1<<30)/fast_gettimeoffset_quotient)*(1<<2);
+ /* average best_t0 and best_t1 without overflow: */
+ tcenter = (best_t0/2 + best_t1/2);
+ if (best_t0 % 2 + best_t1 % 2 == 2)
+ ++tcenter;
+ return tcenter - best_tm;
+}

- atomic_set(&tsc_start_flag, 1);
- wmb();
+/*
+ * Synchronize TSC of the current (slave) CPU with the TSC of the MASTER CPU
+ * (normally the time-keeper CPU). We use a closed loop to eliminate the
+ * possibility of unaccounted-for errors (such as getting a machine check in
+ * the middle of a calibration step). The basic idea is for the slave to ask
+ * the master what TSC value it has and to read its own TSC before and after
+ * the master responds. Each iteration gives us three
+ * timestamps:
+ *
+ *	slave		master
+ *
+ *	t0 ---\
+ *	       ---\
+ *	           --->
+ *	                tm
+ *	           /---
+ *	       /---
+ *	t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly half-way
+ * between t0 and t1. If we achieve this, the clocks are synchronized provided
+ * the interconnect between the slave and the master is symmetric. Even if the
+ * interconnect were asymmetric, we would still know that the synchronization
+ * error is smaller than the roundtrip latency (t1 - t0).
+ *
+ * When the interconnect is quiet and symmetric, this lets us synchronize the
+ * TSC to within one or two cycles. However, we can only *guarantee* that the
+ * synchronization is accurate to within a round-trip time, which is typically
+ * in the range of several hundred cycles (e.g., ~500 cycles). In practice,
+ * this means that the TSCs are usually almost perfectly synchronized, but we
+ * shouldn't assume that the accuracy is much better than half a microsecond
+ * or so.
+ */

+static void __init
+synchronize_tsc_ap (void)
+{
+ long i, delta, adj, adjust_latency, n_rounds;
+ unsigned long rt, master_time_stamp, tsc;
+#if DEBUG_TSC_SYNC
+ struct tsc_sync_debug *t =
+ &tsc_sync_debug[smp_processor_id() * NUM_ROUNDS];
+#endif
+
/*
- * We loop a few times to get a primed instruction cache,
- * then the last pass is more or less synchronized and
- * the BP and APs set their cycle counters to zero all at
- * once. This reduces the chance of having random offsets
- * between the processors, and guarantees that the maximum
- * delay between the cycle counters is never bigger than
- * the latency of information-passing (cachelines) between
- * two CPUs.
+ * Wait for our turn to synchronize with the boot processor.
*/
- for (i = 0; i < NR_LOOPS; i++) {
- /*
- * all APs synchronize but they loop on '== num_cpus'
- */
- while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_stop, 0);
- wmb();
- /*
- * this lets the APs save their current TSC:
- */
- atomic_inc(&tsc_count_start);
-
- rdtscll(tsc_values[smp_processor_id()]);
- /*
- * We clear the TSC in the last loop:
- */
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
-
- /*
- * Wait for all APs to leave the synchronization point:
- */
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_start, 0);
- wmb();
- atomic_inc(&tsc_count_stop);
+ while (current_slave != smp_processor_id())
+ rep_nop();
+ adjust_latency = tsc_adj_latency;
+
+ go[SLAVE] = 0;
+ go[MASTER] = 0;
+ write_tsc(0,0);
+ for (i = 0; i < NUM_ROUNDS; ++i) {
+ delta = get_delta(&rt, &master_time_stamp);
+ if (delta == 0)
+ break;
+
+ if (i > 0)
+ adjust_latency += -delta;
+ adj = -delta + adjust_latency/8;
+ rdtscl(tsc);
+ write_tsc(tsc + adj, 0);
+#if DEBUG_TSC_SYNC
+ t[i].rt = rt;
+ t[i].master = master_time_stamp;
+ t[i].diff = delta;
+ t[i].lat = adjust_latency/8;
+#endif
}
+ n_rounds = i;
+ go[MASTER] = ~0;

- sum = 0;
- for (i = 0; i < NR_CPUS; i++) {
- if (test_bit(i, &cpu_callout_map)) {
- t0 = tsc_values[i];
- sum += t0;
- }
- }
- avg = div64(sum, num_booting_cpus());
-
- sum = 0;
- for (i = 0; i < NR_CPUS; i++) {
- if (!test_bit(i, &cpu_callout_map))
- continue;
- delta = tsc_values[i] - avg;
- if (delta < 0)
- delta = -delta;
- /*
- * We report bigger than 2 microseconds clock differences.
- */
- if (delta > 2*one_usec) {
- long realdelta;
- if (!buggy) {
- buggy = 1;
- printk("\n");
- }
- realdelta = div64(delta, one_usec);
- if (tsc_values[i] < avg)
- realdelta = -realdelta;
-
- printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
- }
-
- sum += delta;
- }
- if (!buggy)
- printk("passed.\n");
- ;
+#if (DEBUG_TSC_SYNC == 2)
+ for (i = 0; i < n_rounds; ++i)
+ printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+ t[i].rt, t[i].master, t[i].diff, t[i].lat);
+
+ printk("CPU %d: synchronized TSC (last diff %ld cycles, maxerr %lu cycles)\n",
+ smp_processor_id(), delta, rt);
+
+ printk("It took %d rounds\n", n_rounds);
+#endif
+ if (rt > max_rt)
+ max_rt = rt;
+ if (delta < 0)
+ delta = -delta;
+ if (delta > max_delta)
+ max_delta = delta;
+ tsc_adj_latency = adjust_latency;
+ current_slave = -1;
+ while (!tsc_sync_complete)
+ rep_nop();
}

-static void __init synchronize_tsc_ap (void)
-{
- int i;
+/*
+ * The boot processor sets its own TSC to zero and then gives each
+ * slave processor the chance to synchronize itself.
+ */

- /*
- * Not every cpu is online at the time
- * this gets called, so we first wait for the BP to
- * finish SMP initialization:
- */
- while (!atomic_read(&tsc_start_flag)) mb();
+static void __init synchronize_tsc_bp (void)
+{
+ unsigned int tsc_low, tsc_high, error;
+ int cpu;

- for (i = 0; i < NR_LOOPS; i++) {
- atomic_inc(&tsc_count_start);
- while (atomic_read(&tsc_count_start) != num_booting_cpus())
- mb();
-
- rdtscll(tsc_values[smp_processor_id()]);
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
+ printk("start TSC synchronization\n");
+ write_tsc(0, 0);

- atomic_inc(&tsc_count_stop);
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (!(cpu_callin_map & (1<<cpu)))
+ continue;
+ if (cpu == smp_processor_id())
+ continue;
+ go[MASTER] = 0;
+ current_slave = cpu;
+ sync_master();
+ while (current_slave != -1)
+ rep_nop();
+ }
+ rdtsc(tsc_low, tsc_high);
+ if (tsc_high)
+ printk("TSC overflowed during synchronization\n");
+ else
+ printk("TSC synchronization complete max_delta=%d cycles\n",
+ max_delta);
+ if (max_rt < 4293) {
+ error = (max_rt * 1000000)/cpu_khz;
+ printk("TSC sync round-trip time %d.%03d microseconds\n",
+ error/1000, error%1000);
+ } else {
+ printk("TSC sync round-trip time %d cycles\n", max_rt);
}
+ tsc_sync_complete = 1;
}
-#undef NR_LOOPS

extern void calibrate_delay(void);