Date: Thu, 16 Jan 2003 11:44:55 -0500
Message-Id: <200301161644.h0GGitX02052@linux.local>
From: Jim Houston
To: linux-kernel@vger.kernel.org, high-res-timers-discourse@lists.sourceforge.net, jim.houston@ccur.com
Subject: [PATCH] improved boot time TSC synchronization
Reply-to: jim.houston@attbi.com
Sender: linux-kernel-owner@vger.kernel.org
X-Mailing-List: linux-kernel@vger.kernel.org

Hi Everyone,

The current TSC synchronization may have an error on the order of
1 microsecond on a quad processor system.  Not a big deal, but annoying
if you are trying to figure out the order in which things happen based
on TSC time stamps.  Since the IA64 folk already solved this problem,
I did a quick hack based on the itc sync code.

The current code tries to synchronize all of the processors with a
single event.  The master processor clears its TSC and changes a flag.
The slave processors watch for the flag to change and then clear their
TSCs.  On the bus this ends up being a write which invalidates the
other processors' caches, followed by a set of reads as all of the
processors reload the cache line.  It is this scramble to re-read the
cache line that introduces the error.

Following the IA64 model, the new code synchronizes the TSCs of pairs
of processors.  It uses a feedback loop to compensate for the errors.
It should produce perfect synchronization on systems where the memory
access times are symmetric.  I'm still looking for a way to validate
the synchronization that doesn't itself rely on synchronizing through
memory.  (A rough user-space sketch of the arithmetic follows the
patch.)

Of course, there are systems with wandering TSCs.  This patch doesn't
do anything to keep the TSCs synchronized after boot.

The patch was tested against 2.5.58.

Jim Houston - Concurrent Computer Corp.

--- linux-2.5.58.orig/arch/i386/kernel/smpboot.c	Thu Jan 16 10:52:09 2003
+++ linux-2.5.58/arch/i386/kernel/smpboot.c	Thu Jan 16 10:55:52 2003
@@ -169,167 +169,231 @@
 }
 
 /*
- * TSC synchronization.
- *
- * We first check wether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
+ * TSC synchronization based on ia64 itc synchronization code.  Synchronize
+ * pairs of processors rather than trying to synchronize all of the processors
+ * with a single event.  When several processors are all waiting for an
+ * event they don't all see it at the same time.  The write will cause
+ * an invalidate on each processor's cache and then they all scramble to
+ * re-read that cache line.
+ *
+ * Writing the TSC resets the upper 32 bits, so we need to be careful
+ * that all of the cpus can be synchronized before we overflow the
+ * 32-bit count.
  */
-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
+#define MASTER	0
+#define SLAVE	(SMP_CACHE_BYTES/sizeof(long))
 
-#define NR_LOOPS 5
+#define NUM_ROUNDS	64	/* magic value */
+#define NUM_ITERS	5	/* likewise */
 
-/*
- * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit
- * multiplication. Not terribly optimized but we need it at boot time only
- * anyway.
- *
- * result == a / b
- *      == (a1 + a2*(2^32)) / b
- *      == a1/b + a2*(2^32/b)
- *      == a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b
- *          ^---- (this multiplication can overflow)
- */
+static volatile unsigned long go[2*SLAVE] __cacheline_aligned;
+static volatile int current_slave = -1;
+static volatile int tsc_sync_complete = 0;
+static volatile int tsc_adj_latency = 0;
+static unsigned int max_rt = 0;
+static unsigned int max_delta = 0;
+
+#define DEBUG_TSC_SYNC 0
+#if DEBUG_TSC_SYNC
+struct tsc_sync_debug {
+        long rt;        /* roundtrip time */
+        long master;    /* master's timestamp */
+        long diff;      /* difference between midpoint and master's timestamp */
+        long lat;       /* estimate of tsc adjustment latency */
+} tsc_sync_debug[NUM_ROUNDS*NR_CPUS];
+#endif
 
-static unsigned long long __init div64 (unsigned long long a, unsigned long b0)
+void
+sync_master(void)
 {
-        unsigned int a1, a2;
-        unsigned long long res;
-
-        a1 = ((unsigned int*)&a)[0];
-        a2 = ((unsigned int*)&a)[1];
+        unsigned long n, tsc, last_go_master;
 
-        res = a1/b0 +
-                (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) +
-                a2 / b0 +
-                (a2 * (0xffffffff % b0)) / b0;
-
-        return res;
+        last_go_master = 0;
+        while (1) {
+                while ((n = go[MASTER]) == last_go_master)
+                        rep_nop();
+                if (n == ~0)
+                        break;
+                rdtscl(tsc);
+                if (unlikely(!tsc))
+                        tsc = 1;
+                go[SLAVE] = tsc;
+                last_go_master = n;
+        }
 }
 
-static void __init synchronize_tsc_bp (void)
+/*
+ * Return the number of cycles by which our TSC differs from the TSC on
+ * the master (time-keeper) CPU.  A positive number indicates our TSC is
+ * ahead of the master, negative that it is behind.
+ */
+static inline long
+get_delta (long *rt, long *master)
 {
-        int i;
-        unsigned long long t0;
-        unsigned long long sum, avg;
-        long long delta;
-        unsigned long one_usec;
-        int buggy = 0;
+        unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+        unsigned long tcenter, t0, t1, tm, last_go_slave;
+        long i;
+
+        last_go_slave = go[SLAVE];
+        for (i = 0; i < NUM_ITERS; ++i) {
+                rdtscl(t0);
+                go[MASTER] = i+1;
+                while ((tm = go[SLAVE]) == last_go_slave)
+                        rep_nop();
+                rdtscl(t1);
+
+                if (t1 - t0 < best_t1 - best_t0)
+                        best_t0 = t0, best_t1 = t1, best_tm = tm;
+                last_go_slave = tm;
+        }
 
-        printk("checking TSC synchronization across %u CPUs: ", num_booting_cpus());
+        *rt = best_t1 - best_t0;
+        *master = best_tm - best_t0;
 
-        one_usec = cpu_khz/1000;
+        /* average best_t0 and best_t1 without overflow: */
+        tcenter = (best_t0/2 + best_t1/2);
+        if (best_t0 % 2 + best_t1 % 2 == 2)
+                ++tcenter;
+        return tcenter - best_tm;
+}
 
-        atomic_set(&tsc_start_flag, 1);
-        wmb();
+/*
+ * Synchronize TSC of the current (slave) CPU with the TSC of the MASTER CPU
+ * (normally the time-keeper CPU).  We use a closed loop to eliminate the
+ * possibility of unaccounted-for errors (such as getting a machine check in
+ * the middle of a calibration step).  The basic idea is for the slave to ask
+ * the master what TSC value it has and to read its own TSC before and after
+ * the master responds.  Each iteration gives us three
+ * timestamps:
+ *
+ *      slave           master
+ *
+ *      t0 ---\
+ *              ---\
+ *                  --->
+ *                       tm
+ *                  /---
+ *              /---
+ *      t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly half-way
+ * between t0 and t1.  If we achieve this, the clocks are synchronized provided
+ * the interconnect between the slave and the master is symmetric.  Even if the
+ * interconnect were asymmetric, we would still know that the synchronization
+ * error is smaller than the roundtrip latency (t1 - t0).
+ *
+ * When the interconnect is quiet and symmetric, this lets us synchronize the
+ * TSC to within one or two cycles.  However, we can only *guarantee* that the
+ * synchronization is accurate to within a round-trip time, which is typically
+ * in the range of several hundred cycles (e.g., ~500 cycles).  In practice,
+ * this means that the TSCs are usually almost perfectly synchronized, but we
+ * shouldn't assume that the accuracy is much better than half a microsecond
+ * or so.
+ */
+static void __init
+synchronize_tsc_ap (void)
+{
+        long i, delta, adj, adjust_latency, n_rounds;
+        unsigned long rt, master_time_stamp, tsc;
+#if DEBUG_TSC_SYNC
+        struct tsc_sync_debug *t =
+                &tsc_sync_debug[smp_processor_id() * NUM_ROUNDS];
+#endif
+
         /*
-         * We loop a few times to get a primed instruction cache,
-         * then the last pass is more or less synchronized and
-         * the BP and APs set their cycle counters to zero all at
-         * once. This reduces the chance of having random offsets
-         * between the processors, and guarantees that the maximum
-         * delay between the cycle counters is never bigger than
-         * the latency of information-passing (cachelines) between
-         * two CPUs.
+         * Wait for our turn to synchronize with the boot processor.
          */
-        for (i = 0; i < NR_LOOPS; i++) {
-                /*
-                 * all APs synchronize but they loop on '== num_cpus'
-                 */
-                while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
-                        mb();
-                atomic_set(&tsc_count_stop, 0);
-                wmb();
-                /*
-                 * this lets the APs save their current TSC:
-                 */
-                atomic_inc(&tsc_count_start);
-
-                rdtscll(tsc_values[smp_processor_id()]);
-                /*
-                 * We clear the TSC in the last loop:
-                 */
-                if (i == NR_LOOPS-1)
-                        write_tsc(0, 0);
-
-                /*
-                 * Wait for all APs to leave the synchronization point:
-                 */
-                while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
-                        mb();
-                atomic_set(&tsc_count_start, 0);
-                wmb();
-                atomic_inc(&tsc_count_stop);
+        while (current_slave != smp_processor_id())
+                rep_nop();
+        adjust_latency = tsc_adj_latency;
+
+        go[SLAVE] = 0;
+        go[MASTER] = 0;
+        write_tsc(0,0);
+        for (i = 0; i < NUM_ROUNDS; ++i) {
+                delta = get_delta(&rt, &master_time_stamp);
+                if (delta == 0)
+                        break;
+
+                if (i > 0)
+                        adjust_latency += -delta;
+                adj = -delta + adjust_latency/8;
+                rdtscl(tsc);
+                write_tsc(tsc + adj, 0);
+#if DEBUG_TSC_SYNC
+                t[i].rt = rt;
+                t[i].master = master_time_stamp;
+                t[i].diff = delta;
+                t[i].lat = adjust_latency/8;
+#endif
         }
+        n_rounds = i;
+        go[MASTER] = ~0;
 
-        sum = 0;
-        for (i = 0; i < NR_CPUS; i++) {
-                if (test_bit(i, &cpu_callout_map)) {
-                        t0 = tsc_values[i];
-                        sum += t0;
-                }
-        }
-        avg = div64(sum, num_booting_cpus());
-
-        sum = 0;
-        for (i = 0; i < NR_CPUS; i++) {
-                if (!test_bit(i, &cpu_callout_map))
-                        continue;
-                delta = tsc_values[i] - avg;
-                if (delta < 0)
-                        delta = -delta;
-                /*
-                 * We report bigger than 2 microseconds clock differences.
-                 */
-                if (delta > 2*one_usec) {
-                        long realdelta;
-                        if (!buggy) {
-                                buggy = 1;
-                                printk("\n");
-                        }
-                        realdelta = div64(delta, one_usec);
-                        if (tsc_values[i] < avg)
-                                realdelta = -realdelta;
-
-                        printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
-                }
-
-                sum += delta;
-        }
-        if (!buggy)
-                printk("passed.\n");
-        ;
+#if (DEBUG_TSC_SYNC == 2)
+        for (i = 0; i < n_rounds; ++i)
+                printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+                       t[i].rt, t[i].master, t[i].diff, t[i].lat);
+
+        printk("CPU %d: synchronized TSC (last diff %ld cycles, maxerr %lu cycles)\n",
+               smp_processor_id(), delta, rt);
+
+        printk("It took %ld rounds\n", n_rounds);
+#endif
+        if (rt > max_rt)
+                max_rt = rt;
+        if (delta < 0)
+                delta = -delta;
+        if (delta > max_delta)
+                max_delta = delta;
+        tsc_adj_latency = adjust_latency;
+        current_slave = -1;
+        while (!tsc_sync_complete)
+                rep_nop();
 }
 
-static void __init synchronize_tsc_ap (void)
-{
-        int i;
+/*
+ * The boot processor sets its own TSC to zero and then gives each
+ * slave processor the chance to synchronize itself.
+ */
 
-        /*
-         * Not every cpu is online at the time
-         * this gets called, so we first wait for the BP to
-         * finish SMP initialization:
-         */
-        while (!atomic_read(&tsc_start_flag)) mb();
+static void __init synchronize_tsc_bp (void)
+{
+        unsigned int tsc_low, tsc_high, error;
+        int cpu;
 
-        for (i = 0; i < NR_LOOPS; i++) {
-                atomic_inc(&tsc_count_start);
-                while (atomic_read(&tsc_count_start) != num_booting_cpus())
-                        mb();
-
-                rdtscll(tsc_values[smp_processor_id()]);
-                if (i == NR_LOOPS-1)
-                        write_tsc(0, 0);
+        printk("start TSC synchronization\n");
+        write_tsc(0, 0);
 
-                atomic_inc(&tsc_count_stop);
-                while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (!(cpu_callin_map & (1<<cpu)))
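
For reference, here is a rough user-space sketch of the midpoint calculation
from get_delta() and the feedback adjustment from synchronize_tsc_ap().  It
is only an illustration, not part of the patch: the one-way latencies (200
and 300 cycles), the 40-cycle write latency, and the initial 1000-cycle
offset are invented numbers rather than measurements, and the
best-of-NUM_ITERS filtering is omitted.

/* tsc_sync_sketch.c - user-space illustration of the midpoint and
 * feedback arithmetic used above.  All latency numbers and the initial
 * offset are invented for the example. */
#include <stdio.h>

#define NUM_ROUNDS      64

static long master_clock = 0;           /* pretend master TSC */
static long slave_offset = 1000;        /* pretend the slave starts 1000 cycles ahead */

/* What the slave would see if it executed rdtsc right now. */
static long slave_rdtsc(void)
{
        return master_clock + slave_offset;
}

/*
 * One handshake: the slave reads t0, the master replies with tm, the
 * slave reads t1.  The 200/300 cycle one-way delays are arbitrary and
 * deliberately asymmetric.  (The real get_delta() repeats this NUM_ITERS
 * times and keeps the round with the smallest roundtrip.)
 */
static long get_delta(long *rt, long *master)
{
        long t0, tm, t1, tcenter;

        t0 = slave_rdtsc();
        master_clock += 200;            /* slave -> master delay */
        tm = master_clock;
        master_clock += 300;            /* master -> slave delay */
        t1 = slave_rdtsc();

        *rt = t1 - t0;
        *master = tm - t0;

        /* midpoint of t0..t1 without overflow, as in the patch */
        tcenter = t0/2 + t1/2;
        if (t0 % 2 + t1 % 2 == 2)
                ++tcenter;
        return tcenter - tm;            /* >0: slave ahead of master */
}

int main(void)
{
        long i, delta, adj, adjust_latency = 0, rt, master_ts;

        for (i = 0; i < NUM_ROUNDS; ++i) {
                delta = get_delta(&rt, &master_ts);
                if (delta == 0)
                        break;
                if (i > 0)
                        adjust_latency += -delta;
                adj = -delta + adjust_latency/8;
                /* stand-in for write_tsc(tsc + adj, 0); the extra -40 models
                 * the cycles lost between reading the TSC and the write
                 * taking effect. */
                slave_offset += adj - 40;
                printf("round %2ld: rt=%ld delta=%ld adj=%ld offset=%ld\n",
                       i, rt, delta, adj, slave_offset);
        }
        printf("final offset: %ld cycles\n", slave_offset);
        return 0;
}

With these invented numbers the loop settles with the slave about 50 cycles
off, which is half of the simulated 100-cycle asymmetry and well inside the
500-cycle roundtrip, matching the error bound described in the comment block
in the patch.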