Date: Fri, 7 Dec 2007 14:55:25 +0100
From: Ingo Molnar <mingo@elte.hu>
To: Guillaume Chazarain
Cc: Thomas Gleixner, Stefano Brivio, Robert Love, linux-kernel@vger.kernel.org,
    Dave Jones, "Rafael J. Wysocki", Michael Buesch, Andrew Morton
Subject: [patch] x86: scale cyc_2_nsec according to CPU frequency
Message-ID: <20071207135525.GA5588@elte.hu>
References: <20071207021952.6f0ac922@morte> <3d8471ca0712062318j483f8be6h256778752a13639a@mail.gmail.com> <20071207090252.1caf1509@inria.fr>
In-Reply-To: <20071207090252.1caf1509@inria.fr>

* Guillaume Chazarain wrote:

> > > Hmrpf. sched_clock() is used for the time stamp of the printks. We
> > > need to find some better solution other than killing off the tsc
> > > access completely.
> >
> > Something like http://lkml.org/lkml/2007/3/16/291 that would need
> > some refresh?
>
> And here is a refreshed one just for testing with 2.6-git. The 64 bit
> part is a shamelessly untested copy/paste as I cannot test it.

Guillaume, I've updated your patch with a handful of changes - see the
result below.

Firstly, we don't need the 'offset' anymore, because cpu_clock()
maintains offsets itself. This simplifies the math and speeds up the
sched_clock() common case.

Secondly, with PER_CPU variables we need to update them for all
possible CPUs - otherwise some could be left with a zero scaling
factor, which is not good. (Not all CPUs are cpufreq-capable.)

Thirdly, we can be a bit smarter and faster by using the fact that
sections running with interrupts disabled are preempt-safe - so we can
use per_cpu() instead of get_cpu_var().
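To see the conversion in isolation: the patch turns cycles into
nanoseconds with one multiply and one shift, using a precomputed
scaling factor. Below is a minimal user-space sketch of that
fixed-point math; the constants mirror the patch, while main() and the
2 GHz example figure are illustrative assumptions only.

/*
 * Fixed-point cycles->ns conversion, as used by the patch below.
 * Build with: gcc -o cyc2ns cyc2ns.c
 */
#include <stdio.h>

#define NSEC_PER_MSEC		1000000UL
#define CYC2NS_SCALE_FACTOR	10	/* 2^10, carefully chosen */

/* scale = (10^6 << 10) / cpu_khz, as in set_cyc2ns_scale() */
static unsigned long cyc2ns_scale(unsigned long cpu_khz)
{
	return (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

/* ns = cycles * scale >> 10, as in __cycles_2_ns() */
static unsigned long long cycles_2_ns(unsigned long long cyc,
				      unsigned long scale)
{
	return (cyc * scale) >> CYC2NS_SCALE_FACTOR;
}

int main(void)
{
	/* 2 GHz CPU: 2,000,000 cycles should come out as ~1,000,000 ns */
	unsigned long scale = cyc2ns_scale(2000000);	/* cpu_khz */

	printf("scale=%lu, 2000000 cyc = %llu ns\n",
	       scale, cycles_2_ns(2000000ULL, scale));
	return 0;
}

With a 2 GHz clock the scale factor is (10^6 << 10) / (2 * 10^6) = 512,
so 2,000,000 cycles map to exactly 1,000,000 ns - one millisecond, as
expected.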
	Ingo

----------------->
Subject: x86: scale cyc_2_nsec according to CPU frequency
From: "Guillaume Chazarain"

Scale the sched_clock() cyc_2_nsec scaling factor according to CPU
frequency changes.

[ mingo@elte.hu: simplified it and fixed it for SMP. ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/tsc_32.c |   41 +++++++++++++++++++++++++++-----
 arch/x86/kernel/tsc_64.c |   59 +++++++++++++++++++++++++++++++++++++++--------
 include/asm-x86/timer.h  |   23 ++++++++++++++----
 3 files changed, 102 insertions(+), 21 deletions(-)

Index: linux-x86.q/arch/x86/kernel/tsc_32.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/tsc_32.c
+++ linux-x86.q/arch/x86/kernel/tsc_32.c
@@ -5,6 +5,7 @@
 #include <linux/jiffies.h>
 #include <linux/init.h>
 #include <linux/dmi.h>
+#include <linux/percpu.h>
 
 #include <asm/delay.h>
 #include <asm/tsc.h>
@@ -78,15 +79,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
  * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
  * (mathieu.desnoyers@polymtl.ca)
  *
+ * ns += offset to avoid sched_clock jumps with cpufreq
+ *
  * -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-unsigned long cyc2ns_scale __read_mostly;
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+DEFINE_PER_CPU(unsigned long, cyc2ns);
 
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
-	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+	unsigned long flags, prev_scale, *scale;
+	unsigned long long tsc_now, ns_now;
+
+	local_irq_save(flags);
+	scale = &per_cpu(cyc2ns, cpu);
+
+	rdtscll(tsc_now);
+	ns_now = __cycles_2_ns(tsc_now);
+
+	prev_scale = *scale;
+	if (cpu_khz)
+		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+
+	printk("CPU#%d: changed cyc2ns scale from %ld to %ld\n",
+	       cpu, prev_scale, *scale);
+	local_irq_restore(flags);
 }
 
 /*
@@ -239,7 +256,9 @@ time_cpufreq_notifier(struct notifier_bl
 						ref_freq, freq->new);
 			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
 				tsc_khz = cpu_khz;
-				set_cyc2ns_scale(cpu_khz);
+				preempt_disable();
+				set_cyc2ns_scale(cpu_khz, smp_processor_id());
+				preempt_enable();
 				/*
 				 * TSC based sched_clock turns
 				 * to junk w/ cpufreq
@@ -367,6 +386,8 @@ static inline void check_geode_tsc_relia
 
 void __init tsc_init(void)
 {
+	int cpu;
+
 	if (!cpu_has_tsc || tsc_disable)
 		goto out_no_tsc;
 
@@ -380,7 +401,15 @@ void __init tsc_init(void)
 				(unsigned long)cpu_khz / 1000,
 				(unsigned long)cpu_khz % 1000);
 
-	set_cyc2ns_scale(cpu_khz);
+	/*
+	 * Secondary CPUs do not run through tsc_init(), so set up
+	 * all the scale factors for all CPUs, assuming the same
+	 * speed as the bootup CPU. (cpufreq notifiers will fix this
+	 * up if their speed diverges)
+	 */
+	for_each_possible_cpu(cpu)
+		set_cyc2ns_scale(cpu_khz, cpu);
+
 	use_tsc_delay();
 
 	/* Check and install the TSC clocksource */
Index: linux-x86.q/arch/x86/kernel/tsc_64.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/tsc_64.c
+++ linux-x86.q/arch/x86/kernel/tsc_64.c
@@ -10,6 +10,7 @@
 
 #include <asm/hpet.h>
 #include <asm/timex.h>
+#include <asm/timer.h>
 
 static int notsc __initdata = 0;
 
@@ -18,16 +19,50 @@ EXPORT_SYMBOL(cpu_khz);
 unsigned int tsc_khz;
 EXPORT_SYMBOL(tsc_khz);
 
-static unsigned int cyc2ns_scale __read_mostly;
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ *		ns = cycles / (freq / ns_per_sec)
+ *		ns = cycles * (ns_per_sec / freq)
+ *		ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *		ns = cycles * (10^6 / cpu_khz)
+ *
+ *	Then we use scaling math (suggested by george@mvista.com) to get:
+ *		ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *		ns = cycles * cyc2ns_scale / SC
+ *
+ *	And since SC is a constant power of two, we can convert the div
+ *	into a shift.
+ *
+ *	We can use khz divisor instead of mhz to keep a better precision, since
+ *	cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *	(mathieu.desnoyers@polymtl.ca)
+ *
+ *	ns += offset to avoid sched_clock jumps with cpufreq
+ *
+ *	-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+DEFINE_PER_CPU(unsigned long, cyc2ns);
 
-static inline void set_cyc2ns_scale(unsigned long khz)
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
-	cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
-}
+	unsigned long flags, prev_scale, *scale;
+	unsigned long long tsc_now, ns_now;
 
-static unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> NS_SCALE;
+	local_irq_save(flags);
+	scale = &per_cpu(cyc2ns, cpu);
+
+	rdtscll(tsc_now);
+	ns_now = __cycles_2_ns(tsc_now);
+
+	prev_scale = *scale;
+	if (cpu_khz)
+		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+
+	printk("CPU#%d: changed cyc2ns scale from %ld to %ld\n",
+	       cpu, prev_scale, *scale);
+
+	local_irq_restore(flags);
 }
 
 unsigned long long sched_clock(void)
@@ -100,7 +135,9 @@ static int time_cpufreq_notifier(struct
 			mark_tsc_unstable("cpufreq changes");
 	}
 
-	set_cyc2ns_scale(tsc_khz_ref);
+	preempt_disable();
+	set_cyc2ns_scale(tsc_khz_ref, smp_processor_id());
+	preempt_enable();
 
 	return 0;
 }
@@ -151,7 +188,7 @@ static unsigned long __init tsc_read_ref
 void __init tsc_calibrate(void)
 {
 	unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
-	int hpet = is_hpet_enabled();
+	int hpet = is_hpet_enabled(), cpu;
 
 	local_irq_save(flags);
 
@@ -206,7 +243,9 @@ void __init tsc_calibrate(void)
 	}
 
 	tsc_khz = tsc2 / tsc1;
-	set_cyc2ns_scale(tsc_khz);
+
+	for_each_possible_cpu(cpu)
+		set_cyc2ns_scale(tsc_khz, cpu);
 }
 
 /*
Index: linux-x86.q/include/asm-x86/timer.h
===================================================================
--- linux-x86.q.orig/include/asm-x86/timer.h
+++ linux-x86.q/include/asm-x86/timer.h
@@ -2,6 +2,7 @@
 #define _ASMi386_TIMER_H
 #include <linux/init.h>
 #include <linux/pm.h>
+#include <linux/percpu.h>
 
 #define TICK_SIZE (tick_nsec / 1000)
 
@@ -16,7 +17,7 @@ extern int recalibrate_cpu_khz(void);
 #define calculate_cpu_khz() native_calculate_cpu_khz()
 #endif
 
-/* Accellerators for sched_clock()
+/* Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 * basic equation:
 *		ns = cycles / (freq / ns_per_sec)
@@ -31,20 +32,32 @@ extern int recalibrate_cpu_khz(void);
 * And since SC is a constant power of two, we can convert the div
 * into a shift.
 *
- * We can use khz divisor instead of mhz to keep a better percision, since
+ * We can use khz divisor instead of mhz to keep a better precision, since
 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
 * (mathieu.desnoyers@polymtl.ca)
 *
 *		-johnstul@us.ibm.com "math is hard, lets go shopping!"
 */
-extern unsigned long cyc2ns_scale __read_mostly;
+
+DECLARE_PER_CPU(unsigned long, cyc2ns);
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
 {
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+	return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
 }
 
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	unsigned long long ns;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ns = __cycles_2_ns(cyc);
+	local_irq_restore(flags);
+
+	return ns;
+}
 #endif
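The per-CPU bookkeeping above can be mimicked in user space. A small
sketch follows, assuming a fixed NR_CPUS of 4 and hypothetical sim_*
helper names (neither is kernel API): every slot is initialized at
"boot" so no CPU is left with a zero scale, and a simulated cpufreq
transition rescales only the affected CPU.

/*
 * User-space analogue of the per-CPU cyc2ns handling. The cyc2ns
 * array stands in for DEFINE_PER_CPU(unsigned long, cyc2ns).
 */
#include <stdio.h>

#define NR_CPUS			4
#define NSEC_PER_MSEC		1000000UL
#define CYC2NS_SCALE_FACTOR	10

static unsigned long cyc2ns[NR_CPUS];

static void sim_set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
	if (cpu_khz)
		cyc2ns[cpu] = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

int main(void)
{
	int cpu;

	/*
	 * Boot: initialize every possible slot, assuming the bootup
	 * CPU's speed (the second point in the mail above).
	 */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sim_set_cyc2ns_scale(2000000, cpu);	/* 2 GHz */

	/* cpufreq transition on CPU 1 only: 2 GHz -> 1 GHz */
	sim_set_cyc2ns_scale(1000000, 1);

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("CPU#%d: cyc2ns scale %lu\n", cpu, cyc2ns[cpu]);
	return 0;
}

After the transition, CPU#1's scale doubles from 512 to 1024 while the
other CPUs keep their boot-time value, which is exactly what the
cpufreq notifier hooks in the patch arrange.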
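The __cycles_2_ns()/cycles_2_ns() split can be illustrated the same
way. In the kernel, local_irq_save() both keeps the task on the
current CPU and excludes a concurrent scale update; user space has no
equivalent, so in this sketch a pthread mutex merely stands in for it
to keep the example runnable - it is not what the kernel does.

/*
 * Sketch of the irq-safe conversion wrapper. Build with:
 * gcc -pthread -o cyc2ns_safe cyc2ns_safe.c
 */
#include <pthread.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR	10

static unsigned long cyc2ns_scale = 512;	/* 2 GHz, per the math above */
static pthread_mutex_t scale_lock = PTHREAD_MUTEX_INITIALIZER;

/* kernel: __cycles_2_ns(), called with interrupts already disabled */
static unsigned long long __cycles_2_ns(unsigned long long cyc)
{
	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}

/* kernel: cycles_2_ns(), which wraps the raw helper in local_irq_save() */
static unsigned long long cycles_2_ns(unsigned long long cyc)
{
	unsigned long long ns;

	pthread_mutex_lock(&scale_lock);	/* stands in for local_irq_save() */
	ns = __cycles_2_ns(cyc);
	pthread_mutex_unlock(&scale_lock);	/* stands in for local_irq_restore() */

	return ns;
}

int main(void)
{
	printf("2000000 cyc = %llu ns\n", cycles_2_ns(2000000ULL));
	return 0;
}

The point of the split is the fast path: sched_clock() already runs
with interrupts disabled, so it can call the raw __cycles_2_ns()
directly and skip the save/restore that external callers need.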