Date: Fri, 7 Dec 2007 13:25:16 +0100
From: Ingo Molnar
To: stefano.brivio@polimi.it
Cc: Nick Piggin, Robert Love, linux-kernel@vger.kernel.org, Dave Jones,
    "Rafael J. Wysocki", Michael Buesch, Thomas Gleixner, Andrew Morton,
    Len Brown, Guillaume Chazarain
Subject: Re: [PATCH] scheduler: fix x86 regression in native_sched_clock
Message-ID: <20071207122516.GA25115@elte.hu>
References: <20071207021952.6f0ac922@morte> <20071207084559.GA11162@elte.hu>
 <200712072213.06530.nickpiggin@yahoo.com.au>
 <20071207122310.maqkoxlb4gsg4ggc@webmail.polimi.it>
In-Reply-To: <20071207122310.maqkoxlb4gsg4ggc@webmail.polimi.it>

ok, here's a rollup of 11 patches that relate to this. I hoped we could
wait with this for 2.6.25, but it seems more urgent as per Stefano's
testing, as udelay() and drivers are affected as well.

Stefano, could you try this on top of a recent-ish Linus tree - does this
resolve all issues? (without introducing new ones ;-)

	Ingo
Index: linux/arch/arm/kernel/time.c
===================================================================
--- linux.orig/arch/arm/kernel/time.c
+++ linux/arch/arm/kernel/time.c
@@ -79,17 +79,6 @@ static unsigned long dummy_gettimeoffset
 }
 #endif
 
-/*
- * An implementation of printk_clock() independent from
- * sched_clock(). This avoids non-bootable kernels when
- * printk_clock is enabled.
- */
-unsigned long long printk_clock(void)
-{
-	return (unsigned long long)(jiffies - INITIAL_JIFFIES) *
-			(1000000000 / HZ);
-}
-
 static unsigned long next_rtc_update;
 
 /*
Index: linux/arch/ia64/kernel/time.c
===================================================================
--- linux.orig/arch/ia64/kernel/time.c
+++ linux/arch/ia64/kernel/time.c
@@ -344,33 +344,6 @@ udelay (unsigned long usecs)
 }
 EXPORT_SYMBOL(udelay);
 
-static unsigned long long ia64_itc_printk_clock(void)
-{
-	if (ia64_get_kr(IA64_KR_PER_CPU_DATA))
-		return sched_clock();
-	return 0;
-}
-
-static unsigned long long ia64_default_printk_clock(void)
-{
-	return (unsigned long long)(jiffies_64 - INITIAL_JIFFIES) *
-		(1000000000/HZ);
-}
-
-unsigned long long (*ia64_printk_clock)(void) = &ia64_default_printk_clock;
-
-unsigned long long printk_clock(void)
-{
-	return ia64_printk_clock();
-}
-
-void __init
-ia64_setup_printk_clock(void)
-{
-	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT))
-		ia64_printk_clock = ia64_itc_printk_clock;
-}
-
 /* IA64 doesn't cache the timezone */
 void update_vsyscall_tz(void)
 {
Index: linux/arch/x86/kernel/process_32.c
===================================================================
--- linux.orig/arch/x86/kernel/process_32.c
+++ linux/arch/x86/kernel/process_32.c
@@ -113,10 +113,19 @@ void default_idle(void)
 		smp_mb();
 
 		local_irq_disable();
-		if (!need_resched())
+		if (!need_resched()) {
+			ktime_t t0, t1;
+			u64 t0n, t1n;
+
+			t0 = ktime_get();
+			t0n = ktime_to_ns(t0);
 			safe_halt();	/* enables interrupts racelessly */
-		else
-			local_irq_enable();
+			local_irq_disable();
+			t1 = ktime_get();
+			t1n = ktime_to_ns(t1);
+			sched_clock_idle_wakeup_event(t1n - t0n);
+		}
+		local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
 	} else {
 		/* loop is done by the caller */
Index: linux/arch/x86/kernel/tsc_32.c
===================================================================
--- linux.orig/arch/x86/kernel/tsc_32.c
+++ linux/arch/x86/kernel/tsc_32.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -78,15 +79,32 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
  * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
  * (mathieu.desnoyers@polymtl.ca)
  *
+ * ns += offset to avoid sched_clock jumps with cpufreq
+ *
  *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-unsigned long cyc2ns_scale __read_mostly;
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+DEFINE_PER_CPU(struct cyc2ns_params, cyc2ns) __read_mostly;
+
+static void set_cyc2ns_scale(unsigned long cpu_khz)
 {
-	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+	struct cyc2ns_params *params;
+	unsigned long flags;
+	unsigned long long tsc_now, ns_now;
+
+	rdtscll(tsc_now);
+	params = &get_cpu_var(cyc2ns);
+
+	local_irq_save(flags);
+	ns_now = __cycles_2_ns(params, tsc_now);
+
+	params->scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+	params->offset += ns_now - __cycles_2_ns(params, tsc_now);
+	local_irq_restore(flags);
+
+	put_cpu_var(cyc2ns);
 }
 
 /*
Index: linux/arch/x86/kernel/tsc_64.c
===================================================================
--- linux.orig/arch/x86/kernel/tsc_64.c
+++ linux/arch/x86/kernel/tsc_64.c
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 
 static int notsc __initdata = 0;
 
@@ -18,16 +19,25 @@ EXPORT_SYMBOL(cpu_khz);
 unsigned int tsc_khz;
 EXPORT_SYMBOL(tsc_khz);
 
-static unsigned int cyc2ns_scale __read_mostly;
+DEFINE_PER_CPU(struct cyc2ns_params, cyc2ns) __read_mostly;
 
-static inline void set_cyc2ns_scale(unsigned long khz)
+static void set_cyc2ns_scale(unsigned long cpu_khz)
 {
-	cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
-}
+	struct cyc2ns_params *params;
+	unsigned long flags;
+	unsigned long long tsc_now, ns_now;
 
-static unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> NS_SCALE;
+	rdtscll(tsc_now);
+	params = &get_cpu_var(cyc2ns);
+
+	local_irq_save(flags);
+	ns_now = __cycles_2_ns(params, tsc_now);
+
+	params->scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+	params->offset += ns_now - __cycles_2_ns(params, tsc_now);
+	local_irq_restore(flags);
+
+	put_cpu_var(cyc2ns);
 }
 
 unsigned long long sched_clock(void)
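To make the tsc_32.c/tsc_64.c change above easier to follow: cycles are
converted as ns = (cyc * scale >> 10) + offset, and whenever cpufreq changes
the scale, the offset is re-based so the value sched_clock() returns stays
continuous. A stand-alone user-space sketch of that invariant (the struct
and function names here are illustrative only, not the kernel's):

	#include <stdio.h>

	#define SHIFT	10	/* same idea as CYC2NS_SCALE_FACTOR */

	struct c2n {
		unsigned long long scale;
		unsigned long long offset;
	};

	/* cycles -> nanoseconds with the current (scale, offset) pair */
	static unsigned long long c2n_read(struct c2n *p, unsigned long long cyc)
	{
		return ((cyc * p->scale) >> SHIFT) + p->offset;
	}

	/*
	 * Switch to a new frequency (in kHz) without letting the clock jump:
	 * recompute the scale, then re-base the offset so the reading at the
	 * current cycle count stays exactly where it was. The subtraction may
	 * wrap, but unsigned arithmetic is modular, just like params->offset
	 * in the patch above.
	 */
	static void c2n_set_khz(struct c2n *p, unsigned long long cyc_now,
				unsigned long khz)
	{
		unsigned long long ns_now = c2n_read(p, cyc_now);

		p->scale = (1000000ULL << SHIFT) / khz;
		p->offset += ns_now - c2n_read(p, cyc_now);
	}

	int main(void)
	{
		/* start out at 2 GHz */
		struct c2n p = { .scale = (1000000ULL << SHIFT) / 2000000, .offset = 0 };
		unsigned long long cyc = 4000000000ULL;	/* 2 seconds at 2 GHz */

		printf("at 2 GHz: %llu ns\n", c2n_read(&p, cyc));
		c2n_set_khz(&p, cyc, 1000000);		/* cpufreq drops to 1 GHz */
		printf("at 1 GHz: %llu ns (no jump)\n", c2n_read(&p, cyc));
		return 0;
	}

Both printf lines print the same nanosecond value, which is exactly the
property the per-CPU offset update is meant to preserve.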
Index: linux/arch/x86/lib/delay_32.c
===================================================================
--- linux.orig/arch/x86/lib/delay_32.c
+++ linux/arch/x86/lib/delay_32.c
@@ -38,17 +38,21 @@ static void delay_loop(unsigned long loo
 		:"0" (loops));
 }
 
-/* TSC based delay: */
+/* cpu_clock() [TSC] based delay: */
 static void delay_tsc(unsigned long loops)
 {
-	unsigned long bclock, now;
+	unsigned long long start, stop, now;
+	int this_cpu;
+
+	preempt_disable();
+
+	this_cpu = smp_processor_id();
+	start = now = cpu_clock(this_cpu);
+	stop = start + loops;
+
+	while ((long long)(stop - now) > 0)
+		now = cpu_clock(this_cpu);
 
-	preempt_disable();		/* TSC's are per-cpu */
-	rdtscl(bclock);
-	do {
-		rep_nop();
-		rdtscl(now);
-	} while ((now-bclock) < loops);
 	preempt_enable();
 }
Index: linux/arch/x86/lib/delay_64.c
===================================================================
--- linux.orig/arch/x86/lib/delay_64.c
+++ linux/arch/x86/lib/delay_64.c
@@ -26,19 +26,28 @@ int read_current_timer(unsigned long *ti
 	return 0;
 }
 
-void __delay(unsigned long loops)
+/* cpu_clock() [TSC] based delay: */
+static void delay_tsc(unsigned long loops)
 {
-	unsigned bclock, now;
+	unsigned long long start, stop, now;
+	int this_cpu;
+
+	preempt_disable();
+
+	this_cpu = smp_processor_id();
+	start = now = cpu_clock(this_cpu);
+	stop = start + loops;
+
+	while ((long long)(stop - now) > 0)
+		now = cpu_clock(this_cpu);
 
-	preempt_disable();		/* TSC's are pre-cpu */
-	rdtscl(bclock);
-	do {
-		rep_nop();
-		rdtscl(now);
-	}
-	while ((now-bclock) < loops);
 	preempt_enable();
 }
+
+void __delay(unsigned long loops)
+{
+	delay_tsc(loops);
+}
 EXPORT_SYMBOL(__delay);
 
 inline void __const_udelay(unsigned long xloops)
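The delay_tsc() rewrite in the two delay files above stops counting raw TSC
ticks and instead spins until a nanosecond deadline on cpu_clock(), which is
already frequency-corrected, so udelay() stays accurate across cpufreq
transitions. The same loop shape as a stand-alone user-space sketch, with
clock_gettime(CLOCK_MONOTONIC) standing in for cpu_clock() (illustrative
only, not the kernel code):

	#include <stdio.h>
	#include <stdint.h>
	#include <time.h>

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	/* busy-wait for 'ns' nanoseconds against a monotonic clock */
	static void delay_ns(uint64_t ns)
	{
		uint64_t start = now_ns();
		uint64_t stop = start + ns;

		/* signed comparison of the difference, as in the patch */
		while ((int64_t)(stop - now_ns()) > 0)
			;
	}

	int main(void)
	{
		uint64_t t0 = now_ns();

		delay_ns(5 * 1000 * 1000);	/* ~5 ms */
		printf("delayed ~%llu ns\n", (unsigned long long)(now_ns() - t0));
		return 0;
	}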
Index: linux/drivers/acpi/processor_idle.c
===================================================================
--- linux.orig/drivers/acpi/processor_idle.c
+++ linux/drivers/acpi/processor_idle.c
@@ -531,6 +531,11 @@ static void acpi_processor_idle(void)
 
 	case ACPI_STATE_C3:
 		/*
+		 * Must be done before busmaster disable as we might
+		 * need to access HPET !
+		 */
+		acpi_state_timer_broadcast(pr, cx, 1);
+		/*
 		 * disable bus master
 		 * bm_check implies we need ARB_DIS
 		 * !bm_check implies we need cache flush
@@ -557,7 +562,6 @@ static void acpi_processor_idle(void)
 		/* Get start time (ticks) */
 		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
 		/* Invoke C3 */
-		acpi_state_timer_broadcast(pr, cx, 1);
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
@@ -1401,9 +1405,6 @@ static int acpi_idle_enter_simple(struct
 	if (acpi_idle_suspend)
 		return(acpi_idle_enter_c1(dev, state));
 
-	if (pr->flags.bm_check)
-		acpi_idle_update_bm_rld(pr, cx);
-
 	local_irq_disable();
 	current_thread_info()->status &= ~TS_POLLING;
 	/*
@@ -1418,13 +1419,21 @@ static int acpi_idle_enter_simple(struct
 		return 0;
 	}
 
+	/*
+	 * Must be done before busmaster disable as we might need to
+	 * access HPET !
+	 */
+	acpi_state_timer_broadcast(pr, cx, 1);
+
+	if (pr->flags.bm_check)
+		acpi_idle_update_bm_rld(pr, cx);
+
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
 	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
-	acpi_state_timer_broadcast(pr, cx, 1);
 	acpi_idle_do_entry(cx);
 	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
Index: linux/include/asm-x86/timer.h
===================================================================
--- linux.orig/include/asm-x86/timer.h
+++ linux/include/asm-x86/timer.h
@@ -2,6 +2,7 @@
 #define _ASMi386_TIMER_H
 #include
 #include
+#include
 
 #define TICK_SIZE (tick_nsec / 1000)
 
@@ -16,7 +17,7 @@ extern int recalibrate_cpu_khz(void);
 #define calculate_cpu_khz() native_calculate_cpu_khz()
 #endif
 
-/* Accellerators for sched_clock()
+/* Accelerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
  * basic equation:
  *		ns = cycles / (freq / ns_per_sec)
@@ -31,20 +32,44 @@ extern int recalibrate_cpu_khz(void);
  * And since SC is a constant power of two, we can convert the div
  *  into a shift.
  *
- * We can use khz divisor instead of mhz to keep a better percision, since
+ * We can use khz divisor instead of mhz to keep a better precision, since
 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
 * (mathieu.desnoyers@polymtl.ca)
 *
+ * ns += offset to avoid sched_clock jumps with cpufreq
+ *
 *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
 */
-extern unsigned long cyc2ns_scale __read_mostly;
+
+struct cyc2ns_params {
+	unsigned long scale;
+	unsigned long long offset;
+};
+
+DECLARE_PER_CPU(struct cyc2ns_params, cyc2ns) __read_mostly;
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static inline unsigned long long __cycles_2_ns(struct cyc2ns_params *params,
+					       unsigned long long cyc)
 {
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+	return ((cyc * params->scale) >> CYC2NS_SCALE_FACTOR) + params->offset;
 }
 
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	struct cyc2ns_params *params;
+	unsigned long flags;
+	unsigned long long ns;
+
+	params = &get_cpu_var(cyc2ns);
+
+	local_irq_save(flags);
+	ns = __cycles_2_ns(params, cyc);
+	local_irq_restore(flags);
+
+	put_cpu_var(cyc2ns);
+	return ns;
+}
+
 #endif
Index: linux/kernel/hrtimer.c
===================================================================
--- linux.orig/kernel/hrtimer.c
+++ linux/kernel/hrtimer.c
@@ -850,6 +850,14 @@ hrtimer_start(struct hrtimer *timer, kti
 #ifdef CONFIG_TIME_LOW_RES
 		tim = ktime_add(tim, base->resolution);
 #endif
+		/*
+		 * Careful here: User space might have asked for a
+		 * very long sleep, so the add above might result in a
+		 * negative number, which enqueues the timer in front
+		 * of the queue.
+		 */
+		if (tim.tv64 < 0)
+			tim.tv64 = KTIME_MAX;
 	}
 	timer->expires = tim;
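The hrtimer.c hunk above (and the clockevents.c hunk at the end of this
rollup) guard against a relative timeout so large that adding it to the
current time wraps into a negative ktime, which would otherwise enqueue the
timer in front of everything else. The core idea, reduced to a stand-alone C
illustration (the kernel checks the wrapped result of ktime_add() instead of
pre-checking, since ktime arithmetic wraps; names here are made up):

	#include <stdio.h>
	#include <stdint.h>

	#define NS_MAX INT64_MAX		/* plays the role of KTIME_MAX */

	/*
	 * Absolute expiry for a relative timeout: if the addition would
	 * overflow, clamp to the far future rather than producing a
	 * negative expiry that sorts before every pending timer.
	 */
	static int64_t expiry_ns(int64_t now, int64_t timeout)
	{
		if (timeout > NS_MAX - now)
			return NS_MAX;
		return now + timeout;
	}

	int main(void)
	{
		int64_t now = 1000000000;	/* 1s of "uptime" */

		printf("5s timeout   -> %lld\n", (long long)expiry_ns(now, 5000000000LL));
		printf("huge timeout -> %lld\n", (long long)expiry_ns(now, INT64_MAX));
		return 0;
	}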
Index: linux/kernel/lockdep.c
===================================================================
--- linux.orig/kernel/lockdep.c
+++ linux/kernel/lockdep.c
@@ -2654,10 +2654,15 @@ static void check_flags(unsigned long fl
 	if (!debug_locks)
 		return;
 
-	if (irqs_disabled_flags(flags))
-		DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled);
-	else
-		DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled);
+	if (irqs_disabled_flags(flags)) {
+		if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+			printk("possible reason: unannotated irqs-off.\n");
+		}
+	} else {
+		if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+			printk("possible reason: unannotated irqs-on.\n");
+		}
+	}
 
 	/*
 	 * We dont accurately track softirq state in e.g.
Index: linux/kernel/printk.c
===================================================================
--- linux.orig/kernel/printk.c
+++ linux/kernel/printk.c
@@ -573,11 +573,6 @@ static int __init printk_time_setup(char
 
 __setup("time", printk_time_setup);
 
-__attribute__((weak)) unsigned long long printk_clock(void)
-{
-	return sched_clock();
-}
-
 /* Check if we have any console registered that can be called early in boot. */
 static int have_callable_console(void)
 {
@@ -628,30 +623,57 @@ asmlinkage int printk(const char *fmt, ...
 /* cpu currently holding logbuf_lock */
 static volatile unsigned int printk_cpu = UINT_MAX;
 
+const char printk_recursion_bug_msg [] =
+		KERN_CRIT "BUG: recent printk recursion!\n";
+static int printk_recursion_bug;
+
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
+	static int log_level_unknown = 1;
+	static char printk_buf[1024];
+
 	unsigned long flags;
-	int printed_len;
+	int printed_len = 0;
+	int this_cpu;
 	char *p;
-	static char printk_buf[1024];
-	static int log_level_unknown = 1;
 
 	boot_delay_msec();
 
 	preempt_disable();
-	if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
-		/* If a crash is occurring during printk() on this CPU,
-		 * make sure we can't deadlock */
-		zap_locks();
-
 	/* This stops the holder of console_sem just where we want him */
 	raw_local_irq_save(flags);
+	this_cpu = smp_processor_id();
+
+	/*
+	 * Ouch, printk recursed into itself!
+	 */
+	if (unlikely(printk_cpu == this_cpu)) {
+		/*
+		 * If a crash is occurring during printk() on this CPU,
+		 * then try to get the crash message out but make sure
+		 * we can't deadlock. Otherwise just return to avoid the
+		 * recursion and return - but flag the recursion so that
+		 * it can be printed at the next appropriate moment:
+		 */
+		if (!oops_in_progress) {
+			printk_recursion_bug = 1;
+			goto out_restore_irqs;
+		}
+		zap_locks();
+	}
+
 	lockdep_off();
 	spin_lock(&logbuf_lock);
-	printk_cpu = smp_processor_id();
+	printk_cpu = this_cpu;
 
+	if (printk_recursion_bug) {
+		printk_recursion_bug = 0;
+		strcpy(printk_buf, printk_recursion_bug_msg);
+		printed_len = sizeof(printk_recursion_bug_msg);
+	}
 	/* Emit the output into the temporary buffer */
-	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+	printed_len += vscnprintf(printk_buf + printed_len,
+				  sizeof(printk_buf), fmt, args);
 
 	/*
	 * Copy the output into log_buf. If the caller didn't provide
@@ -680,7 +702,11 @@ asmlinkage int vprintk(const char *fmt,
 					loglev_char = default_message_loglevel
 						+ '0';
 				}
-				t = printk_clock();
+				if (panic_timeout) {
+					panic_timeout = 0;
+					printk("recurse!\n");
+				}
+				t = cpu_clock(printk_cpu);
 				nanosec_rem = do_div(t, 1000000000);
 				tlen = sprintf(tbuf,
 						"<%c>[%5lu.%06lu] ",
@@ -744,6 +770,7 @@ asmlinkage int vprintk(const char *fmt,
 		printk_cpu = UINT_MAX;
 		spin_unlock(&logbuf_lock);
 		lockdep_on();
+out_restore_irqs:
 		raw_local_irq_restore(flags);
 	}
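The vprintk() rework above handles printk recursion on the same CPU by
flagging it and bailing out instead of deadlocking on logbuf_lock, then
emitting a one-line notice on the next regular printk. The same owner-check
plus deferred-flag pattern, reduced to a single-threaded user-space sketch
(all names here are made up for illustration, this is not the kernel code):

	#include <stdio.h>
	#include <string.h>

	static int log_busy;		/* stands in for "printk_cpu == this_cpu" */
	static int recursion_flagged;	/* like printk_recursion_bug */

	static void do_log(const char *msg);

	/* something the logger calls internally that may itself want to log */
	static void format_helper(const char *msg, char *out, size_t len)
	{
		if (strlen(msg) > 16)
			do_log("helper: message is long");	/* recursion! */
		snprintf(out, len, "[log] %s", msg);
	}

	static void do_log(const char *msg)
	{
		char buf[128];

		if (log_busy) {
			/* already inside the logger: don't re-enter, just flag it */
			recursion_flagged = 1;
			return;
		}
		log_busy = 1;

		if (recursion_flagged) {
			recursion_flagged = 0;
			puts("[log] BUG: recent log recursion!");
		}
		format_helper(msg, buf, sizeof(buf));
		puts(buf);

		log_busy = 0;
	}

	int main(void)
	{
		do_log("short message");
		do_log("a message long enough to trigger the helper");
		do_log("next call prints the deferred recursion notice");
		return 0;
	}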
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -488,7 +488,12 @@ unsigned long long cpu_clock(int cpu)
 
 	local_irq_save(flags);
 	rq = cpu_rq(cpu);
-	update_rq_clock(rq);
+	/*
+	 * Only call sched_clock() if the scheduler has already been
+	 * initialized (some code might call cpu_clock() very early):
+	 */
+	if (rq->idle)
+		update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
Index: linux/kernel/sched_fair.c
===================================================================
--- linux.orig/kernel/sched_fair.c
+++ linux/kernel/sched_fair.c
@@ -511,8 +511,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
-				task_of(se)->policy != SCHED_BATCH)
+		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
 			vruntime -= sysctl_sched_latency;
 
 		/* ensure we never gain time by being placed backwards. */
Index: linux/kernel/time/clockevents.c
===================================================================
--- linux.orig/kernel/time/clockevents.c
+++ linux/kernel/time/clockevents.c
@@ -78,6 +78,11 @@ int clockevents_program_event(struct clo
 	unsigned long long clc;
 	int64_t delta;
 
+	if (unlikely(expires.tv64 < 0)) {
+		WARN_ON_ONCE(1);
+		return -ETIME;
+	}
+
 	delta = ktime_to_ns(ktime_sub(expires, now));
 
 	if (delta <= 0)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/