Background:
Local APIC timer stops functioning when CPU is in C3 state. As a
result the local APIC timer interrupt will fire at uncertain times, depending
on how long we spend in C3 state. And this has two side effects
* Idle balancing will not happen as we expect it to.
* Kernel statistics for idle time will not be proper (as we get less LAPIC
interrupts when we are idle). This can result in confusing other parts of
kernel (like ondemand cpufreq governor) which depends on this idle stats.
Proposed Fix:
Attached is a prototype patch, that tries to eliminate the dependency on
local APIC timer for update_process_times(). The patch gets rid of Local APIC
timer altogether. We use the timer interrupt (IRQ 0) configured in
broadcast mode in IOAPIC instead (Doesn't work with 8259).
As changing anything related to basic timer interrupt is a little bit risky,
I have a boot parameter currently ("useapictimer") to switch back to original
local APIC timer way of doing things.
This may seem like a overkill to solve this particular problem. But, I feel
it simplifies things and will have other advantages:
* Should help dynamick tick as one has to change only global timer interrupt
freq with varying jiffies.
* Reduces one interrupt per jiffy.
* One less interrupt source to worry about.
The patch handles i386 and x86-64.
BEFORE:
/proc/interrupts at random time
lcoyote1-32-SLES9:~ # cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3
0: 32615 32194 32159 32166 IO-APIC-edge timer
:
:
LOC: 128584 128699 128585 128699
:
AFTER:
/proc/interrupts at random time
lcoyote1-32-SLES9:~ # cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3
0: 719237 718797 718913 718911 IO-APIC-edge timer
:
:
LOC: 0 0 0 0
:
TBD:
* This is tested in normal i386 and x86-64 hardware. I think that this scheme
should work on NUMAQ and other subarchitectures as well. But, haven't really
tested it on such hardware.
Comments/concerns?
Thanks,
Venki
Signed-off-by: Venkatesh Pallipadi <[email protected]>
diff -purN linux-2.6.12-rc2-mm3/arch/i386/kernel/apic.c linux-2.6.12-rc2-mm3-new/arch/i386/kernel/apic.c
--- linux-2.6.12-rc2-mm3/arch/i386/kernel/apic.c 2005-04-30 07:20:19.500689424 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/i386/kernel/apic.c 2005-04-30 07:24:37.064533768 -0700
@@ -707,6 +707,13 @@ static void apic_pm_activate(void) { }
* Original code written by Keir Fraser.
*/
+static __init int setup_apictimer(char *str)
+{
+ using_apic_timer = 1;
+ return 0;
+}
+__setup("useapictimer", setup_apictimer);
+
static int __init apic_set_verbosity(char *str)
{
if (strcmp("debug", str) == 0)
@@ -1051,8 +1058,19 @@ static unsigned int calibration_result;
void __init setup_boot_APIC_clock(void)
{
+ /*
+ * Special case: If we were not able to setup IOAPIC timer interrupt
+ * to broadcast mode on an SMP capable system, then we have to use
+ * local apic timer...
+ */
+ if (!using_apic_timer && !timer_broadcast && (num_possible_cpus() > 1))
+ using_apic_timer = 1;
+
+ if (!using_apic_timer) {
+ apic_printk(APIC_VERBOSE, "Disabling APIC timer\n");
+ return;
+ }
apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
- using_apic_timer = 1;
local_irq_disable();
@@ -1154,9 +1172,7 @@ inline void smp_local_timer_interrupt(st
per_cpu(prof_counter, cpu);
}
-#ifdef CONFIG_SMP
update_process_times(user_mode(regs));
-#endif
}
/*
diff -purN linux-2.6.12-rc2-mm3/arch/i386/kernel/io_apic.c linux-2.6.12-rc2-mm3-new/arch/i386/kernel/io_apic.c
--- linux-2.6.12-rc2-mm3/arch/i386/kernel/io_apic.c 2005-04-30 07:20:19.514687296 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/i386/kernel/io_apic.c 2005-04-30 07:24:37.066533464 -0700
@@ -84,6 +84,8 @@ int vector_irq[NR_VECTORS] = { [0 ... NR
#define vector_to_irq(vector) (vector)
#endif
+int timer_broadcast;
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -221,6 +223,21 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
+static void __init setup_IO_APIC_timer_broadcast(int pin)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int*)&entry) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+ *(((int*)&entry) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+ entry.delivery_mode = dest_Fixed;
+ entry.dest.logical.logical_dest = 0xff;
+ io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+ io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
{
unsigned long flags;
@@ -228,6 +245,9 @@ static void set_ioapic_affinity_irq(unsi
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int apicid_value;
+ if (irq_desc[irq].status & IRQ_PER_CPU)
+ return;
+
apicid_value = cpu_mask_to_apicid(cpumask);
/* Prepare to do the io_apic_write */
apicid_value = apicid_value << 24;
@@ -2207,6 +2227,11 @@ static inline void check_timer(void)
setup_nmi();
enable_8259A_irq(0);
}
+ if (!using_apic_timer) {
+ timer_broadcast = 1;
+ irq_desc[0].status |= IRQ_PER_CPU;
+ setup_IO_APIC_timer_broadcast(pin1);
+ }
return;
}
clear_IO_APIC_pin(0, pin1);
diff -purN linux-2.6.12-rc2-mm3/arch/i386/kernel/nmi.c linux-2.6.12-rc2-mm3-new/arch/i386/kernel/nmi.c
--- linux-2.6.12-rc2-mm3/arch/i386/kernel/nmi.c 2005-04-30 07:20:19.520686384 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/i386/kernel/nmi.c 2005-04-30 07:25:43.341458144 -0700
@@ -504,7 +504,11 @@ void nmi_watchdog_tick (struct pt_regs *
*/
int sum, cpu = smp_processor_id();
- sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+ if (using_apic_timer)
+ sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+ else
+ sum = kstat_cpu(cpu).irqs[0];
+
#ifdef CONFIG_KGDB
if (!in_kgdb(regs) && last_irq_sums[cpu] == sum) {
diff -purN linux-2.6.12-rc2-mm3/arch/i386/kernel/time.c linux-2.6.12-rc2-mm3-new/arch/i386/kernel/time.c
--- linux-2.6.12-rc2-mm3/arch/i386/kernel/time.c 2005-04-30 07:20:19.526685472 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/i386/kernel/time.c 2005-04-30 07:24:37.067533312 -0700
@@ -297,14 +297,19 @@ irqreturn_t timer_interrupt(int irq, voi
* CPU. We need to avoid to SMP race with it. NOTE: we don' t need
* the irq version of write_lock because as just said we have irq
* locally disabled. -arca
+ * When timer interrupt is broadcast CPU0 becomes our timekeeper CPU
+ * Side effect: CPU0 cannot be hot added/removed
*/
- write_seqlock(&xtime_lock);
+ if (using_apic_timer || smp_processor_id() == 0) {
+ write_seqlock(&xtime_lock);
+ cur_timer->mark_offset();
+ do_timer_interrupt(irq, NULL, regs);
+ write_sequnlock(&xtime_lock);
+ }
- cur_timer->mark_offset();
-
- do_timer_interrupt(irq, NULL, regs);
+ /* We don't need to grab xtime lock to handle per cpu schedule, etc */
+ do_timer_interrupt_hook_percpu(regs);
- write_sequnlock(&xtime_lock);
return IRQ_HANDLED;
}
diff -purN linux-2.6.12-rc2-mm3/arch/x86_64/kernel/apic.c linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/apic.c
--- linux-2.6.12-rc2-mm3/arch/x86_64/kernel/apic.c 2005-04-30 07:20:19.631669512 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/apic.c 2005-04-30 07:24:37.068533160 -0700
@@ -36,8 +36,6 @@
int apic_verbosity;
-int disable_apic_timer __initdata;
-
/* Using APIC to generate smp_local_timer_interrupt? */
int using_apic_timer = 0;
@@ -790,13 +788,20 @@ static unsigned int calibration_result;
void __init setup_boot_APIC_clock (void)
{
- if (disable_apic_timer) {
+ /*
+ * Special case: If we were not able to setup IOAPIC timer interrupt
+ * to broadcast mode on an SMP capable system, then we have to use
+ * local apic timer...
+ */
+ if (!using_apic_timer && !timer_broadcast && (num_possible_cpus() > 1))
+ using_apic_timer = 1;
+
+ if (!using_apic_timer) {
printk(KERN_INFO "Disabling APIC timer\n");
return;
}
printk(KERN_INFO "Using local APIC timer interrupts.\n");
- using_apic_timer = 1;
local_irq_disable();
@@ -899,9 +904,7 @@ void smp_local_timer_interrupt(struct pt
per_cpu(prof_counter, cpu);
}
-#ifdef CONFIG_SMP
update_process_times(user_mode(regs));
-#endif
}
/*
@@ -1108,9 +1111,9 @@ static __init int setup_nolapic(char *st
return 0;
}
-static __init int setup_noapictimer(char *str)
+static __init int setup_apictimer(char *str)
{
- disable_apic_timer = 1;
+ using_apic_timer = 1;
return 0;
}
@@ -1119,6 +1122,6 @@ static __init int setup_noapictimer(char
__setup("disableapic", setup_disableapic);
__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
-__setup("noapictimer", setup_noapictimer);
+__setup("useapictimer", setup_apictimer);
/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff -purN linux-2.6.12-rc2-mm3/arch/x86_64/kernel/io_apic.c linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/io_apic.c
--- linux-2.6.12-rc2-mm3/arch/x86_64/kernel/io_apic.c 2005-04-30 07:20:19.636668752 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/io_apic.c 2005-04-30 07:24:37.070532856 -0700
@@ -75,6 +75,8 @@ int vector_irq[NR_VECTORS] = { [0 ... NR
#define vector_to_irq(vector) (vector)
#endif
+int timer_broadcast;
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -179,6 +181,21 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
+static void __init setup_IO_APIC_timer_broadcast(int pin)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int*)&entry) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+ *(((int*)&entry) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+ entry.delivery_mode = dest_Fixed;
+ entry.dest.logical.logical_dest = 0xff;
+ io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+ io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
* specific CPU-side IRQs.
@@ -1349,6 +1366,9 @@ static void set_ioapic_affinity_irq(unsi
unsigned long flags;
unsigned int dest;
+ if (irq_desc[irq].status & IRQ_PER_CPU)
+ return;
+
dest = cpu_mask_to_apicid(mask);
/*
@@ -1640,6 +1660,11 @@ static inline void check_timer(void)
setup_nmi();
enable_8259A_irq(0);
}
+ if (!using_apic_timer) {
+ timer_broadcast = 1;
+ irq_desc[0].status |= IRQ_PER_CPU;
+ setup_IO_APIC_timer_broadcast(pin1);
+ }
return;
}
clear_IO_APIC_pin(0, pin1);
diff -purN linux-2.6.12-rc2-mm3/arch/x86_64/kernel/nmi.c linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/nmi.c
--- linux-2.6.12-rc2-mm3/arch/x86_64/kernel/nmi.c 2005-04-30 07:20:19.643667688 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/nmi.c 2005-04-30 07:24:37.070532856 -0700
@@ -389,7 +389,11 @@ void nmi_watchdog_tick (struct pt_regs *
int sum, cpu;
cpu = safe_smp_processor_id();
- sum = read_pda(apic_timer_irqs);
+ if (using_apic_timer)
+ sum = read_pda(apic_timer_irqs);
+ else
+ sum = kstat_cpu(cpu).irqs[0];
+
if (last_irq_sums[cpu] == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
diff -purN linux-2.6.12-rc2-mm3/arch/x86_64/kernel/time.c linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/time.c
--- linux-2.6.12-rc2-mm3/arch/x86_64/kernel/time.c 2005-04-30 07:20:19.650666624 -0700
+++ linux-2.6.12-rc2-mm3-new/arch/x86_64/kernel/time.c 2005-04-30 07:24:37.071532704 -0700
@@ -358,7 +358,7 @@ static noinline void handle_lost_ticks(i
#endif
}
-static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
static unsigned long rtc_update = 0;
unsigned long tsc;
@@ -423,27 +423,10 @@ static irqreturn_t timer_interrupt(int i
jiffies += lost;
}
-/*
- * Do the timer stuff.
- */
-
+ /*
+ * Do the timer stuff.
+ */
do_timer(regs);
-#ifndef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
-
-/*
- * In the SMP case we use the local APIC timer interrupt to do the profiling,
- * except when we simulate SMP mode on a uniprocessor system, in that case we
- * have to call the local interrupt handler.
- */
-
-#ifndef CONFIG_X86_LOCAL_APIC
- profile_tick(CPU_PROFILING, regs);
-#else
- if (!using_apic_timer)
- smp_local_timer_interrupt(regs);
-#endif
/*
* If we have an externally synchronized Linux clock, then update CMOS clock
@@ -461,6 +444,31 @@ static irqreturn_t timer_interrupt(int i
write_sequnlock(&xtime_lock);
+ return;
+}
+
+static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+ /*
+ * CPU0 becomes our timekeeper CPU
+ * Side effect: CPU0 cannot be hot added/removed
+ */
+ if (using_apic_timer || smp_processor_id() == 0)
+ do_timer_interrupt(irq, dev_id, regs);
+
+ /*
+ * In case we are using local APIC timer interrupt these calls
+ * will be done there.
+ */
+#ifndef CONFIG_X86_LOCAL_APIC
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
+#else
+ if (!using_apic_timer) {
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
+ }
+#endif
return IRQ_HANDLED;
}
diff -purN linux-2.6.12-rc2-mm3/include/asm-i386/io_apic.h linux-2.6.12-rc2-mm3-new/include/asm-i386/io_apic.h
--- linux-2.6.12-rc2-mm3/include/asm-i386/io_apic.h 2005-03-01 23:38:13.000000000 -0800
+++ linux-2.6.12-rc2-mm3-new/include/asm-i386/io_apic.h 2005-04-30 07:24:37.072532552 -0700
@@ -13,6 +13,8 @@
#ifdef CONFIG_X86_IO_APIC
+extern int timer_broadcast;
+
#ifdef CONFIG_PCI_MSI
static inline int use_pci_vector(void) {return 1;}
static inline void disable_edge_ioapic_vector(unsigned int vector) { }
diff -purN linux-2.6.12-rc2-mm3/include/asm-i386/mach-default/do_timer.h linux-2.6.12-rc2-mm3-new/include/asm-i386/mach-default/do_timer.h
--- linux-2.6.12-rc2-mm3/include/asm-i386/mach-default/do_timer.h 2005-03-01 23:38:26.000000000 -0800
+++ linux-2.6.12-rc2-mm3-new/include/asm-i386/mach-default/do_timer.h 2005-04-30 07:24:37.072532552 -0700
@@ -16,23 +16,36 @@
static inline void do_timer_interrupt_hook(struct pt_regs *regs)
{
do_timer(regs);
-#ifndef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
-/*
- * In the SMP case we use the local APIC timer interrupt to do the
- * profiling, except when we simulate SMP mode on a uniprocessor
- * system, in that case we have to call the local interrupt handler.
- */
+}
+
+
+/**
+ * do_timer_interrupt_hook_percpu - hook into timer tick for each cpu
+ * @regs: standard registers from interrupt
+ *
+ * Description:
+ * It's primary purpose is to allow architectures that don't use
+ * individual per CPU clocks (like the CPU APICs supply) to handle
+ * timer interrupt as a means of triggering reschedules etc.
+ **/
+
+static inline void do_timer_interrupt_hook_percpu(struct pt_regs *regs)
+{
+ /*
+ * In case we are using local APIC timer interrupt these calls
+ * will be done there.
+ */
#ifndef CONFIG_X86_LOCAL_APIC
+ update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING, regs);
#else
- if (!using_apic_timer)
- smp_local_timer_interrupt(regs);
+ if (!using_apic_timer) {
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
+ }
#endif
}
-
/* you can safely undefine this if you don't have the Neptune chipset */
#define BUGGY_NEPTUN_TIMER
diff -purN linux-2.6.12-rc2-mm3/include/asm-i386/mach-visws/do_timer.h linux-2.6.12-rc2-mm3-new/include/asm-i386/mach-visws/do_timer.h
--- linux-2.6.12-rc2-mm3/include/asm-i386/mach-visws/do_timer.h 2005-03-01 23:37:53.000000000 -0800
+++ linux-2.6.12-rc2-mm3-new/include/asm-i386/mach-visws/do_timer.h 2005-04-30 07:24:37.072532552 -0700
@@ -9,15 +9,13 @@ static inline void do_timer_interrupt_ho
co_cpu_write(CO_CPU_STAT,co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR);
do_timer(regs);
-#ifndef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
/*
* In the SMP case we use the local APIC timer interrupt to do the
* profiling, except when we simulate SMP mode on a uniprocessor
* system, in that case we have to call the local interrupt handler.
*/
#ifndef CONFIG_X86_LOCAL_APIC
+ update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING, regs);
#else
if (!using_apic_timer)
@@ -25,6 +23,17 @@ static inline void do_timer_interrupt_ho
#endif
}
+/**
+ * do_timer_interrupt_hook_percpu - hook into timer tick for each cpu
+ * @regs: standard registers from interrupt
+ *
+ * Description:
+ * It's primary purpose is to allow architectures that don't use
+ * individual per CPU clocks (like the CPU APICs supply) to handle
+ * timer interrupt as a means of triggering reschedules etc.
+ **/
+static inline void do_timer_interrupt_hook_percpu(struct pt_regs *regs) {}
+
static inline int do_timer_overflow(int count)
{
int i;
diff -purN linux-2.6.12-rc2-mm3/include/asm-i386/mach-voyager/do_timer.h linux-2.6.12-rc2-mm3-new/include/asm-i386/mach-voyager/do_timer.h
--- linux-2.6.12-rc2-mm3/include/asm-i386/mach-voyager/do_timer.h 2005-03-01 23:38:17.000000000 -0800
+++ linux-2.6.12-rc2-mm3-new/include/asm-i386/mach-voyager/do_timer.h 2005-04-30 07:24:37.073532400 -0700
@@ -11,6 +11,17 @@ static inline void do_timer_interrupt_ho
voyager_timer_interrupt(regs);
}
+/**
+ * do_timer_interrupt_hook_percpu - hook into timer tick for each cpu
+ * @regs: standard registers from interrupt
+ *
+ * Description:
+ * It's primary purpose is to allow architectures that don't use
+ * individual per CPU clocks (like the CPU APICs supply) to handle
+ * timer interrupt as a means of triggering reschedules etc.
+ **/
+static inline void do_timer_interrupt_hook_percpu(struct pt_regs *regs) {}
+
static inline int do_timer_overflow(int count)
{
/* can't read the ISR, just assume 1 tick
diff -purN linux-2.6.12-rc2-mm3/include/asm-x86_64/io_apic.h linux-2.6.12-rc2-mm3-new/include/asm-x86_64/io_apic.h
--- linux-2.6.12-rc2-mm3/include/asm-x86_64/io_apic.h 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.12-rc2-mm3-new/include/asm-x86_64/io_apic.h 2005-04-30 07:24:37.073532400 -0700
@@ -13,6 +13,8 @@
#ifdef CONFIG_X86_IO_APIC
+extern int timer_broadcast;
+
#ifdef CONFIG_PCI_MSI
static inline int use_pci_vector(void) {return 1;}
static inline void disable_edge_ioapic_vector(unsigned int vector) { }
On Fri, 29 Apr 2005, Venkatesh Pallipadi wrote:
>
> Background:
> Local APIC timer stops functioning when CPU is in C3 state. As a
> result the local APIC timer interrupt will fire at uncertain times, depending
> on how long we spend in C3 state. And this has two side effects
> * Idle balancing will not happen as we expect it to.
> * Kernel statistics for idle time will not be proper (as we get less LAPIC
> interrupts when we are idle). This can result in confusing other parts of
> kernel (like ondemand cpufreq governor) which depends on this idle stats.
>
>
> Proposed Fix:
> Attached is a prototype patch, that tries to eliminate the dependency on
> local APIC timer for update_process_times(). The patch gets rid of Local APIC
> timer altogether. We use the timer interrupt (IRQ 0) configured in
> broadcast mode in IOAPIC instead (Doesn't work with 8259).
> As changing anything related to basic timer interrupt is a little bit risky,
> I have a boot parameter currently ("useapictimer") to switch back to original
> local APIC timer way of doing things.
This all looks like it'll contend on irq0 related locks really badly, have
you profiled this?
Thanks,
Zwane
On Fri, 29 Apr 2005, Zwane Mwaikambo wrote:
> On Fri, 29 Apr 2005, Venkatesh Pallipadi wrote:
>
> >
> > Background:
> > Local APIC timer stops functioning when CPU is in C3 state. As a
> > result the local APIC timer interrupt will fire at uncertain times, depending
> > on how long we spend in C3 state. And this has two side effects
> > * Idle balancing will not happen as we expect it to.
> > * Kernel statistics for idle time will not be proper (as we get less LAPIC
> > interrupts when we are idle). This can result in confusing other parts of
> > kernel (like ondemand cpufreq governor) which depends on this idle stats.
> >
> >
> > Proposed Fix:
> > Attached is a prototype patch, that tries to eliminate the dependency on
> > local APIC timer for update_process_times(). The patch gets rid of Local APIC
> > timer altogether. We use the timer interrupt (IRQ 0) configured in
> > broadcast mode in IOAPIC instead (Doesn't work with 8259).
> > As changing anything related to basic timer interrupt is a little bit risky,
> > I have a boot parameter currently ("useapictimer") to switch back to original
> > local APIC timer way of doing things.
>
> This all looks like it'll contend on irq0 related locks really badly, have
> you profiled this?
Hmm ok i see you have IRQ_PER_CPU, so please ignore my statement.
Thanks,
Zwane
On Fri, 29 Apr 2005, Venkatesh Pallipadi wrote:
> Proposed Fix:
> Attached is a prototype patch, that tries to eliminate the dependency on
> local APIC timer for update_process_times(). The patch gets rid of Local APIC
> timer altogether. We use the timer interrupt (IRQ 0) configured in
> broadcast mode in IOAPIC instead (Doesn't work with 8259).
> As changing anything related to basic timer interrupt is a little bit risky,
> I have a boot parameter currently ("useapictimer") to switch back to original
> local APIC timer way of doing things.
I'm rather reluctant to advocate the broadcast scheme as i can see it
breaking on a lot of systems, e.g. SMP systems which use i8259 (as you
noted), IBM x440 and ES7000. If anything the default mode should be APIC
timer and have a parameter to disable it. Regarding things like variable
timer ticks, reprogramming the PIT is slow, and using it extensively for
such sounds like a step in the wrong direction. Is this feature/bug going
to proliferate amongst newer processor local APICs?
Thanks,
Zwane
Venkatesh Pallipadi <[email protected]> wrote:
>
> Background:
> Local APIC timer stops functioning when CPU is in C3 state. As a
> result the local APIC timer interrupt will fire at uncertain times, depending
> on how long we spend in C3 state. And this has two side effects
> * Idle balancing will not happen as we expect it to.
> * Kernel statistics for idle time will not be proper (as we get less LAPIC
> interrupts when we are idle). This can result in confusing other parts of
> kernel (like ondemand cpufreq governor) which depends on this idle stats.
>
>
> Proposed Fix:
> Attached is a prototype patch, that tries to eliminate the dependency on
> local APIC timer for update_process_times(). The patch gets rid of Local APIC
> timer altogether. We use the timer interrupt (IRQ 0) configured in
> broadcast mode in IOAPIC instead (Doesn't work with 8259).
> As changing anything related to basic timer interrupt is a little bit risky,
> I have a boot parameter currently ("useapictimer") to switch back to original
> local APIC timer way of doing things.
>
> This may seem like a overkill to solve this particular problem. But, I feel
> it simplifies things and will have other advantages:
> * Should help dynamick tick as one has to change only global timer interrupt
> freq with varying jiffies.
> * Reduces one interrupt per jiffy.
> * One less interrupt source to worry about.
The patch (at least, as I merged it) goes into a ghastly death spiral early
in boot.
Serial console says:
Initializing CPU#1
Calibrating delay using timer specific routine.. 5615.95 BogoMIPS (lpj=2807978)
CPU: Trace cache: 12K uops, L1 D cache: 8K
CPU: L2 cache: 512K
CPU: Physical Processor ID: 0
CPU1: Intel Pentium 4 (Northwood) stepping 07
Total of 2 processors activated (11238.26 BogoMIPS).
ENABLING IO-APIC IRQs
..TIMER: vector=0x31 pin1=2 pin2=-1
checking TSC synchronization across 2 CPUs: passed.
Brought up 2 CPUs
Unable to handle kernel NULL pointer dereference
which isn't very helpful.
tty output is at http://www.zip.com.au/~akpm/linux/patches/stuff/dsc02506.jpg
which is also less that totally useful.
There's waaaaaaaaay too much low-level x86 stuff happening at present. We
need to settle it down, go more slowly, take more care and test things
better, please. Next -mm has already been delayed by two days due to my
having to chase down all the bugs people have been sending me.
Here's what I merged:
From: Venkatesh Pallipadi <[email protected]>
Background:
Local APIC timer stops functioning when CPU is in C3 state. As a
result the local APIC timer interrupt will fire at uncertain times, depending
on how long we spend in C3 state. And this has two side effects
* Idle balancing will not happen as we expect it to.
* Kernel statistics for idle time will not be proper (as we get less LAPIC
interrupts when we are idle). This can result in confusing other parts of
kernel (like ondemand cpufreq governor) which depends on this idle stats.
Proposed Fix:
Attached is a prototype patch, that tries to eliminate the dependency on
local APIC timer for update_process_times(). The patch gets rid of Local APIC
timer altogether. We use the timer interrupt (IRQ 0) configured in
broadcast mode in IOAPIC instead (Doesn't work with 8259).
As changing anything related to basic timer interrupt is a little bit risky,
I have a boot parameter currently ("useapictimer") to switch back to original
local APIC timer way of doing things.
This may seem like a overkill to solve this particular problem. But, I feel
it simplifies things and will have other advantages:
* Should help dynamick tick as one has to change only global timer interrupt
freq with varying jiffies.
* Reduces one interrupt per jiffy.
* One less interrupt source to worry about.
The patch handles i386 and x86-64.
BEFORE:
/proc/interrupts at random time
lcoyote1-32-SLES9:~ # cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3
0: 32615 32194 32159 32166 IO-APIC-edge timer
:
:
LOC: 128584 128699 128585 128699
:
AFTER:
/proc/interrupts at random time
lcoyote1-32-SLES9:~ # cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3
0: 719237 718797 718913 718911 IO-APIC-edge timer
:
:
LOC: 0 0 0 0
:
TBD:
* This is tested in normal i386 and x86-64 hardware. I think that this scheme
should work on NUMAQ and other subarchitectures as well. But, haven't really
tested it on such hardware.
Signed-off-by: Venkatesh Pallipadi <[email protected]>
Zwane:
I'm rather reluctant to advocate the broadcast scheme as I can see it
breaking on a lot of systems, e.g. SMP systems which use i8259 (as you
noted), IBM x440 and ES7000. If anything the default mode should be APIC
timer and have a parameter to disable it. Regarding things like variable
timer ticks, reprogramming the PIT is slow, and using it extensively for such
sounds like a step in the wrong direction. Is this feature/bug going to
proliferate amongst newer processor local APICs?
Signed-off-by: Andrew Morton <[email protected]>
---
arch/i386/kernel/apic.c | 22 +++++++++++--
arch/i386/kernel/io_apic.c | 25 +++++++++++++++
arch/i386/kernel/nmi.c | 6 +++
arch/i386/kernel/time.c | 15 ++++++---
arch/x86_64/kernel/apic.c | 21 +++++++------
arch/x86_64/kernel/io_apic.c | 25 +++++++++++++++
arch/x86_64/kernel/nmi.c | 6 +++
arch/x86_64/kernel/time.c | 50 +++++++++++++++++--------------
include/asm-i386/io_apic.h | 2 +
include/asm-i386/mach-default/do_timer.h | 35 ++++++++++++++-------
include/asm-i386/mach-visws/do_timer.h | 15 +++++++--
include/asm-i386/mach-voyager/do_timer.h | 11 ++++++
include/asm-x86_64/io_apic.h | 2 +
13 files changed, 181 insertions(+), 54 deletions(-)
diff -puN arch/i386/kernel/apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/i386/kernel/apic.c
--- 25/arch/i386/kernel/apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.433620256 -0700
+++ 25-akpm/arch/i386/kernel/apic.c 2005-04-29 18:32:14.455616912 -0700
@@ -685,6 +685,13 @@ static int __init lapic_enable(char *str
}
__setup("lapic", lapic_enable);
+static __init int setup_apictimer(char *str)
+{
+ using_apic_timer = 1;
+ return 0;
+}
+__setup("useapictimer", setup_apictimer);
+
static int __init apic_set_verbosity(char *str)
{
if (strcmp("debug", str) == 0)
@@ -1029,8 +1036,19 @@ static unsigned int calibration_result;
void __init setup_boot_APIC_clock(void)
{
+ /*
+ * Special case: If we were not able to setup IOAPIC timer interrupt
+ * to broadcast mode on an SMP capable system, then we have to use
+ * local apic timer...
+ */
+ if (!using_apic_timer && !timer_broadcast && (num_possible_cpus() > 1))
+ using_apic_timer = 1;
+
+ if (!using_apic_timer) {
+ apic_printk(APIC_VERBOSE, "Disabling APIC timer\n");
+ return;
+ }
apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
- using_apic_timer = 1;
local_irq_disable();
@@ -1132,9 +1150,7 @@ inline void smp_local_timer_interrupt(st
per_cpu(prof_counter, cpu);
}
-#ifdef CONFIG_SMP
update_process_times(user_mode(regs));
-#endif
}
/*
diff -puN arch/i386/kernel/io_apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/i386/kernel/io_apic.c
--- 25/arch/i386/kernel/io_apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.435619952 -0700
+++ 25-akpm/arch/i386/kernel/io_apic.c 2005-04-29 18:32:14.458616456 -0700
@@ -84,6 +84,8 @@ int vector_irq[NR_VECTORS] = { [0 ... NR
#define vector_to_irq(vector) (vector)
#endif
+int timer_broadcast;
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -221,6 +223,21 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
+static void __init setup_IO_APIC_timer_broadcast(int pin)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int*)&entry) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+ *(((int*)&entry) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+ entry.delivery_mode = dest_Fixed;
+ entry.dest.logical.logical_dest = 0xff;
+ io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+ io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
{
unsigned long flags;
@@ -235,6 +252,9 @@ static void set_ioapic_affinity_irq(unsi
cpus_and(cpumask, tmp, CPU_MASK_ALL);
+ if (irq_desc[irq].status & IRQ_PER_CPU)
+ return;
+
apicid_value = cpu_mask_to_apicid(cpumask);
/* Prepare to do the io_apic_write */
apicid_value = apicid_value << 24;
@@ -2178,6 +2198,11 @@ static inline void check_timer(void)
setup_nmi();
enable_8259A_irq(0);
}
+ if (!using_apic_timer) {
+ timer_broadcast = 1;
+ irq_desc[0].status |= IRQ_PER_CPU;
+ setup_IO_APIC_timer_broadcast(pin1);
+ }
return;
}
clear_IO_APIC_pin(0, pin1);
diff -puN arch/i386/kernel/nmi.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/i386/kernel/nmi.c
--- 25/arch/i386/kernel/nmi.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.436619800 -0700
+++ 25-akpm/arch/i386/kernel/nmi.c 2005-04-29 18:32:14.458616456 -0700
@@ -486,7 +486,11 @@ void nmi_watchdog_tick (struct pt_regs *
*/
int sum, cpu = smp_processor_id();
- sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+ if (using_apic_timer)
+ sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+ else
+ sum = kstat_cpu(cpu).irqs[0];
+
if (last_irq_sums[cpu] == sum) {
/*
diff -puN arch/i386/kernel/time.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/i386/kernel/time.c
--- 25/arch/i386/kernel/time.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.438619496 -0700
+++ 25-akpm/arch/i386/kernel/time.c 2005-04-29 18:32:14.459616304 -0700
@@ -297,14 +297,19 @@ irqreturn_t timer_interrupt(int irq, voi
* CPU. We need to avoid to SMP race with it. NOTE: we don' t need
* the irq version of write_lock because as just said we have irq
* locally disabled. -arca
+ * When timer interrupt is broadcast CPU0 becomes our timekeeper CPU
+ * Side effect: CPU0 cannot be hot added/removed
*/
- write_seqlock(&xtime_lock);
+ if (using_apic_timer || smp_processor_id() == 0) {
+ write_seqlock(&xtime_lock);
+ cur_timer->mark_offset();
+ do_timer_interrupt(irq, NULL, regs);
+ write_sequnlock(&xtime_lock);
+ }
- cur_timer->mark_offset();
-
- do_timer_interrupt(irq, NULL, regs);
+ /* We don't need to grab xtime lock to handle per cpu schedule, etc */
+ do_timer_interrupt_hook_percpu(regs);
- write_sequnlock(&xtime_lock);
return IRQ_HANDLED;
}
diff -puN arch/x86_64/kernel/apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/x86_64/kernel/apic.c
--- 25/arch/x86_64/kernel/apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.439619344 -0700
+++ 25-akpm/arch/x86_64/kernel/apic.c 2005-04-29 18:32:14.461616000 -0700
@@ -36,8 +36,6 @@
int apic_verbosity;
-int disable_apic_timer __initdata;
-
/* Using APIC to generate smp_local_timer_interrupt? */
int using_apic_timer = 0;
@@ -754,13 +752,20 @@ static unsigned int calibration_result;
void __init setup_boot_APIC_clock (void)
{
- if (disable_apic_timer) {
+ /*
+ * Special case: If we were not able to setup IOAPIC timer interrupt
+ * to broadcast mode on an SMP capable system, then we have to use
+ * local apic timer...
+ */
+ if (!using_apic_timer && !timer_broadcast && (num_possible_cpus() > 1))
+ using_apic_timer = 1;
+
+ if (!using_apic_timer) {
printk(KERN_INFO "Disabling APIC timer\n");
return;
}
printk(KERN_INFO "Using local APIC timer interrupts.\n");
- using_apic_timer = 1;
local_irq_disable();
@@ -863,9 +868,7 @@ void smp_local_timer_interrupt(struct pt
per_cpu(prof_counter, cpu);
}
-#ifdef CONFIG_SMP
update_process_times(user_mode(regs));
-#endif
}
/*
@@ -1072,9 +1075,9 @@ static __init int setup_nolapic(char *st
return 0;
}
-static __init int setup_noapictimer(char *str)
+static __init int setup_apictimer(char *str)
{
- disable_apic_timer = 1;
+ using_apic_timer = 1;
return 0;
}
@@ -1083,6 +1086,6 @@ static __init int setup_noapictimer(char
__setup("disableapic", setup_disableapic);
__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
-__setup("noapictimer", setup_noapictimer);
+__setup("useapictimer", setup_apictimer);
/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff -puN arch/x86_64/kernel/io_apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/x86_64/kernel/io_apic.c
--- 25/arch/x86_64/kernel/io_apic.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.441619040 -0700
+++ 25-akpm/arch/x86_64/kernel/io_apic.c 2005-04-29 18:32:14.462615848 -0700
@@ -109,6 +109,9 @@ static void set_ioapic_affinity_irq(unsi
cpus_and(mask, tmp, CPU_MASK_ALL);
+ if (irq_desc[irq].status & IRQ_PER_CPU)
+ return;
+
dest = cpu_mask_to_apicid(mask);
/*
@@ -128,6 +131,8 @@ static void set_ioapic_affinity_irq(unsi
}
#endif
+int timer_broadcast;
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -212,6 +217,21 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
+static void __init setup_IO_APIC_timer_broadcast(int pin)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int*)&entry) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+ *(((int*)&entry) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+ entry.delivery_mode = dest_Fixed;
+ entry.dest.logical.logical_dest = 0xff;
+ io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+ io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
* specific CPU-side IRQs.
@@ -1630,6 +1650,11 @@ static inline void check_timer(void)
setup_nmi();
enable_8259A_irq(0);
}
+ if (!using_apic_timer) {
+ timer_broadcast = 1;
+ irq_desc[0].status |= IRQ_PER_CPU;
+ setup_IO_APIC_timer_broadcast(pin1);
+ }
return;
}
clear_IO_APIC_pin(0, pin1);
diff -puN arch/x86_64/kernel/nmi.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/x86_64/kernel/nmi.c
--- 25/arch/x86_64/kernel/nmi.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.443618736 -0700
+++ 25-akpm/arch/x86_64/kernel/nmi.c 2005-04-29 18:32:14.463615696 -0700
@@ -384,7 +384,11 @@ void nmi_watchdog_tick (struct pt_regs *
int sum, cpu;
cpu = safe_smp_processor_id();
- sum = read_pda(apic_timer_irqs);
+ if (using_apic_timer)
+ sum = read_pda(apic_timer_irqs);
+ else
+ sum = kstat_cpu(cpu).irqs[0];
+
if (last_irq_sums[cpu] == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
diff -puN arch/x86_64/kernel/time.c~i386-x86-64-eliminate-local-apic-timer-interrupt arch/x86_64/kernel/time.c
--- 25/arch/x86_64/kernel/time.c~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.444618584 -0700
+++ 25-akpm/arch/x86_64/kernel/time.c 2005-04-29 18:32:14.464615544 -0700
@@ -361,7 +361,7 @@ static noinline void handle_lost_ticks(i
#endif
}
-static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
static unsigned long rtc_update = 0;
unsigned long tsc;
@@ -433,27 +433,10 @@ static irqreturn_t timer_interrupt(int i
jiffies += lost;
}
-/*
- * Do the timer stuff.
- */
-
+ /*
+ * Do the timer stuff.
+ */
do_timer(regs);
-#ifndef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
-
-/*
- * In the SMP case we use the local APIC timer interrupt to do the profiling,
- * except when we simulate SMP mode on a uniprocessor system, in that case we
- * have to call the local interrupt handler.
- */
-
-#ifndef CONFIG_X86_LOCAL_APIC
- profile_tick(CPU_PROFILING, regs);
-#else
- if (!using_apic_timer)
- smp_local_timer_interrupt(regs);
-#endif
/*
* If we have an externally synchronized Linux clock, then update CMOS clock
@@ -471,6 +454,31 @@ static irqreturn_t timer_interrupt(int i
write_sequnlock(&xtime_lock);
+ return;
+}
+
+static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+ /*
+ * CPU0 becomes our timekeeper CPU
+ * Side effect: CPU0 cannot be hot added/removed
+ */
+ if (using_apic_timer || smp_processor_id() == 0)
+ do_timer_interrupt(irq, dev_id, regs);
+
+ /*
+ * In case we are using local APIC timer interrupt these calls
+ * will be done there.
+ */
+#ifndef CONFIG_X86_LOCAL_APIC
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
+#else
+ if (!using_apic_timer) {
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
+ }
+#endif
return IRQ_HANDLED;
}
diff -puN include/asm-i386/io_apic.h~i386-x86-64-eliminate-local-apic-timer-interrupt include/asm-i386/io_apic.h
--- 25/include/asm-i386/io_apic.h~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.445618432 -0700
+++ 25-akpm/include/asm-i386/io_apic.h 2005-04-29 18:32:14.465615392 -0700
@@ -13,6 +13,8 @@
#ifdef CONFIG_X86_IO_APIC
+extern int timer_broadcast;
+
#ifdef CONFIG_PCI_MSI
static inline int use_pci_vector(void) {return 1;}
static inline void disable_edge_ioapic_vector(unsigned int vector) { }
diff -puN include/asm-i386/mach-default/do_timer.h~i386-x86-64-eliminate-local-apic-timer-interrupt include/asm-i386/mach-default/do_timer.h
--- 25/include/asm-i386/mach-default/do_timer.h~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.447618128 -0700
+++ 25-akpm/include/asm-i386/mach-default/do_timer.h 2005-04-29 18:32:14.465615392 -0700
@@ -16,23 +16,36 @@
static inline void do_timer_interrupt_hook(struct pt_regs *regs)
{
do_timer(regs);
-#ifndef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
-/*
- * In the SMP case we use the local APIC timer interrupt to do the
- * profiling, except when we simulate SMP mode on a uniprocessor
- * system, in that case we have to call the local interrupt handler.
- */
+}
+
+
+/**
+ * do_timer_interrupt_hook_percpu - hook into timer tick for each cpu
+ * @regs: standard registers from interrupt
+ *
+ * Description:
+ * It's primary purpose is to allow architectures that don't use
+ * individual per CPU clocks (like the CPU APICs supply) to handle
+ * timer interrupt as a means of triggering reschedules etc.
+ **/
+
+static inline void do_timer_interrupt_hook_percpu(struct pt_regs *regs)
+{
+ /*
+ * In case we are using local APIC timer interrupt these calls
+ * will be done there.
+ */
#ifndef CONFIG_X86_LOCAL_APIC
+ update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING, regs);
#else
- if (!using_apic_timer)
- smp_local_timer_interrupt(regs);
+ if (!using_apic_timer) {
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
+ }
#endif
}
-
/* you can safely undefine this if you don't have the Neptune chipset */
#define BUGGY_NEPTUN_TIMER
diff -puN include/asm-i386/mach-visws/do_timer.h~i386-x86-64-eliminate-local-apic-timer-interrupt include/asm-i386/mach-visws/do_timer.h
--- 25/include/asm-i386/mach-visws/do_timer.h~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.448617976 -0700
+++ 25-akpm/include/asm-i386/mach-visws/do_timer.h 2005-04-29 18:32:14.466615240 -0700
@@ -9,15 +9,13 @@ static inline void do_timer_interrupt_ho
co_cpu_write(CO_CPU_STAT,co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR);
do_timer(regs);
-#ifndef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
/*
* In the SMP case we use the local APIC timer interrupt to do the
* profiling, except when we simulate SMP mode on a uniprocessor
* system, in that case we have to call the local interrupt handler.
*/
#ifndef CONFIG_X86_LOCAL_APIC
+ update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING, regs);
#else
if (!using_apic_timer)
@@ -25,6 +23,17 @@ static inline void do_timer_interrupt_ho
#endif
}
+/**
+ * do_timer_interrupt_hook_percpu - hook into timer tick for each cpu
+ * @regs: standard registers from interrupt
+ *
+ * Description:
+ * It's primary purpose is to allow architectures that don't use
+ * individual per CPU clocks (like the CPU APICs supply) to handle
+ * timer interrupt as a means of triggering reschedules etc.
+ **/
+static inline void do_timer_interrupt_hook_percpu(struct pt_regs *regs) {}
+
static inline int do_timer_overflow(int count)
{
int i;
diff -puN include/asm-i386/mach-voyager/do_timer.h~i386-x86-64-eliminate-local-apic-timer-interrupt include/asm-i386/mach-voyager/do_timer.h
--- 25/include/asm-i386/mach-voyager/do_timer.h~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.450617672 -0700
+++ 25-akpm/include/asm-i386/mach-voyager/do_timer.h 2005-04-29 18:32:14.466615240 -0700
@@ -11,6 +11,17 @@ static inline void do_timer_interrupt_ho
voyager_timer_interrupt(regs);
}
+/**
+ * do_timer_interrupt_hook_percpu - hook into timer tick for each cpu
+ * @regs: standard registers from interrupt
+ *
+ * Description:
+ * It's primary purpose is to allow architectures that don't use
+ * individual per CPU clocks (like the CPU APICs supply) to handle
+ * timer interrupt as a means of triggering reschedules etc.
+ **/
+static inline void do_timer_interrupt_hook_percpu(struct pt_regs *regs) {}
+
static inline int do_timer_overflow(int count)
{
/* can't read the ISR, just assume 1 tick
diff -puN include/asm-x86_64/io_apic.h~i386-x86-64-eliminate-local-apic-timer-interrupt include/asm-x86_64/io_apic.h
--- 25/include/asm-x86_64/io_apic.h~i386-x86-64-eliminate-local-apic-timer-interrupt 2005-04-29 18:32:14.451617520 -0700
+++ 25-akpm/include/asm-x86_64/io_apic.h 2005-04-29 18:32:14.466615240 -0700
@@ -13,6 +13,8 @@
#ifdef CONFIG_X86_IO_APIC
+extern int timer_broadcast;
+
#ifdef CONFIG_PCI_MSI
static inline int use_pci_vector(void) {return 1;}
static inline void disable_edge_ioapic_vector(unsigned int vector) { }
_
>-----Original Message-----
>From: Zwane Mwaikambo [mailto:[email protected]]
>Sent: Friday, April 29, 2005 6:13 PM
>To: Pallipadi, Venkatesh
>Cc: Andrew Morton; Linus Torvalds; [email protected];
>linux-kernel; Shah, Rajesh; John Stultz; Andi Kleen; Mallick, Asit K
>Subject: Re: [RFC][PATCH] i386 x86-64 Eliminate Local APIC
>timer interrupt
>
>On Fri, 29 Apr 2005, Venkatesh Pallipadi wrote:
>
>> Proposed Fix:
>> Attached is a prototype patch, that tries to eliminate the
>dependency on
>> local APIC timer for update_process_times(). The patch gets
>rid of Local APIC
>> timer altogether. We use the timer interrupt (IRQ 0) configured in
>> broadcast mode in IOAPIC instead (Doesn't work with 8259).
>> As changing anything related to basic timer interrupt is a
>little bit risky,
>> I have a boot parameter currently ("useapictimer") to switch
>back to original
>> local APIC timer way of doing things.
>
>I'm rather reluctant to advocate the broadcast scheme as i can see it
>breaking on a lot of systems, e.g. SMP systems which use i8259 (as you
>noted), IBM x440 and ES7000. If anything the default mode
>should be APIC
>timer and have a parameter to disable it.
The patch as it is should handle 8259 case using the regular APIC timer.
It only adds broadcast when IOAPIC is used for timer interrupt.
And if broadcast doesn't work on IBM x440 and ES7000, it can easily
be handled by sub-arch, to use local APIC.
> Regarding things like variable
>timer ticks, reprogramming the PIT is slow, and using it
>extensively for
>such sounds like a step in the wrong direction.
Variable tick should come into picture only when system is totally idle
(for a long time). The algorithm that change ticks should handle the
trade-off between frequent HZ interrupt when system is idle and overhead
Of reprogramming PIT/HPET. And variable HZ is already changing PIT if I
Remember correctly. This patch doesn't add any complexity there.
> Is this feature/bug going to proliferate amongst newer processor
> local APICs?
This APIC timer stopping in C3 will affect all CPUs that have C3 or
deeper state.
Although I agree that changing the things like timer interrupt is like
walking on a landmine, given all different kind of hardware present,
in general this seems simplify things related to timer interrupts.
Thanks,
Venki
>-----Original Message-----
>From: Andrew Morton [mailto:[email protected]]
>Sent: Friday, April 29, 2005 7:33 PM
>To: Pallipadi, Venkatesh
>Cc: [email protected]; [email protected];
>[email protected]; Shah, Rajesh;
>[email protected]; [email protected]; Mallick, Asit K
>Subject: Re: [RFC][PATCH] i386 x86-64 Eliminate Local APIC
>timer interrupt
>
>The patch (at least, as I merged it) goes into a ghastly death
>spiral early
>in boot.
>
>Serial console says:
>
>
>Initializing CPU#1
>Calibrating delay using timer specific routine.. 5615.95
>BogoMIPS (lpj=2807978)
>CPU: Trace cache: 12K uops, L1 D cache: 8K
>CPU: L2 cache: 512K
>CPU: Physical Processor ID: 0
>CPU1: Intel Pentium 4 (Northwood) stepping 07
>Total of 2 processors activated (11238.26 BogoMIPS).
>ENABLING IO-APIC IRQs
>..TIMER: vector=0x31 pin1=2 pin2=-1
>checking TSC synchronization across 2 CPUs: passed.
>Brought up 2 CPUs
>Unable to handle kernel NULL pointer dereference
>
>which isn't very helpful.
>
>tty output is at
>http://www.zip.com.au/~akpm/linux/patches/stuff/dsc02506.jpg
>
>which is also less that totally useful.
>
>There's waaaaaaaaay too much low-level x86 stuff happening at
>present. We
>need to settle it down, go more slowly, take more care and test things
>better, please. Next -mm has already been delayed by two days
>due to my
>having to chase down all the bugs people have been sending me.
>
I did test this patch on variety of systems and didn't see any failures.
Probably some other change in mm conflicting with this patch?
Is there way to get the all the patches in mm, so that I can try same
Kernel and reproduce this failure?
I agree though that this patch is very risky and needs some discussion
and lot of testing before it goes into base.
Thanks,
Venki
"Pallipadi, Venkatesh" <[email protected]> wrote:
>
> I did test this patch on variety of systems and didn't see any failures.
OK.
> Probably some other change in mm conflicting with this patch?
Could well be.
> Is there way to get the all the patches in mm, so that I can try same
> Kernel and reproduce this failure?
I'm reluctant to do that, because the -mm lineup is usually a hysterical
pile of crap - you wouldn't believe what people send me. I actually do a
lot of testing, despite appearances ;)
> I agree though that this patch is very risky and needs some discussion
> and lot of testing before it goes into base.
OK.
We need to get the x86 stuff in -mm reviewed, tested, settled down and
merged up before we can move any further forward, I think.
If x86_64 manages to limp to a login prompt I'll put rc3-mm1 up later this
evening.
> On Fri, 29 Apr 2005, Venkatesh Pallipadi wrote:
>
> > Proposed Fix:
> > Attached is a prototype patch, that tries to eliminate the
> dependency
> > on local APIC timer for update_process_times(). The patch
> gets rid of
> > Local APIC timer altogether. We use the timer interrupt (IRQ 0)
> > configured in broadcast mode in IOAPIC instead (Doesn't
> work with 8259).
> > As changing anything related to basic timer interrupt is a
> little bit
> > risky, I have a boot parameter currently ("useapictimer") to switch
> > back to original local APIC timer way of doing things.
>
> I'm rather reluctant to advocate the broadcast scheme as i
> can see it breaking on a lot of systems, e.g. SMP systems
> which use i8259 (as you noted), IBM x440 and ES7000. If
> anything the default mode should be APIC timer and have a
> parameter to disable it. Regarding things like variable timer
> ticks, reprogramming the PIT is slow, and using it
> extensively for such sounds like a step in the wrong
> direction. Is this feature/bug going to proliferate amongst
> newer processor local APICs?
>
> Thanks,
> Zwane
I did preliminary testing of the patch applied to the rc3 on the IA-32
ES7000, and it booted fine, without the useapictimer boot option. As
Zwane pointed out correctly, ES7000 doesn't handle IRQ broadcast. The
kernel by-passed the non-apic timer option (chose pin1 in check_timer)
and came up safely with local APIC timer.
Thanks,
--Natalie
On Fri, Apr 29, 2005 at 05:26:05PM -0700, Venkatesh Pallipadi wrote:
>
> Background:
> Local APIC timer stops functioning when CPU is in C3 state. As a
> result the local APIC timer interrupt will fire at uncertain times, depending
> on how long we spend in C3 state. And this has two side effects
> * Idle balancing will not happen as we expect it to.
> * Kernel statistics for idle time will not be proper (as we get less LAPIC
> interrupts when we are idle). This can result in confusing other parts of
> kernel (like ondemand cpufreq governor) which depends on this idle stats.
This is absolutely incompatible to no timer tick in IDLE - at least
my implementation that allows to keep all CPUs indepently in idle,
not require all CPUs to be idle to turn off the timers.
I had hoped that the hardware was clever enough to keep the
APIC timers working on SMP systems.
This is very unfortunate :-(( It means you will always
waste a lot of power on a partially idle system, because
you cannot keep individual CPUs sleeping for a longer time.
I ran into the problem on AMD systems with C2 state - in
my initial implementation there was only the APIC timer for
any time keeping because it is cheaper to reprogram and
there are a lot of chipsets which miss HPET - and on the
BP only the APIC timer was ticking at all, which did not
work very well. This was avoided by using
PIT/HPET on the BP again. But on SMP systems there is not
much choice on the APs.
> it simplifies things and will have other advantages:
> * Should help dynamick tick as one has to change only global timer interrupt
> freq with varying jiffies.
Huh? On contrary it completely kills really SMP aware dynamic tick
implementations.
I would really prefer to find some other way to solve this
Perhaps if we have enough timer sources in HPET we can use the 3-4
HPET timers on different CPUs. That would work on upto 4 socket
systems (sibling/dual core can be ignored I guess because for
power management purposes they dont really exist)
This would be a problem for the few applications that use HPET
from user space for application timers, but I think power saving
is more important than them or they can disable dynamic tick.
It is a big mess. I begin to envy the PPC guys who
actually have usable per CPU timers.
-Andi
On Mon, May 02, 2005 at 06:38:21PM +0200, Andi Kleen wrote:
> On Fri, Apr 29, 2005 at 05:26:05PM -0700, Venkatesh Pallipadi wrote:
> >
> > Background:
> > Local APIC timer stops functioning when CPU is in C3 state. As a
> > result the local APIC timer interrupt will fire at uncertain times, depending
> > on how long we spend in C3 state. And this has two side effects
> > * Idle balancing will not happen as we expect it to.
> > * Kernel statistics for idle time will not be proper (as we get less LAPIC
> > interrupts when we are idle). This can result in confusing other parts of
> > kernel (like ondemand cpufreq governor) which depends on this idle stats.
>
> This is absolutely incompatible to no timer tick in IDLE - at least
> my implementation that allows to keep all CPUs indepently in idle,
> not require all CPUs to be idle to turn off the timers.
>
> I had hoped that the hardware was clever enough to keep the
> APIC timers working on SMP systems.
>
> This is very unfortunate :-(( It means you will always
> waste a lot of power on a partially idle system, because
> you cannot keep individual CPUs sleeping for a longer time.
>
Yes. With this solution things will work well with busy or idle systems. But,
when system has some CPUs busy and other CPUs idle, things will be sub-optimal,
as all CPUs will wake up once every jiffy.
But, solution where external timer tick runs at some frequency and then each
individual local APIC timer runs at some other frequency and the external timer
interrupt can be routed to any CPU by the irq routing daemon, I feel this
soultion is somewhat flaky. I mean, I don't think that will be clean solution.
May be I am wrong, but the timer code right now feels very messy to me.
> I ran into the problem on AMD systems with C2 state - in
> my initial implementation there was only the APIC timer for
> any time keeping because it is cheaper to reprogram and
> there are a lot of chipsets which miss HPET - and on the
> BP only the APIC timer was ticking at all, which did not
> work very well. This was avoided by using
> PIT/HPET on the BP again. But on SMP systems there is not
> much choice on the APs.
>
> > it simplifies things and will have other advantages:
> > * Should help dynamick tick as one has to change only global timer interrupt
> > freq with varying jiffies.
>
> Huh? On contrary it completely kills really SMP aware dynamic tick
> implementations.
>
> I would really prefer to find some other way to solve this
>
> Perhaps if we have enough timer sources in HPET we can use the 3-4
> HPET timers on different CPUs. That would work on upto 4 socket
> systems (sibling/dual core can be ignored I guess because for
> power management purposes they dont really exist)
>
> This would be a problem for the few applications that use HPET
> from user space for application timers, but I think power saving
> is more important than them or they can disable dynamic tick.
>
> It is a big mess. I begin to envy the PPC guys who
> actually have usable per CPU timers.
>
Fully agree with you on the mess part :(. Few other options that we had
thought about earlier:
- Have some sort of callbacks while entering/exiting C3, and hand manipulate
Local APIC timer counter to account for the time spent in C3 state. This is
less intrusive change (affects only the system that has C3), but code starts
getting ugly once we have time spent in C3 exceed a jiffy and spans across
multiple jiffies. And we have to have some execute some code to handle all
the lost local APIC timer idle ticks (for the statistics part) and can
increase C3 wakeup latency higher.
- Other simpler solution is to remove idle from cpu_usage_stat and use
(overall time - other accounted time) instead. This doesn't really solve
the problem, but it is a workaround for all the code that depends on
proper idle statistics.
Thanks,
Venki
I thought about it more and i really dislike the broadcast timer
more and more. Zwanes point on creating a lot of contention
on irq0 datastructures is also a very good one.
> Fully agree with you on the mess part :(. Few other options that we had
> thought about earlier:
> - Have some sort of callbacks while entering/exiting C3, and hand manipulate
> Local APIC timer counter to account for the time spent in C3 state. This is
> less intrusive change (affects only the system that has C3), but code starts
> getting ugly once we have time spent in C3 exceed a jiffy and spans across
> multiple jiffies. And we have to have some execute some code to handle all
> the lost local APIC timer idle ticks (for the statistics part) and can
> increase C3 wakeup latency higher.
It is a bit messy agreed, but no timer tick in idle has to do this
anyways. And we need to communicate with the ACPI idle code even
because we need to shorten delays artificially in lower sleep
modi (e.g. in C1 you dont want to sleep for longer than a ms
before waking up and switching into C2)
So given that we need this anyways (and I have it partly coded up
already) I think that is the way to go. The no tick code has
to query the backing time in this case anyways (or rather use the TSC
instead which is local - and I hope is still accurate even after C3)
and fix the timer up. So the infrastructure is there already
and the APIC problem can be handled in the same way.
BTW can you confirm that the APIC timer frequency is stable
over cpufreq changes on your x86-64 CPUs, or does this need
to be handled too?
The only drawback is that these systems will pretty much need
no timer tick in idle then to be reliable - i had hoped
to keep it an experimental option at the beginning; but I guess
we can accelerate it a bit.
So I would propose to go with this variant.
What do you think?
> - Other simpler solution is to remove idle from cpu_usage_stat and use
> (overall time - other accounted time) instead. This doesn't really solve
> the problem, but it is a workaround for all the code that depends on
> proper idle statistics.
What code is that exactly?
In no idle tick I just do the same thing as s390 and accumulate any lost ticks
up in a loop. However I have not done much measurements how affected
the CPU time statistics are.
But when the choice is between better power saving and slightly
less accurate statistics I will prefer better power saving any day - and
the people who really need good statistics are always free to turn
off the power saving, but I doubt that will happen often as long
as the statistics are "good enough". e.g. we can be factor 10 worse
and not be worse than 2.4 with HZ=100. That is a lot of play ground.
-Andi
On Mon, May 02, 2005 at 09:08:50PM +0200, Andi Kleen wrote:
> I thought about it more and i really dislike the broadcast timer
> more and more. Zwanes point on creating a lot of contention
> on irq0 datastructures is also a very good one.
>
Actually, as IRQ0 will become PER_CPU din broadcast, there are no irq0 lock
contention. Only contention that can be there is in scheduler idle balancing,
which should not be an issue as the CPU is idle anyway.
> > Fully agree with you on the mess part :(. Few other options that we had
> > thought about earlier:
> > - Have some sort of callbacks while entering/exiting C3, and hand manipulate
> > Local APIC timer counter to account for the time spent in C3 state. This is
> > less intrusive change (affects only the system that has C3), but code starts
> > getting ugly once we have time spent in C3 exceed a jiffy and spans across
> > multiple jiffies. And we have to have some execute some code to handle all
> > the lost local APIC timer idle ticks (for the statistics part) and can
> > increase C3 wakeup latency higher.
>
> It is a bit messy agreed, but no timer tick in idle has to do this
> anyways. And we need to communicate with the ACPI idle code even
> because we need to shorten delays artificially in lower sleep
> modi (e.g. in C1 you dont want to sleep for longer than a ms
> before waking up and switching into C2)
>
> So given that we need this anyways (and I have it partly coded up
> already) I think that is the way to go. The no tick code has
> to query the backing time in this case anyways (or rather use the TSC
> instead which is local - and I hope is still accurate even after C3)
Unfortunately no :(. TSC will also stop in C3. Myself and John are working on
another patch to fix TSC based gettimeofday() to handle this (atleast in UP
case) It is almost impossible in SMP, as TSCs can go out of sync with C3 on SMP.
So, ACPI PM timer or HPET seem to be the only option for backing time.
> and fix the timer up. So the infrastructure is there already
> and the APIC problem can be handled in the same way.
>
> BTW can you confirm that the APIC timer frequency is stable
> over cpufreq changes on your x86-64 CPUs, or does this need
> to be handled too?
I haven't seen any issues with cpufreq on APIC timer, as APIC timers run
based on FSB.
>
> The only drawback is that these systems will pretty much need
> no timer tick in idle then to be reliable - i had hoped
> to keep it an experimental option at the beginning; but I guess
> we can accelerate it a bit.
>
> So I would propose to go with this variant.
>
> What do you think?
>
OK. I have some code that I had prototyped to fixup local APIC counts in
presence of C3s. I guess we can work together and solve this one faster.
I will send you the patch in a separate mail.
> > - Other simpler solution is to remove idle from cpu_usage_stat and use
> > (overall time - other accounted time) instead. This doesn't really solve
> > the problem, but it is a workaround for all the code that depends on
> > proper idle statistics.
>
> What code is that exactly?
>
> In no idle tick I just do the same thing as s390 and accumulate any lost ticks
> up in a loop. However I have not done much measurements how affected
> the CPU time statistics are.
>
> But when the choice is between better power saving and slightly
> less accurate statistics I will prefer better power saving any day - and
> the people who really need good statistics are always free to turn
> off the power saving, but I doubt that will happen often as long
> as the statistics are "good enough". e.g. we can be factor 10 worse
> and not be worse than 2.4 with HZ=100. That is a lot of play ground.
cpufreq_ondemand governor depends on the idle statistics. And due to the
wrong idle statistics, the governor will keep the CPU frequency at maximum,
loosing all the power advantages of cpufreq. So, question is not a simple
power savings against accurate atatistics. Accurate statistics is related
to power savings as well...
Thanks,
Venki
Hi!
> > Is there way to get the all the patches in mm, so that I can try same
> > Kernel and reproduce this failure?
>
> I'm reluctant to do that, because the -mm lineup is usually a hysterical
> pile of crap - you wouldn't believe what people send me. I actually do a
> lot of testing, despite appearances ;)
Actually having -mm tree available in near realtime would be quite nice. I don't
think I'd run it reguraly, and "other patches from you are ..." in commit confirmation
is very usefull, but seeing patches from other people, too, seems interesting.
Pavel
--
64 bytes from 195.113.31.123: icmp_seq=28 ttl=51 time=448769.1 ms
On Mon, May 02, 2005 at 01:27:37PM -0700, Venkatesh Pallipadi wrote:
> On Mon, May 02, 2005 at 09:08:50PM +0200, Andi Kleen wrote:
> > I thought about it more and i really dislike the broadcast timer
> > more and more. Zwanes point on creating a lot of contention
> > on irq0 datastructures is also a very good one.
> >
>
> Actually, as IRQ0 will become PER_CPU din broadcast, there are no irq0 lock
> contention. Only contention that can be there is in scheduler idle balancing,
> which should not be an issue as the CPU is idle anyway.
>
> > > Fully agree with you on the mess part :(. Few other options that we had
> > > thought about earlier:
> > > - Have some sort of callbacks while entering/exiting C3, and hand manipulate
> > > Local APIC timer counter to account for the time spent in C3 state. This is
> > > less intrusive change (affects only the system that has C3), but code starts
> > > getting ugly once we have time spent in C3 exceed a jiffy and spans across
> > > multiple jiffies. And we have to have some execute some code to handle all
> > > the lost local APIC timer idle ticks (for the statistics part) and can
> > > increase C3 wakeup latency higher.
> >
> > It is a bit messy agreed, but no timer tick in idle has to do this
> > anyways. And we need to communicate with the ACPI idle code even
> > because we need to shorten delays artificially in lower sleep
> > modi (e.g. in C1 you dont want to sleep for longer than a ms
> > before waking up and switching into C2)
> >
> > So given that we need this anyways (and I have it partly coded up
> > already) I think that is the way to go. The no tick code has
> > to query the backing time in this case anyways (or rather use the TSC
> > instead which is local - and I hope is still accurate even after C3)
>
> Unfortunately no :(. TSC will also stop in C3. Myself and John are working on
> another patch to fix TSC based gettimeofday() to handle this (atleast in UP
> case) It is almost impossible in SMP, as TSCs can go out of sync with C3 on SMP.
>
> So, ACPI PM timer or HPET seem to be the only option for backing time.
Ok, that is ok too, just slower. May need some heuristics to not do
it that often in this case, but I guess C3 is long enough that
adding the additional overhead for HPET/PM is not that bad.
>
> cpufreq_ondemand governor depends on the idle statistics. And due to the
> wrong idle statistics, the governor will keep the CPU frequency at maximum,
> loosing all the power advantages of cpufreq. So, question is not a simple
> power savings against accurate atatistics. Accurate statistics is related
> to power savings as well...
I guess it just needs to know how long the machine is idle. That is
relatively easy to do anyways by just accumulating a counter when
the timer is fixed up.
-Andi
On Fri, 2005-04-29 at 22:43, Pallipadi, Venkatesh wrote:
> The patch as it is should handle 8259 case using the regular APIC
> timer. It only adds broadcast when IOAPIC is used for timer
> interrupt.
While they don't need the broadcast capability of this patch,
uniprocessors do need the change to stop using LAPIC timer
if they support C3 (as virtually all laptops do).
It was a mistake to allow using the LOC on a uniprocessor
in the first place -- as the UP system runs perfectly fine
with timers coming in on IRQ0 and doesn't need another interrupt.
Re: SMP using i8259
While Linux in ACPI mode allows "noapic" on SMP, it isn't recommended.
It is there for comparisons, debugging, and to work-around the odd
broken system. It is an exception configuration, and supporting it
should in no way impact the design for other 99.99% normal systems.
Indeed, note that SMP systems using i8259 instead of IOAPIC
is explicity forbidden by MPS, and thus would probably fail
the compatibility test for your favorite high volume binary OS.
-Len
Re: no idle tick
Idle power savings does not by itself justify HZ=0.
We'll get the same idle power consumption with HZ=1.
Indeed, within measurement error, we'll get the same
idle power consumption with HZ=10.
Linux should probably default to HZ=100, and have
the capability to speed up to HZ=1000 at run-time
if applications request it; and it should slow down
to HZ=10 in deep idle.
If we keep HZ=10 in idle rather than going all
the way to HZ=0, it allows the C-state promotion code
to work without any special cases to wake the system
when idle just to promote to a deeper C-state --
i.e. like it works today.
Re: multiple LAPIC rates on SMP
This concept doesn't work when it is needed (C3)
and isn't needed when it works (C1/C2).
This is because the LAPIC timer stops in C3,
and the latencies in C1/C2 are so low that
it doesn't matter what the tick rate is.
Re: using TSC to patch things up
Nope. TSC is variable on some processors with P-states,
and on some processors it stops in C3.
I'm not happy about this reality either.
Re: LAPIC timer vs P-states
On the systems I'm aware of, LAPIC timer is based
on the bus speed rather than the core speed. So
today it should be constant or zero -- that is until
some HW guy decides to throttle the bus at run-time
to save power. Based on the history of the TSC --
frozen in C3 and sometimes variable with MHz changes;
it would not surprise me a bit to see the LAPIC, now
frozen in C3, become variable in some future power
saving state that varies bus speed.
Re: re-calibrating the un-frozen LAPIC timer
I think we're on thin-ice if we endeavor to continue
to use the LAPIC timer. The multiple LAPIC rates
on SMP concept is defunct (above), so the only benefit
of using the LAPIC timer is that it is lower latency
to re-program it when we re-program the global rate.
But then we have to do this on all logical processors
and we have to add the code correct it with a
stable reference time-source.
This must be compared to simply using the stable
reference time-source in the first place, and perhaps
not changing its rate as frequently.
Re: what to do?
A proposal:
1. disable LAPIC timer use on uni-processor
it adds no value, and breaks if C3 is supported.
2. disable LAPIC timer use on SMP, via
Venki's timer broadcast patch, or similar.
3. Transparently use HZ=10 in "deep idle"
This can be done the same way that C-state
promotions are done -- when we recognize
that we're still idle after a long time,
take steps to get into a deeper state.
eg. we might say that entry to C3 or C4
is "deep idle", or better yet, we might
base this on the advertised latency of
the C-states since low latency states will
not notice clock ticks and high-latency
states will become ineffective if ticks
are too frequent.
4. Apply "boot-time dynamic HZ" patch, and default
to hz=100.
5. Move to real "run-time dynamic HZ" where the
system HZ can be changed by programs that need
it changed.
thoughts?
-Len
On Thu, May 05, 2005 at 12:16:21AM -0400, Brown, Len wrote:
> On Fri, 2005-04-29 at 22:43, Pallipadi, Venkatesh wrote:
>
> > The patch as it is should handle 8259 case using the regular APIC
> > timer. It only adds broadcast when IOAPIC is used for timer
> > interrupt.
>
> While they don't need the broadcast capability of this patch,
> uniprocessors do need the change to stop using LAPIC timer
> if they support C3 (as virtually all laptops do).
> It was a mistake to allow using the LOC on a uniprocessor
> in the first place -- as the UP system runs perfectly fine
> with timers coming in on IRQ0 and doesn't need another interrupt.
Yes I agree. It does not make much sense to run two timers
on a CPU. I already changed that in my x86-64 tree. i386 will
hopefully follow at some point.
-Andi
On Thu, May 05, 2005 at 01:33:33AM -0400, Brown, Len wrote:
> Re: no idle tick
>
> Idle power savings does not by itself justify HZ=0.
> We'll get the same idle power consumption with HZ=1.
Power consumption is not the only reason for no tick in idle.
The other big one is virtualization. If you have lots
of virtual machines running on a hypervisor you really
dont want them to wake up regularly even when idle.
I would advise you have a talk about this with
the people in your company who work in this area ,-)
> Linux should probably default to HZ=100, and have
> the capability to speed up to HZ=1000 at run-time
> if applications request it; and it should slow down
> to HZ=10 in deep idle.
There is no interface to request this.
Also for this you need all the infrastructure for a fully dynamic
tick already, and when you have that you can go the full way.
But currently disabling the timer tick in non idle is still
some time away, in particular the CPU statistics support
in generic code needs quite some revamp for this. On the other
hand no tick in idle is mostly possible (biggest left often
issues seem to be RCU and interaction with ACPI on x86)
> If we keep HZ=10 in idle rather than going all
> the way to HZ=0, it allows the C-state promotion code
> to work without any special cases to wake the system
> when idle just to promote to a deeper C-state --
> i.e. like it works today.
I think the C state code needs improvements anyways,
and when it gets that there is not much reason to not
let it talk to the timer code.
The current (somewhat dumb) C promotion algorithm
can be made to work with no-timer-tick-in-idle
by giving a simply hint to the delay timer code
to not delay longer than the timeout of the current
C state (and so delay in a staircase pattern
for longer and longer)
But I hope in future a better algorithm can be found
that takes into account how busy the machine is on average and
uses that as an estimate to predict which C state should
be used next. e.g. a most idle box that only occasionally
wakes up can go back directly into C2 or C3 again, no
need to go through C1.
Thomas is working in this area.
>
> Re: multiple LAPIC rates on SMP
>
> This concept doesn't work when it is needed (C3)
> and isn't needed when it works (C1/C2).
>
> This is because the LAPIC timer stops in C3,
> and the latencies in C1/C2 are so low that
> it doesn't matter what the tick rate is.
First we need it anyways for the virtualization case
(where idle CPUs need to be fully idle)
and then it does not make much sense to me to wake up
that often and eat power when not needed.
> Re: using TSC to patch things up
>
> Nope. TSC is variable on some processors with P-states,
> and on some processors it stops in C3.
But it is constant on others, and on those it can
be a valuable speedup. And variable TSC can be dealt
with with some infrastructure work. With cpufreq
we know the current frequency (except in thermal throttle,
but ignoring that is quite reasonable imho). Ok there
is APM with SMM code that does it behind your back ,
but that can be probably ignored too now ,-) On x86-64
all these wards dont exist anyways.
The missing infrastructure is that the timer interrupt code
needs to be rerun when the frequency changes to fix
up the TSC base and the TSC base needs to be kept per CPU,
not global. I hope to get this done eventually on x86-64
at least.
Using is especially important on systems without HPET
(which is unfortunately missing in some modern high volume chipsets :-(((=,
because reprogramming the PIT all the time is really slow and
using the TSC helps doing that less often.
> On the systems I'm aware of, LAPIC timer is based
> on the bus speed rather than the core speed. So
> today it should be constant or zero -- that is until
> some HW guy decides to throttle the bus at run-time
> to save power. Based on the history of the TSC --
> frozen in C3 and sometimes variable with MHz changes;
> it would not surprise me a bit to see the LAPIC, now
> frozen in C3, become variable in some future power
> saving state that varies bus speed.
cpufreq callbacks can deal with that.
>
> Re: re-calibrating the un-frozen LAPIC timer
>
> I think we're on thin-ice if we endeavor to continue
> to use the LAPIC timer. The multiple LAPIC rates
> on SMP concept is defunct (above), so the only benefit
> of using the LAPIC timer is that it is lower latency
> to re-program it when we re-program the global rate.
> But then we have to do this on all logical processors
> and we have to add the code correct it with a
> stable reference time-source.
I disagree. LAPIC is still the only sane choice for
a per CPU timer on a SMP system. On UP they are not needed,
but I suspect UP only laptops will eventually die out
when everything new becomes dual core ..
> 1. disable LAPIC timer use on uni-processor
> it adds no value, and breaks if C3 is supported.
I did that already on x86-64.
> 2. disable LAPIC timer use on SMP, via
> Venki's timer broadcast patch, or similar.
Strongly disagreed.
> 3. Transparently use HZ=10 in "deep idle"
See above.
-Andi
On Thu, 5 May 2005, Len Brown wrote:
> Re: SMP using i8259
> While Linux in ACPI mode allows "noapic" on SMP, it isn't recommended.
> It is there for comparisons, debugging, and to work-around the odd
> broken system. It is an exception configuration, and supporting it
> should in no way impact the design for other 99.99% normal systems.
>
> Indeed, note that SMP systems using i8259 instead of IOAPIC
> is explicity forbidden by MPS, and thus would probably fail
> the compatibility test for your favorite high volume binary OS.
I'm not quite sure if that's forbidden by MPS -- the mixed mode certainly
is not as not all interrupt sources may necessarily be routed to one of
I/O APICs, and "noapic" can probably be treated as a special case of the
mixed mode. Regardless, there used to be systems in existence that
wouldn't route IRQ 0 to an I/O APIC, so using that interrupt requires
either the mixed mode or using the "through-i8259A" trick (which
unfortunately does not work for a subset of affected systems as a result
of manufacturers implementing the Intel-recommended glue logic at the
output of the master i8259).
Maciej
Venkatesh Pallipadi wrote:
> Background:
> Local APIC timer stops functioning when CPU is in C3 state. As a
> result the local APIC timer interrupt will fire at uncertain times, depending
> on how long we spend in C3 state. And this has two side effects
> * Idle balancing will not happen as we expect it to.
> * Kernel statistics for idle time will not be proper (as we get less LAPIC
> interrupts when we are idle). This can result in confusing other parts of
> kernel (like ondemand cpufreq governor) which depends on this idle stats.
>
>
> Proposed Fix:
> Attached is a prototype patch, that tries to eliminate the dependency on
> local APIC timer for update_process_times(). The patch gets rid of Local APIC
> timer altogether. We use the timer interrupt (IRQ 0) configured in
> broadcast mode in IOAPIC instead (Doesn't work with 8259).
> As changing anything related to basic timer interrupt is a little bit risky,
> I have a boot parameter currently ("useapictimer") to switch back to original
> local APIC timer way of doing things.
>
> This may seem like a overkill to solve this particular problem. But, I feel
> it simplifies things and will have other advantages:
> * Should help dynamick tick as one has to change only global timer interrupt
> freq with varying jiffies.
> * Reduces one interrupt per jiffy.
> * One less interrupt source to worry about.
>
Sorry I missed this when it came out, but, I think a better way to handle this
is to use an IPI. In the non-VST case it can be to all but self, while when a
given cpu is sleeping we can not send it to that cpu. The advantages are:
1) The broadcast has a race/ contention on the xtime lock. The read lock is
needed by all and the write lock taken by one. The IPI should be sent AFTER the
xtime write unlock.
2) It is easy to prune VST sleeping cpus from the list of all to wake.
--
George Anzinger [email protected]
High-res-timers: http://sourceforge.net/projects/high-res-timers/
* Andi Kleen <[email protected]> [050505 05:21]:
> On Thu, May 05, 2005 at 01:33:33AM -0400, Brown, Len wrote:
> > Re: no idle tick
> >
> > Idle power savings does not by itself justify HZ=0.
> > We'll get the same idle power consumption with HZ=1.
>
>
> Power consumption is not the only reason for no tick in idle.
> The other big one is virtualization. If you have lots
> of virtual machines running on a hypervisor you really
> dont want them to wake up regularly even when idle.
Yes, and although this is x86/x86_64 thread, I'd like to point out
that embedded systems can benefit from skipping ticks for several
seconds at a time.
There's no need for ticks on embedded systems until some event,
such as an interrupt happens.
Cheers,
Tony