[PATCH] s390: no timer interrupts in idle.
From: Martin Schwidefsky <[email protected]>
This patch adds a system control that allows switching off the jiffies
timer interrupts while a cpu sleeps in idle. This is useful for a system
running with virtual cpus under z/VM.
diffstat:
arch/s390/Kconfig | 19 +++++
arch/s390/defconfig | 1
arch/s390/kernel/process.c | 10 +-
arch/s390/kernel/time.c | 158 ++++++++++++++++++++++++++++++++++++++-------
arch/s390/kernel/traps.c | 4 -
include/linux/rcupdate.h | 28 +++++++
include/linux/sysctl.h | 1
include/linux/timer.h | 4 +
kernel/rcupdate.c | 2
kernel/sysctl.c | 14 +++
kernel/timer.c | 69 +++++++++++++++++++
11 files changed, 281 insertions(+), 29 deletions(-)
diff -urN linux-2.6/arch/s390/Kconfig linux-2.6-s390/arch/s390/Kconfig
--- linux-2.6/arch/s390/Kconfig Wed Apr 21 16:29:40 2004
+++ linux-2.6-s390/arch/s390/Kconfig Wed Apr 21 16:29:41 2004
@@ -333,6 +333,25 @@
This can also be compiled as a module, which will be called
appldata_net_sum.o.
+config NO_IDLE_HZ
+ bool "No HZ timer ticks in idle"
+ help
+ Switches the regular HZ timer off when the system is going idle.
+ This helps z/VM to detect that the Linux system is idle. VM can
+ then "swap-out" this guest which reduces memory usage. It also
+ reduces the overhead of idle systems.
+
+ The HZ timer can be switched on/off via /proc/sys/kernel/hz_timer.
+ hz_timer=0 means HZ timer is disabled. hz_timer=1 means HZ
+ timer is active.
+
+config NO_IDLE_HZ_INIT
+ bool "HZ timer in idle off by default"
+ depends on NO_IDLE_HZ
+ help
+ The HZ timer is switched off in idle by default. That means the
+ HZ timer is already disabled at boot time.
+
endmenu
config PCMCIA
diff -urN linux-2.6/arch/s390/defconfig linux-2.6-s390/arch/s390/defconfig
--- linux-2.6/arch/s390/defconfig Wed Apr 21 16:29:41 2004
+++ linux-2.6-s390/arch/s390/defconfig Wed Apr 21 16:29:41 2004
@@ -83,6 +83,7 @@
# CONFIG_SHARED_KERNEL is not set
# CONFIG_CMM is not set
# CONFIG_VIRT_TIMER is not set
+# CONFIG_NO_IDLE_HZ is not set
# CONFIG_PCMCIA is not set
#
diff -urN linux-2.6/arch/s390/kernel/process.c linux-2.6-s390/arch/s390/kernel/process.c
--- linux-2.6/arch/s390/kernel/process.c Wed Apr 21 16:29:15 2004
+++ linux-2.6-s390/arch/s390/kernel/process.c Wed Apr 21 16:29:41 2004
@@ -40,7 +40,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/irq.h>
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
#include <asm/timer.h>
#endif
@@ -75,17 +75,21 @@
psw_t wait_psw;
unsigned long reg;
+ local_irq_disable();
if (need_resched()) {
+ local_irq_enable();
schedule();
return;
}
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
/*
* hook to stop timers that should not tick while CPU is idle
*/
- if (stop_timers())
+ if (stop_timers()) {
+ local_irq_enable();
return;
+ }
#endif
/*
diff -urN linux-2.6/arch/s390/kernel/time.c linux-2.6-s390/arch/s390/kernel/time.c
--- linux-2.6/arch/s390/kernel/time.c Wed Apr 21 16:29:40 2004
+++ linux-2.6-s390/arch/s390/kernel/time.c Wed Apr 21 16:29:41 2004
@@ -331,29 +331,6 @@
return 0;
}
-void do_monitor_call(struct pt_regs *regs, long interruption_code)
-{
- /* disable monitor call class 0 */
- __ctl_clear_bit(8, 15);
-
- start_cpu_timer();
-}
-
-/*
- * called from cpu_idle to stop any timers
- * returns 1 if CPU should not be stopped
- */
-int stop_timers(void)
-{
- if (stop_cpu_timer())
- return 1;
-
- /* enable monitor call class 0 */
- __ctl_set_bit(8, 15);
-
- return 0;
-}
-
void set_vtimer(__u64 expires)
{
asm volatile ("SPT %0" : : "m" (expires));
@@ -474,6 +451,141 @@
}
#endif
+#ifdef CONFIG_NO_IDLE_HZ
+
+cpumask_t idle_cpu_mask = CPU_MASK_NONE;
+
+#ifdef CONFIG_NO_IDLE_HZ_INIT
+int sysctl_hz_timer = 0;
+#else
+int sysctl_hz_timer = 1;
+#endif
+
+/*
+ * Start the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+void start_hz_timer(struct pt_regs *regs)
+{
+ __u64 tmp;
+ __u32 ticks;
+
+ if (!cpu_isset(smp_processor_id(), idle_cpu_mask))
+ return;
+
+ /* Calculate how many ticks have passed */
+ asm volatile ("STCK 0(%0)" : : "a" (&tmp) : "memory", "cc");
+ tmp = tmp + CLK_TICKS_PER_JIFFY - S390_lowcore.jiffy_timer;
+ ticks = __calculate_ticks(tmp);
+ S390_lowcore.jiffy_timer += CLK_TICKS_PER_JIFFY * (__u64) ticks;
+
+ /* Set the clock comparator to the next tick. */
+ tmp = S390_lowcore.jiffy_timer + CPU_DEVIATION;
+ asm volatile ("SCKC %0" : : "m" (tmp));
+
+ /* Charge the ticks. */
+ if (ticks > 0) {
+#ifdef CONFIG_SMP
+ /*
+ * Do not rely on the boot cpu to do the calls to do_timer.
+ * Spread it over all cpus instead.
+ */
+ write_seqlock(&xtime_lock);
+ if (S390_lowcore.jiffy_timer > xtime_cc) {
+ __u32 xticks;
+
+ tmp = S390_lowcore.jiffy_timer - xtime_cc;
+ if (tmp >= 2*CLK_TICKS_PER_JIFFY) {
+ xticks = __calculate_ticks(tmp);
+ xtime_cc += (__u64) xticks*CLK_TICKS_PER_JIFFY;
+ } else {
+ xticks = 1;
+ xtime_cc += CLK_TICKS_PER_JIFFY;
+ }
+ while (xticks--)
+ do_timer(regs);
+ }
+ write_sequnlock(&xtime_lock);
+ while (ticks--)
+ update_process_times(user_mode(regs));
+#else
+ while (ticks--)
+ do_timer(regs);
+#endif
+ }
+ cpu_clear(smp_processor_id(), idle_cpu_mask);
+}
+
+/*
+ * Stop the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+int stop_hz_timer(void)
+{
+ __u64 timer;
+
+ if (sysctl_hz_timer != 0)
+ return 1;
+
+ /*
+ * Leave the clock comparator set up for the next timer
+ * tick if either rcu or a softirq is pending.
+ */
+ if (rcu_pending(smp_processor_id()) || local_softirq_pending())
+ return 1;
+
+ /*
+ * This cpu is going really idle. Set up the clock comparator
+ * for the next event.
+ */
+ cpu_set(smp_processor_id(), idle_cpu_mask);
+ timer = (__u64) (next_timer_interrupt() - jiffies) + jiffies_64;
+ timer = jiffies_timer_cc + timer * CLK_TICKS_PER_JIFFY;
+ asm volatile ("SCKC %0" : : "m" (timer));
+
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
+
+void do_monitor_call(struct pt_regs *regs, long interruption_code)
+{
+ /* disable monitor call class 0 */
+ __ctl_clear_bit(8, 15);
+
+#ifdef CONFIG_VIRT_TIMER
+ start_cpu_timer();
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+ start_hz_timer(regs);
+#endif
+}
+
+/*
+ * called from cpu_idle to stop any timers
+ * returns 1 if CPU should not be stopped
+ */
+int stop_timers(void)
+{
+#ifdef CONFIG_VIRT_TIMER
+ if (stop_cpu_timer())
+ return 1;
+#endif
+
+#ifdef CONFIG_NO_IDLE_HZ
+ if (stop_hz_timer())
+ return 1;
+#endif
+
+ /* enable monitor call class 0 */
+ __ctl_set_bit(8, 15);
+
+ return 0;
+}
+
+#endif
+
/*
* Start the clock comparator and the virtual CPU timer
* on the current CPU.
diff -urN linux-2.6/arch/s390/kernel/traps.c linux-2.6-s390/arch/s390/kernel/traps.c
--- linux-2.6/arch/s390/kernel/traps.c Sun Apr 4 05:36:55 2004
+++ linux-2.6-s390/arch/s390/kernel/traps.c Wed Apr 21 16:29:41 2004
@@ -64,7 +64,7 @@
extern void pfault_interrupt(struct pt_regs *regs, __u16 error_code);
static ext_int_info_t ext_int_pfault;
#endif
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_VIRT_TIMER)
extern pgm_check_handler_t do_monitor_call;
#endif
@@ -631,7 +631,7 @@
#endif /* CONFIG_ARCH_S390X */
pgm_check_table[0x15] = &operand_exception;
pgm_check_table[0x1C] = &privileged_op;
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
pgm_check_table[0x40] = &do_monitor_call;
#endif
if (MACHINE_IS_VM) {
diff -urN linux-2.6/include/linux/rcupdate.h linux-2.6-s390/include/linux/rcupdate.h
--- linux-2.6/include/linux/rcupdate.h Sun Apr 4 05:36:52 2004
+++ linux-2.6-s390/include/linux/rcupdate.h Wed Apr 21 16:29:41 2004
@@ -121,6 +121,34 @@
return 0;
}
+#ifdef CONFIG_NO_IDLE_HZ
+
+extern cpumask_t idle_cpu_mask;
+
+/*
+ * RCU is build for ticking systems. Without the HZ timer
+ * we have not enought state changes which may result in a
+ * never finished RCU request.
+ * In a tickless system we don't want to wake idle CPUs just
+ * to finish the RCU request. That is possible because the
+ * idle CPUs satisfy the quiescilant RCU condition anyway.
+ */
+static inline void rcu_set_active_cpu_map(cpumask_t *mask)
+{
+ cpumask_t active = idle_cpu_mask;
+ cpus_complement(active);
+ cpus_and(*mask, cpu_online_map, active);
+}
+
+#else
+
+static inline void rcu_set_active_cpu_map(cpumask_t *mask)
+{
+ *mask = cpu_online_map;
+}
+
+#endif
+
#define rcu_read_lock() preempt_disable()
#define rcu_read_unlock() preempt_enable()
diff -urN linux-2.6/include/linux/sysctl.h linux-2.6-s390/include/linux/sysctl.h
--- linux-2.6/include/linux/sysctl.h Wed Apr 21 16:29:19 2004
+++ linux-2.6-s390/include/linux/sysctl.h Wed Apr 21 16:29:41 2004
@@ -132,6 +132,7 @@
KERN_PTY=62, /* dir: pty driver */
KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */
KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
+ KERN_S390_HZ_TIMER=64, /* int: hz timer on or off */
};
diff -urN linux-2.6/include/linux/timer.h linux-2.6-s390/include/linux/timer.h
--- linux-2.6/include/linux/timer.h Sun Apr 4 05:37:37 2004
+++ linux-2.6-s390/include/linux/timer.h Wed Apr 21 16:29:41 2004
@@ -65,6 +65,10 @@
extern int __mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
+#ifdef CONFIG_NO_IDLE_HZ
+extern unsigned long next_timer_interrupt(void);
+#endif
+
/***
* add_timer - start a timer
* @timer: the timer to be added
diff -urN linux-2.6/kernel/rcupdate.c linux-2.6-s390/kernel/rcupdate.c
--- linux-2.6/kernel/rcupdate.c Wed Apr 21 16:29:19 2004
+++ linux-2.6-s390/kernel/rcupdate.c Wed Apr 21 16:29:41 2004
@@ -111,7 +111,7 @@
return;
}
/* Can't change, since spin lock held. */
- rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
+ rcu_set_active_cpu_map(&rcu_ctrlblk.rcu_cpu_mask);
}
/*
diff -urN linux-2.6/kernel/sysctl.c linux-2.6-s390/kernel/sysctl.c
--- linux-2.6/kernel/sysctl.c Wed Apr 21 16:29:19 2004
+++ linux-2.6-s390/kernel/sysctl.c Wed Apr 21 16:29:41 2004
@@ -108,6 +108,10 @@
extern int sysctl_userprocess_debug;
#endif
+#ifdef CONFIG_NO_IDLE_HZ
+extern int sysctl_hz_timer;
+#endif
+
#if defined(CONFIG_PPC32) && defined(CONFIG_6xx)
extern unsigned long powersave_nap;
int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
@@ -573,6 +577,16 @@
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+ {
+ .ctl_name = KERN_S390_HZ_TIMER,
+ .procname = "hz_timer",
+ .data = &sysctl_hz_timer,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#endif
{
.ctl_name = KERN_S390_USER_DEBUG_LOGGING,
diff -urN linux-2.6/kernel/timer.c linux-2.6-s390/kernel/timer.c
--- linux-2.6/kernel/timer.c Wed Apr 21 16:29:19 2004
+++ linux-2.6-s390/kernel/timer.c Wed Apr 21 16:29:41 2004
@@ -428,6 +428,75 @@
spin_unlock_irq(&base->lock);
}
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Find out when the next timer event is due to happen. This
+ * is used on S/390 to stop all activity when a cpu is idle.
+ * This function needs to be called with interrupts disabled.
+ */
+unsigned long next_timer_interrupt(void)
+{
+ tvec_base_t *base;
+ struct list_head *list;
+ struct timer_list *nte;
+ unsigned long expires;
+ tvec_t *varray[4];
+ int i, j;
+
+ base = &__get_cpu_var(tvec_bases);
+ spin_lock(&base->lock);
+ expires = base->timer_jiffies + (LONG_MAX >> 1);
+ list = 0;
+
+ /* Look for timer events in tv1. */
+ j = base->timer_jiffies & TVR_MASK;
+ do {
+ list_for_each_entry(nte, base->tv1.vec + j, entry) {
+ expires = nte->expires;
+ if (j < (base->timer_jiffies & TVR_MASK))
+ list = base->tv2.vec + (INDEX(0));
+ goto found;
+ }
+ j = (j + 1) & TVR_MASK;
+ } while (j != (base->timer_jiffies & TVR_MASK));
+
+ /* Check tv2-tv5. */
+ varray[0] = &base->tv2;
+ varray[1] = &base->tv3;
+ varray[2] = &base->tv4;
+ varray[3] = &base->tv5;
+ for (i = 0; i < 4; i++) {
+ j = INDEX(i);
+ do {
+ if (list_empty(varray[i]->vec + j)) {
+ j = (j + 1) & TVN_MASK;
+ continue;
+ }
+ list_for_each_entry(nte, varray[i]->vec + j, entry)
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ if (j < (INDEX(i)) && i < 3)
+ list = varray[i + 1]->vec + (INDEX(i + 1));
+ goto found;
+ } while (j != (INDEX(i)));
+ }
+found:
+ if (list) {
+ /*
+ * The search wrapped. We need to look at the next list
+ * from next tv element that would cascade into tv element
+ * where we found the timer element.
+ */
+ list_for_each_entry(nte, list, entry) {
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ }
+ }
+ spin_unlock(&base->lock);
+ return expires;
+}
+#endif
+
/******************************************************************/
/*
> +++ linux-2.6-s390/include/linux/rcupdate.h Wed Apr 21 16:29:41 2004
> @@ -121,6 +121,34 @@
> return 0;
> }
>
> +#ifdef CONFIG_NO_IDLE_HZ
> +
> +extern cpumask_t idle_cpu_mask;
> +
> +/*
> + * RCU is build for ticking systems. Without the HZ timer
> + * we have not enought state changes which may result in a
> + * never finished RCU request.
> + * In a tickless system we don't want to wake idle CPUs just
> + * to finish the RCU request. That is possible because the
> + * idle CPUs satisfy the quiescilant RCU condition anyway.
> + */
> +static inline void rcu_set_active_cpu_map(cpumask_t *mask)
> +{
> + cpumask_t active = idle_cpu_mask;
> + cpus_complement(active);
> + cpus_and(*mask, cpu_online_map, active);
> +}
> +
> +#else
> +
> +static inline void rcu_set_active_cpu_map(cpumask_t *mask)
> +{
> + *mask = cpu_online_map;
> +}
> +
> +#endif
This is a bit ugly. What about inlining the CONFIG_NO_IDLE_HZ case
of this function in its only caller and defining idle_cpu_mask as
an empty cpu mask for all other arches?
> --- linux-2.6/include/linux/sysctl.h Wed Apr 21 16:29:19 2004
> +++ linux-2.6-s390/include/linux/sysctl.h Wed Apr 21 16:29:41 2004
> @@ -132,6 +132,7 @@
> KERN_PTY=62, /* dir: pty driver */
> KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */
> KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
> + KERN_S390_HZ_TIMER=64, /* int: hz timer on or off */
Kill the S390; this seems useful for a bunch of other architectures.
> --- linux-2.6/include/linux/timer.h Sun Apr 4 05:37:37 2004
> +++ linux-2.6-s390/include/linux/timer.h Wed Apr 21 16:29:41 2004
> @@ -65,6 +65,10 @@
> extern int __mod_timer(struct timer_list *timer, unsigned long expires);
> extern int mod_timer(struct timer_list *timer, unsigned long expires);
>
> +#ifdef CONFIG_NO_IDLE_HZ
> +extern unsigned long next_timer_interrupt(void);
> +#endif
kill the ifdef. externs don't need to be cpp'ed away.
> --- linux-2.6/kernel/sysctl.c Wed Apr 21 16:29:19 2004
> +++ linux-2.6-s390/kernel/sysctl.c Wed Apr 21 16:29:41 2004
> @@ -108,6 +108,10 @@
> extern int sysctl_userprocess_debug;
> #endif
>
> +#ifdef CONFIG_NO_IDLE_HZ
> +extern int sysctl_hz_timer;
Ditto.
On Wed, Apr 21, 2004 at 06:31:08PM +0200, Martin Schwidefsky wrote:
> > This is a bit ugly. What about inlining the CONFIG_NO_IDLE_HZ case
> > of this function in it's only caller and define idle_cpu_mask to
> > an empty cpu mask for all other arches?
>
> This would mean that all other arches need to do the above three
> statements in rcu_start_batch. If this is acceptable we certainly
> can introduce a global idle_cpu_mask. Where? sched.c?
My hope was gcc would actually optimize it away if it was a CPP constant
instead of a variable.
Hi Christoph,
> > +static inline void rcu_set_active_cpu_map(cpumask_t *mask)
> > +{
> > + cpumask_t active = idle_cpu_mask;
> > + cpus_complement(active);
> > + cpus_and(*mask, cpu_online_map, active);
> > +}
>
> This is a bit ugly. What about inlining the CONFIG_NO_IDLE_HZ case
> of this function in it's only caller and define idle_cpu_mask to
> an empty cpu mask for all other arches?
This would mean that all other arches need to do the above three
statements in rcu_start_batch. If this is acceptable we certainly
can introduce a global idle_cpu_mask. Where? sched.c?
> > + KERN_S390_HZ_TIMER=64, /* int: hz timer on or off */
>
> Kill the S390, this seems usefull for a bunch of other architectures.
Ok, makes sense.
> > +#ifdef CONFIG_NO_IDLE_HZ
> > +extern unsigned long next_timer_interrupt(void);
> > +#endif
>
> kill the ifdef. externs don't need to be cpp'ed away.
Aye, aye, sir ;-)
blue skies,
Martin
Linux/390 Design & Development, IBM Deutschland Entwicklung GmbH
Schönaicherstr. 220, D-71032 Böblingen, Telefon: 49 - (0)7031 - 16-2247
E-Mail: [email protected]
> > This would mean that all other arches need to do the above three
> > statements in rcu_start_batch. If this is acceptable we certainly
> > can introduce a global idle_cpu_mask. Where? sched.c?
>
> My hope was gcc would actually optimize it away if it was a CPP constant
> instead of a variable.
Now I got it. You want to introduce a generic idle_cpu_mask which is a
#define to CPU_MASK_NONE and only an exploiter would use a real variable.
This is just a matter of testing. I'll give it a try.
blue skies,
Martin
Linux/390 Design & Development, IBM Deutschland Entwicklung GmbH
Schönaicherstr. 220, D-71032 Böblingen, Telefon: 49 - (0)7031 - 16-2247
E-Mail: [email protected]
Hi Christoph,
new patch with your suggestions. I compiled a kernel for i386. The
code for rcu_start_batch isn't identical but it didn't look too
bad either. I still think an #ifdef in rcu_start_batch would help.
Any more comments ?
blue skies,
Martin.
---
[PATCH] s390: no timer interrupts in idle.
From: Martin Schwidefsky <[email protected]>
This patch adds a system control that allows switching off the jiffies
timer interrupts while a cpu sleeps in idle. This is useful for a system
running with virtual cpus under z/VM.
diffstat:
arch/s390/Kconfig | 19 +++++
arch/s390/defconfig | 1
arch/s390/kernel/process.c | 10 +-
arch/s390/kernel/time.c | 158 ++++++++++++++++++++++++++++++++++++++-------
arch/s390/kernel/traps.c | 4 -
include/asm-s390/cpumask.h | 3
include/linux/sysctl.h | 1
include/linux/timer.h | 2
kernel/rcupdate.c | 9 ++
kernel/sysctl.c | 12 +++
kernel/timer.c | 69 +++++++++++++++++++
11 files changed, 259 insertions(+), 29 deletions(-)
diff -urN linux-2.6/arch/s390/Kconfig linux-2.6-s390/arch/s390/Kconfig
--- linux-2.6/arch/s390/Kconfig Wed Apr 21 20:25:32 2004
+++ linux-2.6-s390/arch/s390/Kconfig Wed Apr 21 20:25:33 2004
@@ -333,6 +333,25 @@
This can also be compiled as a module, which will be called
appldata_net_sum.o.
+config NO_IDLE_HZ
+ bool "No HZ timer ticks in idle"
+ help
+ Switches the regular HZ timer off when the system is going idle.
+ This helps z/VM to detect that the Linux system is idle. VM can
+ then "swap-out" this guest which reduces memory usage. It also
+ reduces the overhead of idle systems.
+
+ The HZ timer can be switched on/off via /proc/sys/kernel/hz_timer.
+ hz_timer=0 means HZ timer is disabled. hz_timer=1 means HZ
+ timer is active.
+
+config NO_IDLE_HZ_INIT
+ bool "HZ timer in idle off by default"
+ depends on NO_IDLE_HZ
+ help
+ The HZ timer is switched off in idle by default. That means the
+ HZ timer is already disabled at boot time.
+
endmenu
config PCMCIA
diff -urN linux-2.6/arch/s390/defconfig linux-2.6-s390/arch/s390/defconfig
--- linux-2.6/arch/s390/defconfig Wed Apr 21 20:25:33 2004
+++ linux-2.6-s390/arch/s390/defconfig Wed Apr 21 20:25:58 2004
@@ -83,6 +83,7 @@
# CONFIG_SHARED_KERNEL is not set
# CONFIG_CMM is not set
# CONFIG_VIRT_TIMER is not set
+# CONFIG_NO_IDLE_HZ is not set
# CONFIG_PCMCIA is not set
#
diff -urN linux-2.6/arch/s390/kernel/process.c linux-2.6-s390/arch/s390/kernel/process.c
--- linux-2.6/arch/s390/kernel/process.c Wed Apr 21 20:25:05 2004
+++ linux-2.6-s390/arch/s390/kernel/process.c Wed Apr 21 20:25:33 2004
@@ -40,7 +40,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/irq.h>
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
#include <asm/timer.h>
#endif
@@ -75,17 +75,21 @@
psw_t wait_psw;
unsigned long reg;
+ local_irq_disable();
if (need_resched()) {
+ local_irq_enable();
schedule();
return;
}
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
/*
* hook to stop timers that should not tick while CPU is idle
*/
- if (stop_timers())
+ if (stop_timers()) {
+ local_irq_enable();
return;
+ }
#endif
/*
diff -urN linux-2.6/arch/s390/kernel/time.c linux-2.6-s390/arch/s390/kernel/time.c
--- linux-2.6/arch/s390/kernel/time.c Wed Apr 21 20:25:32 2004
+++ linux-2.6-s390/arch/s390/kernel/time.c Wed Apr 21 20:25:33 2004
@@ -331,29 +331,6 @@
return 0;
}
-void do_monitor_call(struct pt_regs *regs, long interruption_code)
-{
- /* disable monitor call class 0 */
- __ctl_clear_bit(8, 15);
-
- start_cpu_timer();
-}
-
-/*
- * called from cpu_idle to stop any timers
- * returns 1 if CPU should not be stopped
- */
-int stop_timers(void)
-{
- if (stop_cpu_timer())
- return 1;
-
- /* enable monitor call class 0 */
- __ctl_set_bit(8, 15);
-
- return 0;
-}
-
void set_vtimer(__u64 expires)
{
asm volatile ("SPT %0" : : "m" (expires));
@@ -474,6 +451,141 @@
}
#endif
+#ifdef CONFIG_NO_IDLE_HZ
+
+cpumask_t idle_cpu_mask = CPU_MASK_NONE;
+
+#ifdef CONFIG_NO_IDLE_HZ_INIT
+int sysctl_hz_timer = 0;
+#else
+int sysctl_hz_timer = 1;
+#endif
+
+/*
+ * Start the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+void start_hz_timer(struct pt_regs *regs)
+{
+ __u64 tmp;
+ __u32 ticks;
+
+ if (!cpu_isset(smp_processor_id(), idle_cpu_mask))
+ return;
+
+ /* Calculate how many ticks have passed */
+ asm volatile ("STCK 0(%0)" : : "a" (&tmp) : "memory", "cc");
+ tmp = tmp + CLK_TICKS_PER_JIFFY - S390_lowcore.jiffy_timer;
+ ticks = __calculate_ticks(tmp);
+ S390_lowcore.jiffy_timer += CLK_TICKS_PER_JIFFY * (__u64) ticks;
+
+ /* Set the clock comparator to the next tick. */
+ tmp = S390_lowcore.jiffy_timer + CPU_DEVIATION;
+ asm volatile ("SCKC %0" : : "m" (tmp));
+
+ /* Charge the ticks. */
+ if (ticks > 0) {
+#ifdef CONFIG_SMP
+ /*
+ * Do not rely on the boot cpu to do the calls to do_timer.
+ * Spread it over all cpus instead.
+ */
+ write_seqlock(&xtime_lock);
+ if (S390_lowcore.jiffy_timer > xtime_cc) {
+ __u32 xticks;
+
+ tmp = S390_lowcore.jiffy_timer - xtime_cc;
+ if (tmp >= 2*CLK_TICKS_PER_JIFFY) {
+ xticks = __calculate_ticks(tmp);
+ xtime_cc += (__u64) xticks*CLK_TICKS_PER_JIFFY;
+ } else {
+ xticks = 1;
+ xtime_cc += CLK_TICKS_PER_JIFFY;
+ }
+ while (xticks--)
+ do_timer(regs);
+ }
+ write_sequnlock(&xtime_lock);
+ while (ticks--)
+ update_process_times(user_mode(regs));
+#else
+ while (ticks--)
+ do_timer(regs);
+#endif
+ }
+ cpu_clear(smp_processor_id(), idle_cpu_mask);
+}
+
+/*
+ * Stop the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+int stop_hz_timer(void)
+{
+ __u64 timer;
+
+ if (sysctl_hz_timer != 0)
+ return 1;
+
+ /*
+ * Leave the clock comparator set up for the next timer
+ * tick if either rcu or a softirq is pending.
+ */
+ if (rcu_pending(smp_processor_id()) || local_softirq_pending())
+ return 1;
+
+ /*
+ * This cpu is going really idle. Set up the clock comparator
+ * for the next event.
+ */
+ cpu_set(smp_processor_id(), idle_cpu_mask);
+ timer = (__u64) (next_timer_interrupt() - jiffies) + jiffies_64;
+ timer = jiffies_timer_cc + timer * CLK_TICKS_PER_JIFFY;
+ asm volatile ("SCKC %0" : : "m" (timer));
+
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
+
+void do_monitor_call(struct pt_regs *regs, long interruption_code)
+{
+ /* disable monitor call class 0 */
+ __ctl_clear_bit(8, 15);
+
+#ifdef CONFIG_VIRT_TIMER
+ start_cpu_timer();
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+ start_hz_timer(regs);
+#endif
+}
+
+/*
+ * called from cpu_idle to stop any timers
+ * returns 1 if CPU should not be stopped
+ */
+int stop_timers(void)
+{
+#ifdef CONFIG_VIRT_TIMER
+ if (stop_cpu_timer())
+ return 1;
+#endif
+
+#ifdef CONFIG_NO_IDLE_HZ
+ if (stop_hz_timer())
+ return 1;
+#endif
+
+ /* enable monitor call class 0 */
+ __ctl_set_bit(8, 15);
+
+ return 0;
+}
+
+#endif
+
/*
* Start the clock comparator and the virtual CPU timer
* on the current CPU.
diff -urN linux-2.6/arch/s390/kernel/traps.c linux-2.6-s390/arch/s390/kernel/traps.c
--- linux-2.6/arch/s390/kernel/traps.c Sun Apr 4 05:36:55 2004
+++ linux-2.6-s390/arch/s390/kernel/traps.c Wed Apr 21 20:25:33 2004
@@ -64,7 +64,7 @@
extern void pfault_interrupt(struct pt_regs *regs, __u16 error_code);
static ext_int_info_t ext_int_pfault;
#endif
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_VIRT_TIMER)
extern pgm_check_handler_t do_monitor_call;
#endif
@@ -631,7 +631,7 @@
#endif /* CONFIG_ARCH_S390X */
pgm_check_table[0x15] = &operand_exception;
pgm_check_table[0x1C] = &privileged_op;
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
pgm_check_table[0x40] = &do_monitor_call;
#endif
if (MACHINE_IS_VM) {
diff -urN linux-2.6/include/asm-s390/cpumask.h linux-2.6-s390/include/asm-s390/cpumask.h
--- linux-2.6/include/asm-s390/cpumask.h Sun Apr 4 05:38:13 2004
+++ linux-2.6-s390/include/asm-s390/cpumask.h Wed Apr 21 20:23:48 2004
@@ -3,4 +3,7 @@
#include <asm-generic/cpumask.h>
+#define __ARCH_HAS_IDLE_CPU_MASK
+extern cpumask_t idle_cpu_mask;
+
#endif /* _ASM_S390_CPUMASK_H */
diff -urN linux-2.6/include/linux/sysctl.h linux-2.6-s390/include/linux/sysctl.h
--- linux-2.6/include/linux/sysctl.h Wed Apr 21 20:25:09 2004
+++ linux-2.6-s390/include/linux/sysctl.h Wed Apr 21 20:25:33 2004
@@ -132,6 +132,7 @@
KERN_PTY=62, /* dir: pty driver */
KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */
KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
+ KERN_HZ_TIMER=65, /* int: hz timer on or off */
};
diff -urN linux-2.6/include/linux/timer.h linux-2.6-s390/include/linux/timer.h
--- linux-2.6/include/linux/timer.h Sun Apr 4 05:37:37 2004
+++ linux-2.6-s390/include/linux/timer.h Wed Apr 21 20:25:33 2004
@@ -65,6 +65,8 @@
extern int __mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
+extern unsigned long next_timer_interrupt(void);
+
/***
* add_timer - start a timer
* @timer: the timer to be added
diff -urN linux-2.6/kernel/rcupdate.c linux-2.6-s390/kernel/rcupdate.c
--- linux-2.6/kernel/rcupdate.c Wed Apr 21 20:25:10 2004
+++ linux-2.6-s390/kernel/rcupdate.c Wed Apr 21 20:25:33 2004
@@ -96,6 +96,10 @@
}
}
+#ifndef __ARCH_HAS_IDLE_CPU_MASK
+#define idle_cpu_mask CPU_MASK_NONE
+#endif
+
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
@@ -111,7 +115,10 @@
return;
}
/* Can't change, since spin lock held. */
- rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
+ rcu_ctrlblk.rcu_cpu_mask = idle_cpu_mask;
+ cpus_complement(rcu_ctrlblk.rcu_cpu_mask);
+ cpus_and(rcu_ctrlblk.rcu_cpu_mask, cpu_online_map,
+ rcu_ctrlblk.rcu_cpu_mask);
}
/*
diff -urN linux-2.6/kernel/sysctl.c linux-2.6-s390/kernel/sysctl.c
--- linux-2.6/kernel/sysctl.c Wed Apr 21 20:25:10 2004
+++ linux-2.6-s390/kernel/sysctl.c Wed Apr 21 20:25:33 2004
@@ -108,6 +108,8 @@
extern int sysctl_userprocess_debug;
#endif
+extern int sysctl_hz_timer;
+
#if defined(CONFIG_PPC32) && defined(CONFIG_6xx)
extern unsigned long powersave_nap;
int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
@@ -573,6 +575,16 @@
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+ {
+ .ctl_name = KERN_HZ_TIMER,
+ .procname = "hz_timer",
+ .data = &sysctl_hz_timer,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#endif
{
.ctl_name = KERN_S390_USER_DEBUG_LOGGING,
diff -urN linux-2.6/kernel/timer.c linux-2.6-s390/kernel/timer.c
--- linux-2.6/kernel/timer.c Wed Apr 21 20:25:10 2004
+++ linux-2.6-s390/kernel/timer.c Wed Apr 21 20:25:33 2004
@@ -428,6 +428,75 @@
spin_unlock_irq(&base->lock);
}
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Find out when the next timer event is due to happen. This
+ * is used on S/390 to stop all activity when a cpu is idle.
+ * This function needs to be called with interrupts disabled.
+ */
+unsigned long next_timer_interrupt(void)
+{
+ tvec_base_t *base;
+ struct list_head *list;
+ struct timer_list *nte;
+ unsigned long expires;
+ tvec_t *varray[4];
+ int i, j;
+
+ base = &__get_cpu_var(tvec_bases);
+ spin_lock(&base->lock);
+ expires = base->timer_jiffies + (LONG_MAX >> 1);
+ list = 0;
+
+ /* Look for timer events in tv1. */
+ j = base->timer_jiffies & TVR_MASK;
+ do {
+ list_for_each_entry(nte, base->tv1.vec + j, entry) {
+ expires = nte->expires;
+ if (j < (base->timer_jiffies & TVR_MASK))
+ list = base->tv2.vec + (INDEX(0));
+ goto found;
+ }
+ j = (j + 1) & TVR_MASK;
+ } while (j != (base->timer_jiffies & TVR_MASK));
+
+ /* Check tv2-tv5. */
+ varray[0] = &base->tv2;
+ varray[1] = &base->tv3;
+ varray[2] = &base->tv4;
+ varray[3] = &base->tv5;
+ for (i = 0; i < 4; i++) {
+ j = INDEX(i);
+ do {
+ if (list_empty(varray[i]->vec + j)) {
+ j = (j + 1) & TVN_MASK;
+ continue;
+ }
+ list_for_each_entry(nte, varray[i]->vec + j, entry)
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ if (j < (INDEX(i)) && i < 3)
+ list = varray[i + 1]->vec + (INDEX(i + 1));
+ goto found;
+ } while (j != (INDEX(i)));
+ }
+found:
+ if (list) {
+ /*
+ * The search wrapped. We need to look at the next list
+ * from next tv element that would cascade into tv element
+ * where we found the timer element.
+ */
+ list_for_each_entry(nte, list, entry) {
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ }
+ }
+ spin_unlock(&base->lock);
+ return expires;
+}
+#endif
+
/******************************************************************/
/*
On Thu, Apr 22, 2004 at 02:13:04AM +0530, Dipankar Sarma wrote:
> I think CPU_MASK_NONE can be used only for assignments. You need
> to actually declare a generic idle_cpu_mask and set it to CPU_MASK_NONE
> for all other archs. Of course, then the compiler will not be able
> to optimize it out :)
Well, there's a const keyword in C these days, no?
On Wed, Apr 21, 2004 at 06:37:29PM +0200, Martin Schwidefsky wrote:
> > > This would mean that all other arches need to do the above three
> > > statements in rcu_start_batch. If this is acceptable we certainly
> > > can introduce a global idle_cpu_mask. Where? sched.c?
> >
> > My hope was gcc would actually optimize it away if it was a CPP constant
> > instead of a variable.
>
> Now I got it. You want to introduce a generic idle_cpu_mask which is a
> #define to CPU_MASK_NONE and only an exploiter would use a real variable.
> This is just a matter of test. I'll give it a try.
I think CPU_MASK_NONE can be used only for assignments. You need
to actually declare a generic idle_cpu_mask and set it to CPU_MASK_NONE
for all other archs. Of course, then the compiler will not be able
to optimize it out :)
Thanks
Dipankar
On Wed, Apr 21, 2004 at 09:46:05PM +0100, Christoph Hellwig wrote:
> On Thu, Apr 22, 2004 at 02:13:04AM +0530, Dipankar Sarma wrote:
> > I think CPU_MASK_NONE can be used only for assignments. You need
> > to actually declare a generic idle_cpu_mask and set it to CPU_MASK_NONE
> > for all other archs. Of course, then the compiler will not be able
> > to optimize it out :)
>
> Well, there's a const keyword in C these days, no?
OK, then I missed what optimization you were talking about or underestimated
gcc. Can gcc do inter-procedural constant propagation ?
Thanks
Dipankar
On Wed, Apr 21, 2004 at 08:52:06PM +0200, Martin Schwidefsky wrote:
> diff -urN linux-2.6/kernel/rcupdate.c linux-2.6-s390/kernel/rcupdate.c
> --- linux-2.6/kernel/rcupdate.c Wed Apr 21 20:25:10 2004
> +++ linux-2.6-s390/kernel/rcupdate.c Wed Apr 21 20:25:33 2004
> @@ -96,6 +96,10 @@
> }
> }
>
> +#ifndef __ARCH_HAS_IDLE_CPU_MASK
> +#define idle_cpu_mask CPU_MASK_NONE
> +#endif
> +
> /*
> * Register a new batch of callbacks, and start it up if there is currently no
> * active batch and the batch to be registered has not already occurred.
> @@ -111,7 +115,10 @@
> return;
> }
> /* Can't change, since spin lock held. */
> - rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
> + rcu_ctrlblk.rcu_cpu_mask = idle_cpu_mask;
> + cpus_complement(rcu_ctrlblk.rcu_cpu_mask);
> + cpus_and(rcu_ctrlblk.rcu_cpu_mask, cpu_online_map,
> + rcu_ctrlblk.rcu_cpu_mask);
> }
Defining idle_cpu_mask in the middle of RCU code is really not a good idea.
A cleaner solution would be to define idle_cpu_mask in sched.c
and initialize it to CPU_MASK_NONE there. You could put it in
sched.h, but then there is the likelihood of people using
idle_cpu_mask for things other than initialization in which
case NR_CPUS > 64 compilation will fail.
Thanks
Dipankar
Dipankar Sarma <[email protected]> writes:
> On Wed, Apr 21, 2004 at 09:46:05PM +0100, Christoph Hellwig wrote:
>> On Thu, Apr 22, 2004 at 02:13:04AM +0530, Dipankar Sarma wrote:
>> > I think CPU_MASK_NONE can be used only for assignments. You need
>> > to actually declare a generic idle_cpu_mask and set it to CPU_MASK_NONE
>> > for all other archs. Of course, then the compiler will not be able
>> > to optimize it out :)
>>
>> Well, there's a const keyword in C these days, no?
It is not strong enough in C unfortunately. It is still legal
to change const variables, so the compiler has to take that into
account and it is hard for global variables. C++ is better here.
> OK, then I missed what optimization you were talking about or underestimated
> gcc. Can gcc do inter-procedural constant propagation ?
Only when the functions are inlined
(but it is much better at that than it used to be, gcc 3.4 can even
inline across multiple files and order doesn't matter anymore)
-Andi
On Wed, 2004-04-21 at 16:49, Martin Schwidefsky wrote:
> [PATCH] s390: no timer interrupts in idle.
>
> From: Martin Schwidefsky <[email protected]>
>
> This patch add a system control that allows to switch off the jiffies
> timer interrupts while a cpu sleeps in idle. This is useful for a system
> running with virtual cpus under z/VM.
Is this generally useful, e.g. can all architectures use the
infrastructure you propose? I seriously hope so; s390 isn't the only
one who would benefit, I'd love to see a generic thing for this.
On Thu, Apr 22, 2004 at 10:48:11AM +0200, Martin Schwidefsky wrote:
>
>
>
>
> > is this generally useful, eg can all architectures use the
> > infrastructure you propose ? I seriously hope so; s390 isn't the only
> > one who would benefit, I'd love to see a generic thing for this.
>
> It is. All you have to do is to rework the timer functions for the
> architecture you want to support. This can be quite complicated
> though. There are some subtle races if you want to switch of the
> 100 HZ timer for a cpu. We had some problem with cpus that didn't
> want to wake up anymore ...
Well, my worry is the API: should it be "turn the timer off" or should it be
"the next tick is THIS many from now"? The latter allows one to use the hw in
one-shot mode (PCs can do this) where the scheduler timeslice expiry ends
up being a timer as well.
> Defining idle_cpu_mask in the middle of RCU code is really not a good
> idea.
> A cleaner solution would be to define idle_cpu_mask in sched.c
> and initialize it to CPU_MASK_NONE there. You could put it in
> sched.h, but then there is the likelyhood of people using
> idle_cpu_mask for things other than initialization in which
> case NR_CPUS > 64 compilation will fail.
Yes, the best solution would be to introduce idle_cpu_mask for all
architectures and to add the cpu_set/cpu_clear calls to the timer
code of these architectures. The problem is that it isn't easy to
find the correct place for the cpu_clear call. If a cpu gets woken
by an irq the cpu_clear has to be done first and then the irq
function may be executed. On s390 we use the monitor call (mc)
instruction to reenable the HZ timer and to clear the cpu bit in
idle_cpu_mask. Dunno if there is something similar for other
architectures.
blue skies,
Martin
> well my worry is the API; should it be "turn the timer off" or should it be
> "the next tick is THIS many from now". The later allows one to use the hw in
> one-shot mode (PC's can do this) where the scheduler timeslice expiry ends
> up being a timer as well.
The sysctl is /proc/sys/kernel/hz_timer. If it contains a "1" then the HZ timer
is active in idle; a "0" indicates that the HZ timer is switched off.
The semantic "the next tick is THIS many <ticks> from now" IMHO doesn't make
any sense. Skipping ticks will have some really bad effects, e.g. if the xtime
isn't up-to-date network packets will get incorrect time stamps, timer events
will be delivered too late, etc.
blue skies,
Martin
On Thu, Apr 22, 2004 at 12:53:15PM +0200, Martin Schwidefsky wrote:
> active in idle, a "0" indicates that the HZ timer is switchted off.
> The semantic "the next tick is THIS many <ticks> from now" IHMO doesn't make
> any sense. Skipping ticks will have some really bad effects, e.g. if the xtime
> isn't up-to-date network packets will get incorrect time stamps, timer events
> will be delivered too late, etc.
why? Most hardware has an alternative time source for such time stamps.
Timer events *won't* be delivered too late, simply *because* the timer said
"don't bother checking back for X amount of time", so when that time has
expired (eg the delay) then the timer comparison tells the kernel "ok this
one is due now".
> why? Most hardware have an alternative time source for such time stamps.
> Timer events *won't* be delivered too late, simply *because* the timer said
> "don't bother checking back for X amount of time", so when that time has
> expired (eg the delay) then the timer comparison tells the kernel "ok this
> one is due now".
Huh? You lost me there. The HZ timer is the interrupt that will trigger the
execution of a timer event. If you say "don't bother checking back for X
amount of time" the cpu stays in idle doing nothing. You won't get control
to execute the timer event.
blue skies,
Martin
On Thu, Apr 22, 2004 at 01:09:22PM +0200, Martin Schwidefsky wrote:
>
>
>
>
> > why? Most hardware have an alternative time source for such time stamps.
> > Timer events *won't* be delivered too late, simply *because* the timer said
> > "don't bother checking back for X amount of time", so when that time has
> > expired (eg the delay) then the timer comparison tells the kernel "ok this
> > one is due now".
>
> Huh? You lost me there. The HZ timer is the interrupt that will trigger the
> execution of a timer event. If you say "don't bother checking back for X
> amount of time" the cpu stays in idle doing nothing. You won't get control
> to execute the timer event.
ok maybe I need to word it differently.
What I'm proposing as alternative is using the one shot mode of the timers
on most machines to do the following:
when the timer irq hits, you do the business you need to do. And then you
check all existing timers and the scheduler when the next "virtual tick" is where
you're going to do real work. You then set the one-shot counter to that
amount. This means that in add_timer/mod_timer you will need to check if the
just added timer is before the current one-shot runs out, and if so, adjust
it. Perhaps in the scheduler too.
You *do* get back control to do the timer event, due to the one-shot timer
firing at just the right time. And this works regardless of the cpu being
idle or not, so it also wins back that 1% performance cost to HPC that
HZ=1000 has etc etc because you just don't do irq's until the HPC task runs
out of its timeslice.
You can go even a step further and introduce another request_irq flag that
makes the irq subsystem treat such marked irqs as if they were timer irqs,
eg run the timerlist and then set a new mark. That way you may even be able
to do the timers before the actual timer IRQ hits and thus avoiding it (this
does mean setting the timer IRQ to somewhat after the optimal time in order
to have a window for this to happen).
> is this generally useful, eg can all architectures use the
> infrastructure you propose ? I seriously hope so; s390 isn't the only
> one who would benefit, I'd love to see a generic thing for this.
It is. All you have to do is to rework the timer functions for the
architecture you want to support. This can be quite complicated
though. There are some subtle races if you want to switch off the
100 HZ timer for a cpu. We had some problem with cpus that didn't
want to wake up anymore ...
blue skies,
Martin
> ok maybe I need to word it differently.
> What I'm proposing as alternative is using the one shot mode of the timers
> on most machines to do the following:
> when the timer irq hits, you do the business you need to do. And then you
> check all existing timers and the scheduler when the next "virtual tick" is where
> you're going to do real work. You then set the one-shot counter to that
> amount. This means that in add_timer/mod_timer you will need to check if the
> just added timer is before the current one-shot runs out, and if so, adjust
> it. Perhaps in the scheduler too.
You can't do that with the current timer code. A HZ timer interrupt is used
for several things: 1) increase jiffies_64, 2) update the xtime, 3) calculate
the load every 5 seconds, 4) check cpu time limits and send SIGXCPU,
5) do interval timer stuff, 6) run local timer queue and 7) add time slice to
current process. With your one-shot timer you won't do the correct updates
to the jiffies and the xtime.
blue skies,
Martin
On Thu, Apr 22, 2004 at 02:14:44PM +0200, Martin Schwidefsky wrote:
> > What I'm proposing as alternative is using the one shot mode of the timers
> > on most machines to do the following:
> > when the timer irq hits, you do the business you need to do. And then you
> > check all existing timers and the scheduler when the next "virtual tick" is where
> > you're going to do real work. You then set the one-shot counter to that
> > amount. This means that in add_timer/mod_timer you will need to check if the
> > just added timer is before the current one-shot runs out, and if so, adjust
> > it. Perhaps in the scheduler too.
>
> You can't do that with the current timer code. A HZ timer interrupt is used
> for several things: 1) increase jiffies_64, 2) update the xtime, 3) calculate
> the load every 5 seconds, 4) check cpu time limits and send SIGXCPU,
> 5) do interval timer stuff, 6) run local timer queue and 7) add time slice to
> current process. With your one-shot timer you won't do the correct updates
> to the jiffies and the xtime.
xtime is easy, that's interpolated anyway afaics. Jiffies would either just
jump some, which code needs to deal with anyway given that preempt can do
the same, or would become an approximated thing as well based on the other
time keeping sources in the system.
calculating the load can be a real timer for sure (which would cause an irq
at that time), cpu limits we can do at the end of timeslice (and set the
timeslice such that the limits won't be exceeded).
> xtime is easy, that's interpolated anyway afaics. Jiffies would either just
> jump some, which code needs to deal with anyway given that preempt can do
> the same, or would become an approximated thing as well based on the other
> time keeping sources in the system.
Unluckily no. xtime is not easy because the network stack uses this for
time stamps at several locations. Living in the past and time stamps for
network packets don't go together, do they?
> calculating the load can be a real timer for sure (which would cause an irq
> at that time), cpu limits we can do at the end of timeslice (and set the
> timeslice such that the limits won't be exceeded).
Nod, the load could easily be done with an add_timer and we can live with a
small inaccuracy as far as the cpu limits are concerned.
By the way I am planning to do a BOF at the OLS in July where I'd like
to discuss exactly this kind of question. Any chance that you'd be there
too?
blue skies,
Martin
On Thu, Apr 22, 2004 at 02:44:17PM +0200, Martin Schwidefsky wrote:
>
>
>
>
> > xtime is easy, that's interpolated anyway afaics. Jiffies would either just
> > jump some, which code needs to deal with anyway given that preempt can do
> > the same, or would become an approximated thing as well based on the other
> > time keeping sources in the system.
>
> Unluckily no. xtime is not easy because the network stack uses this for
> time stamps at several locations. Living in the past and time stamps for
> network packets don't go together, do they?
I thought this got fixed last week. But as I said it's easy to interpolate.
> By the way I am planning to do a BOFS at the OLS in july where I'd like
> to discuss exactly this kind of questions. Any chance that you'd be there
> too?
quite possible, I haven't booked my flight yet tho
Hi Dipankar,
here is my newest version of the timer patch. I picked up your
suggestion to add idle_cpu_mask to sched.c. Anything else ?
blue skies,
Martin.
---
[PATCH] s390: no timer interrupts in idle.
From: Martin Schwidefsky <[email protected]>
This patch add a system control that allows to switch off the jiffies
timer interrupts while a cpu sleeps in idle. This is useful for a system
running with virtual cpus under z/VM.
diffstat:
arch/s390/Kconfig | 19 +++++
arch/s390/defconfig | 1
arch/s390/kernel/process.c | 10 ++
arch/s390/kernel/time.c | 156 ++++++++++++++++++++++++++++++++++++++-------
arch/s390/kernel/traps.c | 4 -
include/linux/sched.h | 2
include/linux/sysctl.h | 1
include/linux/timer.h | 2
kernel/rcupdate.c | 6 +
kernel/sched.c | 9 ++
kernel/sysctl.c | 12 +++
kernel/timer.c | 69 +++++++++++++++++++
12 files changed, 262 insertions(+), 29 deletions(-)
diff -urN linux-2.6/arch/s390/Kconfig linux-2.6-s390/arch/s390/Kconfig
--- linux-2.6/arch/s390/Kconfig Thu Apr 22 15:54:53 2004
+++ linux-2.6-s390/arch/s390/Kconfig Thu Apr 22 15:54:54 2004
@@ -333,6 +333,25 @@
This can also be compiled as a module, which will be called
appldata_net_sum.o.
+config NO_IDLE_HZ
+ bool "No HZ timer ticks in idle"
+ help
+ Switches the regular HZ timer off when the system is going idle.
+ This helps z/VM to detect that the Linux system is idle. VM can
+ then "swap-out" this guest which reduces memory usage. It also
+ reduces the overhead of idle systems.
+
+ The HZ timer can be switched on/off via /proc/sys/kernel/hz_timer.
+ hz_timer=0 means HZ timer is disabled. hz_timer=1 means HZ
+ timer is active.
+
+config NO_IDLE_HZ_INIT
+ bool "HZ timer in idle off by default"
+ depends on NO_IDLE_HZ
+ help
+ The HZ timer is switched off in idle by default. That means the
+ HZ timer is already disabled at boot time.
+
endmenu
config PCMCIA
diff -urN linux-2.6/arch/s390/defconfig linux-2.6-s390/arch/s390/defconfig
--- linux-2.6/arch/s390/defconfig Thu Apr 22 15:54:54 2004
+++ linux-2.6-s390/arch/s390/defconfig Thu Apr 22 15:54:54 2004
@@ -83,6 +83,7 @@
# CONFIG_SHARED_KERNEL is not set
# CONFIG_CMM is not set
# CONFIG_VIRT_TIMER is not set
+# CONFIG_NO_IDLE_HZ is not set
# CONFIG_PCMCIA is not set
#
diff -urN linux-2.6/arch/s390/kernel/process.c linux-2.6-s390/arch/s390/kernel/process.c
--- linux-2.6/arch/s390/kernel/process.c Thu Apr 22 15:54:27 2004
+++ linux-2.6-s390/arch/s390/kernel/process.c Thu Apr 22 15:54:54 2004
@@ -40,7 +40,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/irq.h>
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
#include <asm/timer.h>
#endif
@@ -75,17 +75,21 @@
psw_t wait_psw;
unsigned long reg;
+ local_irq_disable();
if (need_resched()) {
+ local_irq_enable();
schedule();
return;
}
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
/*
* hook to stop timers that should not tick while CPU is idle
*/
- if (stop_timers())
+ if (stop_timers()) {
+ local_irq_enable();
return;
+ }
#endif
/*
diff -urN linux-2.6/arch/s390/kernel/time.c linux-2.6-s390/arch/s390/kernel/time.c
--- linux-2.6/arch/s390/kernel/time.c Thu Apr 22 15:54:53 2004
+++ linux-2.6-s390/arch/s390/kernel/time.c Thu Apr 22 15:54:54 2004
@@ -331,29 +331,6 @@
return 0;
}
-void do_monitor_call(struct pt_regs *regs, long interruption_code)
-{
- /* disable monitor call class 0 */
- __ctl_clear_bit(8, 15);
-
- start_cpu_timer();
-}
-
-/*
- * called from cpu_idle to stop any timers
- * returns 1 if CPU should not be stopped
- */
-int stop_timers(void)
-{
- if (stop_cpu_timer())
- return 1;
-
- /* enable monitor call class 0 */
- __ctl_set_bit(8, 15);
-
- return 0;
-}
-
void set_vtimer(__u64 expires)
{
asm volatile ("SPT %0" : : "m" (expires));
@@ -474,6 +451,139 @@
}
#endif
+#ifdef CONFIG_NO_IDLE_HZ
+
+#ifdef CONFIG_NO_IDLE_HZ_INIT
+int sysctl_hz_timer = 0;
+#else
+int sysctl_hz_timer = 1;
+#endif
+
+/*
+ * Start the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+void start_hz_timer(struct pt_regs *regs)
+{
+ __u64 tmp;
+ __u32 ticks;
+
+ if (!cpu_isset(smp_processor_id(), idle_cpu_mask))
+ return;
+
+ /* Calculate how many ticks have passed */
+ asm volatile ("STCK 0(%0)" : : "a" (&tmp) : "memory", "cc");
+ tmp = tmp + CLK_TICKS_PER_JIFFY - S390_lowcore.jiffy_timer;
+ ticks = __calculate_ticks(tmp);
+ S390_lowcore.jiffy_timer += CLK_TICKS_PER_JIFFY * (__u64) ticks;
+
+ /* Set the clock comparator to the next tick. */
+ tmp = S390_lowcore.jiffy_timer + CPU_DEVIATION;
+ asm volatile ("SCKC %0" : : "m" (tmp));
+
+ /* Charge the ticks. */
+ if (ticks > 0) {
+#ifdef CONFIG_SMP
+ /*
+ * Do not rely on the boot cpu to do the calls to do_timer.
+ * Spread it over all cpus instead.
+ */
+ write_seqlock(&xtime_lock);
+ if (S390_lowcore.jiffy_timer > xtime_cc) {
+ __u32 xticks;
+
+ tmp = S390_lowcore.jiffy_timer - xtime_cc;
+ if (tmp >= 2*CLK_TICKS_PER_JIFFY) {
+ xticks = __calculate_ticks(tmp);
+ xtime_cc += (__u64) xticks*CLK_TICKS_PER_JIFFY;
+ } else {
+ xticks = 1;
+ xtime_cc += CLK_TICKS_PER_JIFFY;
+ }
+ while (xticks--)
+ do_timer(regs);
+ }
+ write_sequnlock(&xtime_lock);
+ while (ticks--)
+ update_process_times(user_mode(regs));
+#else
+ while (ticks--)
+ do_timer(regs);
+#endif
+ }
+ cpu_clear(smp_processor_id(), idle_cpu_mask);
+}
+
+/*
+ * Stop the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+int stop_hz_timer(void)
+{
+ __u64 timer;
+
+ if (sysctl_hz_timer != 0)
+ return 1;
+
+ /*
+ * Leave the clock comparator set up for the next timer
+ * tick if either rcu or a softirq is pending.
+ */
+ if (rcu_pending(smp_processor_id()) || local_softirq_pending())
+ return 1;
+
+ /*
+ * This cpu is going really idle. Set up the clock comparator
+ * for the next event.
+ */
+ cpu_set(smp_processor_id(), idle_cpu_mask);
+ timer = (__u64) (next_timer_interrupt() - jiffies) + jiffies_64;
+ timer = jiffies_timer_cc + timer * CLK_TICKS_PER_JIFFY;
+ asm volatile ("SCKC %0" : : "m" (timer));
+
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
+
+void do_monitor_call(struct pt_regs *regs, long interruption_code)
+{
+ /* disable monitor call class 0 */
+ __ctl_clear_bit(8, 15);
+
+#ifdef CONFIG_VIRT_TIMER
+ start_cpu_timer();
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+ start_hz_timer(regs);
+#endif
+}
+
+/*
+ * called from cpu_idle to stop any timers
+ * returns 1 if CPU should not be stopped
+ */
+int stop_timers(void)
+{
+#ifdef CONFIG_VIRT_TIMER
+ if (stop_cpu_timer())
+ return 1;
+#endif
+
+#ifdef CONFIG_NO_IDLE_HZ
+ if (stop_hz_timer())
+ return 1;
+#endif
+
+ /* enable monitor call class 0 */
+ __ctl_set_bit(8, 15);
+
+ return 0;
+}
+
+#endif
+
/*
* Start the clock comparator and the virtual CPU timer
* on the current CPU.
diff -urN linux-2.6/arch/s390/kernel/traps.c linux-2.6-s390/arch/s390/kernel/traps.c
--- linux-2.6/arch/s390/kernel/traps.c Sun Apr 4 05:36:55 2004
+++ linux-2.6-s390/arch/s390/kernel/traps.c Thu Apr 22 15:54:54 2004
@@ -64,7 +64,7 @@
extern void pfault_interrupt(struct pt_regs *regs, __u16 error_code);
static ext_int_info_t ext_int_pfault;
#endif
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_VIRT_TIMER)
extern pgm_check_handler_t do_monitor_call;
#endif
@@ -631,7 +631,7 @@
#endif /* CONFIG_ARCH_S390X */
pgm_check_table[0x15] = &operand_exception;
pgm_check_table[0x1C] = &privileged_op;
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
pgm_check_table[0x40] = &do_monitor_call;
#endif
if (MACHINE_IS_VM) {
diff -urN linux-2.6/include/linux/sched.h linux-2.6-s390/include/linux/sched.h
--- linux-2.6/include/linux/sched.h Thu Apr 22 15:54:34 2004
+++ linux-2.6-s390/include/linux/sched.h Thu Apr 22 15:54:54 2004
@@ -149,6 +149,8 @@
extern void sched_init(void);
extern void init_idle(task_t *idle, int cpu);
+extern cpumask_t idle_cpu_mask;
+
extern void show_state(void);
extern void show_regs(struct pt_regs *);
extern void show_trace_task(task_t *tsk);
diff -urN linux-2.6/include/linux/sysctl.h linux-2.6-s390/include/linux/sysctl.h
--- linux-2.6/include/linux/sysctl.h Thu Apr 22 15:54:34 2004
+++ linux-2.6-s390/include/linux/sysctl.h Thu Apr 22 15:54:54 2004
@@ -132,6 +132,7 @@
KERN_PTY=62, /* dir: pty driver */
KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */
KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
+ KERN_HZ_TIMER=65, /* int: hz timer on or off */
};
diff -urN linux-2.6/include/linux/timer.h linux-2.6-s390/include/linux/timer.h
--- linux-2.6/include/linux/timer.h Sun Apr 4 05:37:37 2004
+++ linux-2.6-s390/include/linux/timer.h Thu Apr 22 15:54:54 2004
@@ -65,6 +65,8 @@
extern int __mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
+extern unsigned long next_timer_interrupt(void);
+
/***
* add_timer - start a timer
* @timer: the timer to be added
diff -urN linux-2.6/kernel/rcupdate.c linux-2.6-s390/kernel/rcupdate.c
--- linux-2.6/kernel/rcupdate.c Thu Apr 22 15:54:34 2004
+++ linux-2.6-s390/kernel/rcupdate.c Thu Apr 22 15:54:54 2004
@@ -103,6 +103,8 @@
*/
static void rcu_start_batch(long newbatch)
{
+ cpumask_t active;
+
if (rcu_batch_before(rcu_ctrlblk.maxbatch, newbatch)) {
rcu_ctrlblk.maxbatch = newbatch;
}
@@ -111,7 +113,9 @@
return;
}
/* Can't change, since spin lock held. */
- rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
+ active = idle_cpu_mask;
+ cpus_complement(active);
+ cpus_and(rcu_ctrlblk.rcu_cpu_mask, cpu_online_map, active);
}
/*
diff -urN linux-2.6/kernel/sched.c linux-2.6-s390/kernel/sched.c
--- linux-2.6/kernel/sched.c Thu Apr 22 15:54:34 2004
+++ linux-2.6-s390/kernel/sched.c Thu Apr 22 15:54:54 2004
@@ -2684,6 +2684,15 @@
#endif
}
+/*
+ * In a system that switches off the HZ timer idle_cpu_mask
+ * indicates which cpus entered this state. This is used
+ * in the rcu update to wait only for active cpus. For system
+ * which do not switch off the HZ timer idle_cpu_mask should
+ * always be CPU_MASK_NONE.
+ */
+cpumask_t idle_cpu_mask = CPU_MASK_NONE;
+
#ifdef CONFIG_SMP
/*
* This is how migration works:
diff -urN linux-2.6/kernel/sysctl.c linux-2.6-s390/kernel/sysctl.c
--- linux-2.6/kernel/sysctl.c Thu Apr 22 15:54:34 2004
+++ linux-2.6-s390/kernel/sysctl.c Thu Apr 22 15:54:54 2004
@@ -108,6 +108,8 @@
extern int sysctl_userprocess_debug;
#endif
+extern int sysctl_hz_timer;
+
#if defined(CONFIG_PPC32) && defined(CONFIG_6xx)
extern unsigned long powersave_nap;
int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
@@ -573,6 +575,16 @@
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+ {
+ .ctl_name = KERN_HZ_TIMER,
+ .procname = "hz_timer",
+ .data = &sysctl_hz_timer,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#endif
{
.ctl_name = KERN_S390_USER_DEBUG_LOGGING,
diff -urN linux-2.6/kernel/timer.c linux-2.6-s390/kernel/timer.c
--- linux-2.6/kernel/timer.c Thu Apr 22 15:54:34 2004
+++ linux-2.6-s390/kernel/timer.c Thu Apr 22 15:54:54 2004
@@ -428,6 +428,75 @@
spin_unlock_irq(&base->lock);
}
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Find out when the next timer event is due to happen. This
+ * is used on S/390 to stop all activity when a cpus is idle.
+ * This functions needs to be called disabled.
+ */
+unsigned long next_timer_interrupt(void)
+{
+ tvec_base_t *base;
+ struct list_head *list;
+ struct timer_list *nte;
+ unsigned long expires;
+ tvec_t *varray[4];
+ int i, j;
+
+ base = &__get_cpu_var(tvec_bases);
+ spin_lock(&base->lock);
+ expires = base->timer_jiffies + (LONG_MAX >> 1);
+ list = 0;
+
+ /* Look for timer events in tv1. */
+ j = base->timer_jiffies & TVR_MASK;
+ do {
+ list_for_each_entry(nte, base->tv1.vec + j, entry) {
+ expires = nte->expires;
+ if (j < (base->timer_jiffies & TVR_MASK))
+ list = base->tv2.vec + (INDEX(0));
+ goto found;
+ }
+ j = (j + 1) & TVR_MASK;
+ } while (j != (base->timer_jiffies & TVR_MASK));
+
+ /* Check tv2-tv5. */
+ varray[0] = &base->tv2;
+ varray[1] = &base->tv3;
+ varray[2] = &base->tv4;
+ varray[3] = &base->tv5;
+ for (i = 0; i < 4; i++) {
+ j = INDEX(i);
+ do {
+ if (list_empty(varray[i]->vec + j)) {
+ j = (j + 1) & TVN_MASK;
+ continue;
+ }
+ list_for_each_entry(nte, varray[i]->vec + j, entry)
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ if (j < (INDEX(i)) && i < 3)
+ list = varray[i + 1]->vec + (INDEX(i + 1));
+ goto found;
+ } while (j != (INDEX(i)));
+ }
+found:
+ if (list) {
+ /*
+ * The search wrapped. We need to look at the next list
+ * from next tv element that would cascade into tv element
+ * where we found the timer element.
+ */
+ list_for_each_entry(nte, list, entry) {
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ }
+ }
+ spin_unlock(&base->lock);
+ return expires;
+}
+#endif
+
/******************************************************************/
/*
Martin Schwidefsky wrote:
>
>
>
>>xtime is easy, that's interpolated anyway afaics. Jiffies would either
>
> just
>
>>jump some, which code needs to deal with anyway given that preempt can do
>>the same, or would become an approximated thing as well based on the
>
> other
>
>>time keeping sources in the system.
>
>
> Unluckily no. xtime is not easy because the network stack uses this for
> time stamps at several locations. Living in the past and time stamps for
> network packets don't go together, do they?
>
>
>>calculating the load can be a real timer for sure (which would cause an irq
>>at that time), cpu limits we can do at the end of timeslice (and set the
>>timeslice such that the limits won't be exceeded).
Here is where this thing falls down. Some time ago I put together a tick less
system (which is what this amounts to). The patch is still on sourceforge (see
the HRT URL in my signature). The problem is this:
On context switch the scheduler needs to figure the minimum time to the next
event for the new task. This would be the minimum of the remaining slice,
profile timer, virtual time, and the cpu limit timer (at least). It would then
do an add_timer for this time. On the next context switch it would, most
likely, cancel the timer (most code does not run to the end of its slice which
is the most likely limit). The computation to find the minimum time, with a bit
of hand waving, could be shortened to eliminate a few of the timers. On switch
out, all the task's timers would need to be updated with the actual time the task
used. The problem is that all this work is in the VERY lean and mean context
switch path. In my tests context switching could easily occur often enough
that the savings from not doing the tick interrupts were overwhelmed by the
added context switch overhead with a medium cpu load. And it is downhill from
here. I.e. the tick less system incurs accounting overhead in direct
proportion to the number of context switches, while the ticking system has a
fixed accounting overhead. AND the crossover point (where the tick less system
overhead is more than the ticked system overhead) occurs with a medium load.
--
George Anzinger [email protected]
High-res-timers: http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml
> Here is where this thing falls down. Some time ago I put together a tick less
> system (which is what this amounts to). The patch is still on sourceforge (see
> the HRT URL in my signature).
On the HRT sourceforge page you'll find the i386 version of the tick less system
patch. The initial s390 patch can be found here:
http://www10.software.ibm.com/developerworks/opensource/linux390/current2_4_x-august2001.shtml#timer20031205
> On context switch the scheduler needs to figure the minimum time to the next
> event for the new task. This would be the minimum of the remaining slice,
> profile timer, virtual time, and the cpu limit timer (at least). It would then
> do an add_timer for this time. On the next context switch it would, most
> likely, cancel the timer (most code does not run to the end of its slice which
> is the most likely limit). The computation to find the minimum time, with a bit
> of hand waving, could be shortened to eliminate a few of the timers. On switch
> out, all the tasks timers would need to be updated with the actual time the task
> used. The problem is that all this work is in the VERY lean and mean context
> switch path. In my tests a context switching could easily occur often enough
> that the savings from not doing the tick interrupts was over whelmed by the
> added context switch over head with a medium cpu load. And it is down hill from
> here. I.e. the tick less system incurres accounting overhead in direct
> proportion to the number of context switches, while the ticking system has a
> fixed accounting overhead. AND the cross over point (where the tick less system
> overhead is more that the ticked system overhead) occurs with a medium load.
I do agree that it is very likely a bad idea to do a tick less system for i386 and
friends. I still haven't given up the idea for s390 though. I plan to use the
cpu timer for all the process related stuff and the clock comparator for the
wall clock. This adds just two instructions to the system call entry path:
a store cpu timer "stpt" and a set cpu timer "spt" to switch from the process
cpu timer to the system cpu timer. The overhead is 27 cycles and benefit is
no more ticks and a much more accurate process accounting. This requires some
major surgery in the common timer code. I'm going to have fun with this.
blue skies,
Martin
Linux/390 Design & Development, IBM Deutschland Entwicklung GmbH
Schönaicherstr. 220, D-71032 Böblingen, Telefon: 49 - (0)7031 - 16-2247
E-Mail: [email protected]
On Thu, Apr 22, 2004 at 04:56:25PM +0200, Martin Schwidefsky wrote:
> Hi Dipankar,
> here is my newest version of the timer patch. I picked up your
> suggestion to add idle_cpu_mask to sched.c. Anything else ?
Looks good except that I am wondering if idle_cpu_mask should
really be called nohz_cpu_mask. That is what it is, after all.
Thanks
Dipankar
> Looks good except that I am wondering if idle_cpu_mask should
> really be called nohz_cpu_mask. That is what it is, after all.
I don't think so. The idle_cpu_mask isn't dependent on the
no hz timer feature. I think it would make sense to set the
bits in idle_cpu_mask even on systems that use the normal hz timer.
The tricky part is to find a way to clear the bits again after
a wakeup interrupt. This needs to be done before the interrupt
function is executed, you can't do it in idle().
blue skies,
Martin
Linux/390 Design & Development, IBM Deutschland Entwicklung GmbH
Schönaicherstr. 220, D-71032 Böblingen, Telefon: 49 - (0)7031 - 16-2247
E-Mail: [email protected]
On Thu, Apr 29, 2004 at 09:43:21AM +0200, Martin Schwidefsky wrote:
> > Looks good except that I am wondering if idle_cpu_mask should
> > really be called nohz_cpu_mask. That is what it is, after all.
>
> I don't thinks so. The idle_cpu_mask isn't dependent on the
> no hz timer feature. I think it would make sense to set the
Well, your own patch says this :)
> +/*
> + * In a system that switches off the HZ timer idle_cpu_mask
> + * indicates which cpus entered this state. This is used
> + * in the rcu update to wait only for active cpus. For system
> + * which do not switch off the HZ timer idle_cpu_mask should
> + * always be CPU_MASK_NONE.
> + */
IOW, idle_cpu_mask is relevant (as of now) only when the
hz timer is switched off.
> bits in idle_cpu_mask even on system that use the normal hz timer.
> The tricky part is to find a way to clear the bits again after
> a wakeup interrupt. This needs to be done before the interrupt
> function is executed, you can't do it in idle().
idle_cpu_mask does not really represent CPUs that are conventionally
called "idle", it represents the ones that have hz timer switched
off (in your patch). So, why not just call it nohz_cpu_mask ?
RCU doesn't need an idle cpu mask, it has its own mechanism
for detecting idle cpus, it just needs to know about the ones
that have hz timers switched off. If you call it nohz_cpu_mask,
then it would make sense to say that for systems which do not
switch off hz timer, nohz_cpu_mask will always be CPU_MASK_NONE.
Thanks
Dipankar
> idle_cpu_mask does not really represent CPUs that are conventionally
> called "idle", it represents the ones that have hz timer switched
> off (in your patch). So, why not just call it nohz_cpu_mask ?
> RCU doesn't need an idle cpu mask, it has its own mechanism
> for detecting idle cpus, it just needs to know about the ones
> that have hz timers switched off. If you call it nohz_cpu_mask,
> then it would make sense to say that for systems which do not
> switch off hz timer, nohz_cpu_mask will always be CPU_MASK_NONE.
Ok, I don't really mind the name change. It's nohz_cpu_mask then.
blue skies,
Martin
Linux/390 Design & Development, IBM Deutschland Entwicklung GmbH
Schönaicherstr. 220, D-71032 Böblingen, Telefon: 49 - (0)7031 - 16-2247
E-Mail: [email protected]
On Thu, Apr 29, 2004 at 10:24:23AM +0200, Martin Schwidefsky wrote:
>
> > idle_cpu_mask does not really represent CPUs that are conventionally
> > called "idle", it represents the ones that have hz timer switched
> > off (in your patch). So, why not just call it nohz_cpu_mask ?
> > RCU doesn't need an idle cpu mask, it has its own mechanism
> > for detecting idle cpus, it just needs to know about the ones
> > that have hz timers switched off. If you call it nohz_cpu_mask,
> > then it would make sense to say that for systems which do not
> > switch off hz timer, nohz_cpu_mask will always be CPU_MASK_NONE.
>
> Ok, I don't really mind the name change. It's nohz_cpu_mask then.
Thanks. Sorry about the name nitpick, I too didn't think about this
when Jan had first sent me the patch.
Dipankar