2012-06-13 16:20:33

by Frederic Weisbecker

[permalink] [raw]
Subject: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

Ingo, Thomas,

This starts the basic code that allows accounting of cputime spent tickless
outside idle, which is a first step to prepare for the adaptive nohz
infrastructure.

I hope we can set a tree in -tip for that. If you're fine with it
this is pullable from:

git://github.com/fweisbec/linux-dynticks.git
nohz-for-tip

Thanks.


Frederic Weisbecker (7):
nohz: Add more comment about CONFIG_NO_HZ
nohz: Introduce adaptive nohz config
nohz: Generalize tickless cpu time accounting
nohz: Account user and system times in adaptive nohz mode
x86: Syscall hooks for adaptive nohz mode
x86: Add adaptive tickless hooks on do_notify_resume()
x86: Exception hooks for adaptive tickless

arch/Kconfig | 8 ++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/thread_info.h | 10 ++-
arch/x86/kernel/ptrace.c | 5 ++
arch/x86/kernel/signal.c | 3 +
arch/x86/kernel/traps.c | 14 +++-
arch/x86/mm/fault.c | 13 +++-
include/linux/kernel_stat.h | 2 +
include/linux/tick.h | 59 +++++++++++-----
kernel/sched/core.c | 27 +++++++
kernel/time/Kconfig | 14 +++-
kernel/time/tick-sched.c | 139 ++++++++++++++++++++++++++++++-----
kernel/time/timer_list.c | 3 +-
13 files changed, 246 insertions(+), 52 deletions(-)

--
1.7.5.4


2012-06-13 16:20:55

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 7/7] x86: Exception hooks for adaptive tickless

Add the necessary hooks to the x86 exception handlers for adaptive nohz
support so that the time spent on exception handling is
accounted as system cputime.

This includes traps, page faults, debug exceptions, etc...

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Alessio Igor Bogani <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Avi Kivity <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Geoff Levand <[email protected]>
Cc: Gilad Ben Yossef <[email protected]>
Cc: Hakan Akkan <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Hilman <[email protected]>
Cc: Max Krasnyansky <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Sven-Thorsten Dietrich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
arch/x86/Kconfig | 1 +
arch/x86/kernel/traps.c | 14 ++++++++++----
arch/x86/mm/fault.c | 13 +++++++++++--
3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c70684f..af77028 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,6 +95,7 @@ config X86
select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
+ select HAVE_NO_HZ_FULL

config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS || UPROBES)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 05b31d9..196e641 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -26,6 +26,7 @@
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/init.h>
+#include <linux/tick.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
@@ -311,6 +312,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
ftrace_int3_handler(regs))
return;
#endif
+ tick_nohz_enter_exception(regs);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -330,6 +332,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
+ tick_nohz_exit_exception(regs);
}

#ifdef CONFIG_X86_64
@@ -390,6 +393,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;

+ tick_nohz_enter_exception(regs);
+
get_debugreg(dr6, 6);

/* Filter out all the reserved bits which are preset to 1 */
@@ -405,7 +410,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)

/* Catch kmemcheck conditions first of all! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
- return;
+ goto exit;

/* DR6 may or may not be cleared by the CPU */
set_debugreg(0, 6);
@@ -420,7 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)

if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;

/*
* Let others (NMI) know that the debug stack is in use
@@ -436,7 +441,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
X86_TRAP_DB);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
- return;
+ goto exit;
}

/*
@@ -457,7 +462,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();

- return;
+exit:
+ tick_nohz_exit_exception(regs);
}

/*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 76dcd9d..6b1ee80 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
+#include <linux/tick.h>

#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -1000,8 +1001,8 @@ static int fault_in_kernel_space(unsigned long address)
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
-dotraplinkage void __kprobes
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static void __kprobes
+__do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -1209,3 +1210,11 @@ good_area:

up_read(&mm->mmap_sem);
}
+
+dotraplinkage void __kprobes
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ tick_nohz_enter_exception(regs);
+ __do_page_fault(regs, error_code);
+ tick_nohz_exit_exception(regs);
+}
--
1.7.5.4

2012-06-13 16:20:53

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 6/7] x86: Add adaptive tickless hooks on do_notify_resume()

Before resuming to userspace, we may fall into do_notify_resume()
to handle signals or other things. And because we may be coming
from syscall/exception or interrupt exit, the current cputime is
considered as happening in userspace.

However we want do_notify_resume() cputime to be considered as
system time. Put the kernel boundaries hook in this function
to ensure that.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Alessio Igor Bogani <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Avi Kivity <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Geoff Levand <[email protected]>
Cc: Gilad Ben Yossef <[email protected]>
Cc: Hakan Akkan <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Hilman <[email protected]>
Cc: Max Krasnyansky <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Sven-Thorsten Dietrich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/signal.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 21af737..9031fbb 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
+#include <linux/tick.h>

#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -776,6 +777,7 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
+ tick_nohz_enter_kernel();
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -801,6 +803,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
#endif /* CONFIG_X86_32 */
+ tick_nohz_exit_kernel();
}

void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
--
1.7.5.4

2012-06-13 16:20:50

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 5/7] x86: Syscall hooks for adaptive nohz mode

Add syscall hooks to notify syscall entry and exit on
CPUs running in full adaptive nohz mode. This way we
can account the cputime elapsed across kernel boundaries.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Alessio Igor Bogani <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Avi Kivity <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Geoff Levand <[email protected]>
Cc: Gilad Ben Yossef <[email protected]>
Cc: Hakan Akkan <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Hilman <[email protected]>
Cc: Max Krasnyansky <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Sven-Thorsten Dietrich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
arch/x86/include/asm/thread_info.h | 10 +++++++---
arch/x86/kernel/ptrace.c | 5 +++++
2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 89f794f..c535d84 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -89,6 +89,7 @@ struct thread_info {
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
#define TIF_FORK 18 /* ret_from_fork */
+#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_DEBUG 21 /* uses debug registers */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -114,6 +115,7 @@ struct thread_info {
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
+#define _TIF_NOHZ (1 << TIF_NOHZ)
#define _TIF_DEBUG (1 << TIF_DEBUG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
@@ -126,12 +128,13 @@ struct thread_info {
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
- _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+ _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_NOHZ)

/* work to do in syscall_trace_leave() */
#define _TIF_WORK_SYSCALL_EXIT \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
- _TIF_SYSCALL_TRACEPOINT)
+ _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)

/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
@@ -141,7 +144,8 @@ struct thread_info {

/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK \
- ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT)
+ ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_NOHZ)

/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index c4c6a5c..e9dfd40 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -1463,6 +1464,8 @@ long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;

+ tick_nohz_enter_kernel();
+
/*
* If we stepped into a sysenter/syscall insn, it trapped in
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1526,4 +1529,6 @@ void syscall_trace_leave(struct pt_regs *regs)
!test_thread_flag(TIF_SYSCALL_EMU);
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
+
+ tick_nohz_exit_kernel();
}
--
1.7.5.4

2012-06-13 16:20:48

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 4/7] nohz: Account user and system times in adaptive nohz mode

When we run in adaptive tickless mode, the tick won't be
there anymore to maintain the user/system cputime on every jiffy.

To solve this, save a snapshot of the jiffies on the boundaries of
the kernel and keep track of where we saved it: user or system entry.
On top of this, we account the cputime elapsed when we cross
back the kernel boundaries and when we deschedule the task.

We do this only when requested through the TIF_NOHZ thread flag.
This will later be used by the timer engine when the tick gets
stopped.

This only settles system and user cputime accounting on kernel
boundaries. Further patches will complete the handling of adaptive
tickless cputime by saving and flushing the time on well defined
points: tick stop, tick restart, cputime report to user, etc...

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Alessio Igor Bogani <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Avi Kivity <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Geoff Levand <[email protected]>
Cc: Gilad Ben Yossef <[email protected]>
Cc: Hakan Akkan <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Hilman <[email protected]>
Cc: Max Krasnyansky <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Sven-Thorsten Dietrich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
include/linux/tick.h | 14 ++++++++
kernel/sched/core.c | 1 +
kernel/time/tick-sched.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 94 insertions(+), 0 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0578207..79623fc 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -151,4 +151,18 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
# endif /* !NO_HZ */

+#ifdef CONFIG_NO_HZ_FULL
+extern void tick_nohz_enter_kernel(void);
+extern void tick_nohz_exit_kernel(void);
+extern void tick_nohz_enter_exception(struct pt_regs *regs);
+extern void tick_nohz_exit_exception(struct pt_regs *regs);
+extern void tick_nohz_pre_schedule(void);
+#else
+static inline void tick_nohz_enter_kernel(void) { }
+static inline void tick_nohz_exit_kernel(void) { }
+static inline void tick_nohz_enter_exception(struct pt_regs *regs) { }
+static inline void tick_nohz_exit_exception(struct pt_regs *regs) { }
+static inline void tick_nohz_pre_schedule(void) { }
+#endif /* !NO_HZ_FULL */
+
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 013e6f2..72acb05 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1910,6 +1910,7 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ tick_nohz_pre_schedule();
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 66ae73a..3807d71 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -786,6 +786,85 @@ static inline void tick_check_nohz(int cpu)
}
}

+#ifdef CONFIG_NO_HZ_FULL
+void tick_nohz_exit_kernel(void)
+{
+ unsigned long flags;
+ struct tick_sched *ts;
+ unsigned long delta_jiffies;
+
+ if (!test_thread_flag(TIF_NOHZ))
+ return;
+
+ local_irq_save(flags);
+
+ ts = &__get_cpu_var(tick_cpu_sched);
+
+ WARN_ON_ONCE(!ts->tick_stopped);
+ WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_SYS);
+
+ delta_jiffies = jiffies - ts->saved_jiffies;
+ account_system_ticks(current, delta_jiffies);
+
+ ts->saved_jiffies = jiffies;
+ ts->saved_jiffies_whence = JIFFIES_SAVED_USER;
+
+ local_irq_restore(flags);
+}
+
+void tick_nohz_enter_kernel(void)
+{
+ unsigned long flags;
+ struct tick_sched *ts;
+ unsigned long delta_jiffies;
+
+ if (!test_thread_flag(TIF_NOHZ))
+ return;
+
+ local_irq_save(flags);
+
+ ts = &__get_cpu_var(tick_cpu_sched);
+
+ WARN_ON_ONCE(!ts->tick_stopped);
+ WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_USER);
+
+ delta_jiffies = jiffies - ts->saved_jiffies;
+ account_user_ticks(current, delta_jiffies);
+
+ ts->saved_jiffies = jiffies;
+ ts->saved_jiffies_whence = JIFFIES_SAVED_SYS;
+
+ local_irq_restore(flags);
+}
+
+void tick_nohz_enter_exception(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ tick_nohz_enter_kernel();
+}
+
+void tick_nohz_exit_exception(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ tick_nohz_exit_kernel();
+}
+
+/*
+ * Flush cputime and clear hooks before context switch so that
+ * we account the time spent tickless.
+ */
+void tick_nohz_pre_schedule(void)
+{
+ struct tick_sched *ts;
+
+ if (test_thread_flag(TIF_NOHZ)) {
+ ts = &__get_cpu_var(tick_cpu_sched);
+ tick_nohz_account_ticks(ts);
+ clear_thread_flag(TIF_NOHZ);
+ }
+}
+#endif /* CONFIG_NO_HZ_FULL */
+
#else

static inline void tick_nohz_switch_to_nohz(void) { }
--
1.7.5.4

2012-06-13 16:20:45

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 3/7] nohz: Generalize tickless cpu time accounting

When the CPU enters idle, it saves the jiffies stamp into
ts->idle_jiffies, increments this value by one every time
there is a timer interrupt and accounts "jiffies - ts->idle_jiffies"
idle ticks when we exit idle. This way we still account the
idle CPU time even if the tick is stopped.

This patch settles the ground to generalize this for user
and system accounting. ts->idle_jiffies becomes ts->saved_jiffies and
a new member ts->saved_jiffies_whence indicates from which domain
we saved the jiffies: user, system or idle.

This is one more step toward making the tickless infrastructure usable
beyond idle contexts.

For now this is only used by idle but further patches make use of
it for user and system.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Alessio Igor Bogani <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Avi Kivity <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Geoff Levand <[email protected]>
Cc: Gilad Ben Yossef <[email protected]>
Cc: Hakan Akkan <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Hilman <[email protected]>
Cc: Max Krasnyansky <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Sven-Thorsten Dietrich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
include/linux/kernel_stat.h | 2 +
include/linux/tick.h | 45 +++++++++++++++++++-------------
kernel/sched/core.c | 26 ++++++++++++++++++
kernel/time/tick-sched.c | 60 ++++++++++++++++++++++++++++--------------
kernel/time/timer_list.c | 3 +-
5 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 2fbd905..be90056 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -122,7 +122,9 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
extern unsigned long long task_delta_exec(struct task_struct *);

extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
+extern void account_user_ticks(struct task_struct *, unsigned long);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
+extern void account_system_ticks(struct task_struct *, unsigned long);
extern void account_steal_time(cputime_t);
extern void account_idle_time(cputime_t);

diff --git a/include/linux/tick.h b/include/linux/tick.h
index f37fceb..0578207 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -27,25 +27,33 @@ enum tick_nohz_mode {
NOHZ_MODE_HIGHRES,
};

+enum tick_saved_jiffies {
+ JIFFIES_SAVED_NONE,
+ JIFFIES_SAVED_IDLE,
+ JIFFIES_SAVED_USER,
+ JIFFIES_SAVED_SYS,
+};
+
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer: hrtimer to schedule the periodic tick in high
- * resolution mode
- * @last_tick: Store the last tick expiry time when the tick
- * timer is modified for nohz sleeps. This is necessary
- * to resume the tick timer operation in the timeline
- * when the CPU returns from nohz sleep.
- * @tick_stopped: Indicator that the idle tick has been stopped
- * @idle_jiffies: jiffies at the entry to idle for idle time accounting
- * @idle_calls: Total number of idle calls
- * @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
- * @idle_exittime: Time when the idle state was left
- * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
- * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length: Duration of the current idle sleep
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
+ * @last_tick: Store the last tick expiry time when the tick
+ * timer is modified for nohz sleeps. This is necessary
+ * to resume the tick timer operation in the timeline
+ * when the CPU returns from nohz sleep.
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_calls: Total number of idle calls
+ * @idle_sleeps: Number of idle calls, where the sched tick was stopped
+ * @idle_entrytime: Time when the idle call was entered
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_exittime: Time when the idle state was left
+ * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @saved_jiffies: Jiffies snapshot on tick stop for cpu time accounting
+ * @saved_jiffies_whence: Area where we saved @saved_jiffies
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
+ * @sleep_length: Duration of the current idle sleep
+ * @do_timer_lst: CPU was the last one doing do_timer before going idle
*/
struct tick_sched {
struct hrtimer sched_timer;
@@ -54,7 +62,6 @@ struct tick_sched {
ktime_t last_tick;
int inidle;
int tick_stopped;
- unsigned long idle_jiffies;
unsigned long idle_calls;
unsigned long idle_sleeps;
int idle_active;
@@ -62,6 +69,8 @@ struct tick_sched {
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
+ enum tick_saved_jiffies saved_jiffies_whence;
+ unsigned long saved_jiffies;
ktime_t iowait_sleeptime;
ktime_t sleep_length;
unsigned long last_jiffies;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4..013e6f2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2740,6 +2740,19 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
acct_update_integrals(p);
}

+#ifdef CONFIG_NO_HZ_FULL
+void account_user_ticks(struct task_struct *p, unsigned long ticks)
+{
+ cputime_t delta_cputime, delta_scaled;
+
+ if (ticks) {
+ delta_cputime = jiffies_to_cputime(ticks);
+ delta_scaled = cputime_to_scaled(ticks);
+ account_user_time(p, delta_cputime, delta_scaled);
+ }
+}
+#endif
+
/*
* Account guest cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -2817,6 +2830,19 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
__account_system_time(p, cputime, cputime_scaled, index);
}

+#ifdef CONFIG_NO_HZ_FULL
+void account_system_ticks(struct task_struct *p, unsigned long ticks)
+{
+ cputime_t delta_cputime, delta_scaled;
+
+ if (ticks) {
+ delta_cputime = jiffies_to_cputime(ticks);
+ delta_scaled = cputime_to_scaled(ticks);
+ account_system_time(p, 0, delta_cputime, delta_scaled);
+ }
+}
+#endif
+
/*
* Account for involuntary wait time.
* @cputime: the cpu time spent in involuntary wait
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 60c9c60..66ae73a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -460,8 +460,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
ts->idle_expires = expires;
}

- if (!was_stopped && ts->tick_stopped)
- ts->idle_jiffies = ts->last_jiffies;
+ if (!was_stopped && ts->tick_stopped) {
+ ts->saved_jiffies = ts->last_jiffies;
+ ts->saved_jiffies_whence = JIFFIES_SAVED_IDLE;
+ }
}
}

@@ -578,22 +580,38 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
tick_nohz_restart(ts, now);
}

-static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+static void tick_nohz_account_ticks(struct tick_sched *ts)
{
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
unsigned long ticks;
/*
- * We stopped the tick in idle. Update process times would miss the
- * time we slept as update_process_times does only a 1 tick
- * accounting. Enforce that this is accounted to idle !
+ * We stopped the tick. Update process times would miss the
+ * time we ran tickless as update_process_times does only a 1 tick
+ * accounting. Enforce that this is accounted to nohz timeslices.
*/
- ticks = jiffies - ts->idle_jiffies;
+ ticks = jiffies - ts->saved_jiffies;
/*
* We might be one off. Do not randomly account a huge number of ticks!
*/
- if (ticks && ticks < LONG_MAX)
- account_idle_ticks(ticks);
+ if (ticks && ticks < LONG_MAX) {
+ switch (ts->saved_jiffies_whence) {
+ case JIFFIES_SAVED_IDLE:
+ account_idle_ticks(ticks);
+ break;
+#ifdef CONFIG_NO_HZ_FULL
+ case JIFFIES_SAVED_USER:
+ account_user_ticks(current, ticks);
+ break;
+ case JIFFIES_SAVED_SYS:
+ account_system_ticks(current, ticks);
+ break;
+ case JIFFIES_SAVED_NONE:
+ break;
#endif
+ default:
+ WARN_ON_ONCE(1);
+ }
+ }
+ ts->saved_jiffies_whence = JIFFIES_SAVED_NONE;
}

/**
@@ -623,7 +641,9 @@ void tick_nohz_idle_exit(void)

if (ts->tick_stopped) {
tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_ticks(ts);
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ tick_nohz_account_ticks(ts);
+#endif
}

local_irq_enable();
@@ -671,7 +691,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog();
- ts->idle_jiffies++;
+ ts->saved_jiffies++;
}

update_process_times(user_mode(regs));
@@ -820,17 +840,17 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (regs) {
/*
- * When we are idle and the tick is stopped, we have to touch
- * the watchdog as we might not schedule for a really long
- * time. This happens on complete idle SMP systems while
- * waiting on the login prompt. We also increment the "start of
- * idle" jiffy stamp so the idle accounting adjustment we do
- * when we go busy again does not account too much ticks.
+ * When the tick is stopped, we have to touch the watchdog
+ * as we might not schedule for a really long time. This
+ * happens on complete idle SMP systems while waiting on
+ * the login prompt. We also increment the last jiffy stamp
+ * recorded when we stopped the tick so the cpu time accounting
+ * adjustment does not account too much ticks when we flush them.
*/
if (ts->tick_stopped) {
+ /* CHECKME: may be this is only needed in idle */
touch_softlockup_watchdog();
- if (idle_cpu(cpu))
- ts->idle_jiffies++;
+ ts->saved_jiffies++;
}
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9..54705e3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -169,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P(nohz_mode);
P_ns(last_tick);
P(tick_stopped);
- P(idle_jiffies);
+ /* CHECKME: Do we want saved_jiffies_whence as well? */
+ P(saved_jiffies);
P(idle_calls);
P(idle_sleeps);
P_ns(idle_entrytime);
--
1.7.5.4

2012-06-13 16:20:42

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 2/7] nohz: Introduce adaptive nohz config

Prepare a config option for the full adaptive nohz feature.
This way we can start to put the related code under appropriate
ifdefs.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Alessio Igor Bogani <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Avi Kivity <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Geoff Levand <[email protected]>
Cc: Gilad Ben Yossef <[email protected]>
Cc: Hakan Akkan <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Hilman <[email protected]>
Cc: Max Krasnyansky <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Sven-Thorsten Dietrich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
arch/Kconfig | 8 ++++++++
kernel/time/Kconfig | 7 +++++++
2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 8c3d957..a71c698 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -251,6 +251,14 @@ config HAVE_CMPXCHG_DOUBLE
config ARCH_WANT_OLD_COMPAT_IPC
bool

+config HAVE_NO_HZ_FULL
+ bool
+ help
+ An arch should select this symbols if it provides
+ the kernel entry/exit hooks necessary to implement
+ full tickless support. This includes syscall entry/exit,
+ exceptions entry/exit and do_notify_resume() hooks.
+
config HAVE_ARCH_SECCOMP_FILTER
bool
help
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 0883fa1..5ac4b74 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -65,6 +65,13 @@ config NO_HZ
This option enables a tickless idle system: timer interrupts will
only trigger on an as-needed basis when the system is idle.

+config NO_HZ_FULL
+ bool "Full tickless system (Dynamic Ticks)"
+ depends on NO_HZ && HAVE_NO_HZ_FULL
+ help
+ This option enables a full adaptive tickless system: timer
+ interrupts will globally only trigger on an as-needed basis.
+
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
--
1.7.5.4

2012-06-13 16:20:39

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 1/7] nohz: Add more comment about CONFIG_NO_HZ

In order to prepare for adding a new config to implement
adaptive tickless, clarify that CONFIG_NO_HZ alone only
stops the tick on idle.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Alessio Igor Bogani <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Avi Kivity <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Geoff Levand <[email protected]>
Cc: Gilad Ben Yossef <[email protected]>
Cc: Hakan Akkan <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Hilman <[email protected]>
Cc: Max Krasnyansky <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Sven-Thorsten Dietrich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
kernel/time/Kconfig | 7 +++----
1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd4..0883fa1 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -58,13 +58,12 @@ config TICK_ONESHOT
bool

config NO_HZ
- bool "Tickless System (Dynamic Ticks)"
+ bool "Tickless idle system (Dynamic idle Ticks)"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
help
- This option enables a tickless system: timer interrupts will
- only trigger on an as-needed basis both when the system is
- busy and when the system is idle.
+ This option enables a tickless idle system: timer interrupts will
+ only trigger on an as-needed basis when the system is idle.

config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
--
1.7.5.4

2012-06-13 16:35:58

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Wed, Jun 13, 2012 at 06:19:50PM +0200, Frederic Weisbecker wrote:
> Ingo, Thomas,
>
> This starts the basic code that allow accounting of cputime spent tickless
> outside idle, which is a first step to prepare for the adaptive nohz
> infrastructure.
>
> I hope we can set a tree in -tip for that. If you're fine with it
> this is pullable from:
>
> git://github.com/fweisbec/linux-dynticks.git
> nohz-for-tip
>
> Thanks.

As you may have noticed, this is based on tip:timers/core

2012-06-14 09:07:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Wed, 2012-06-13 at 18:19 +0200, Frederic Weisbecker wrote:
> Ingo, Thomas,
>
> This starts the basic code that allow accounting of cputime spent tickless
> outside idle, which is a first step to prepare for the adaptive nohz
> infrastructure.
>
> I hope we can set a tree in -tip for that. If you're fine with it
> this is pullable from:

There are a number of architectures that already do fine-grained
user/kernel time accounting on syscall boundaries, etc.: s390, powerpc
and ia64.

You're now adding a 3rd way of accounting user/kernel time.. I'm not
much looking fwd to that..

2012-06-14 09:10:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, 2012-06-14 at 11:07 +0200, Peter Zijlstra wrote:
> On Wed, 2012-06-13 at 18:19 +0200, Frederic Weisbecker wrote:
> > Ingo, Thomas,
> >
> > This starts the basic code that allow accounting of cputime spent tickless
> > outside idle, which is a first step to prepare for the adaptive nohz
> > infrastructure.
> >
> > I hope we can set a tree in -tip for that. If you're fine with it
> > this is pullable from:
>
> There's a number of architecture that already does fine grained
> user/kernel time accounting on syscall boundaries etc.. s390, powerpc
> and ia64.
>
> You're now adding a 3rd way of accounting user/kernel time.. I'm not
> much looking fwd to that..

Note there's also the whole IRQ_TIME_ACCOUNTING muck.. all in all it's
getting quite ridiculous.

2012-06-14 11:12:53

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, Jun 14, 2012 at 11:07:09AM +0200, Peter Zijlstra wrote:
> On Wed, 2012-06-13 at 18:19 +0200, Frederic Weisbecker wrote:
> > Ingo, Thomas,
> >
> > This starts the basic code that allow accounting of cputime spent tickless
> > outside idle, which is a first step to prepare for the adaptive nohz
> > infrastructure.
> >
> > I hope we can set a tree in -tip for that. If you're fine with it
> > this is pullable from:
>
> There's a number of architecture that already does fine grained
> user/kernel time accounting on syscall boundaries etc.. s390, powerpc
> and ia64.
>
> You're now adding a 3rd way of accounting user/kernel time.. I'm not
> much looking fwd to that..

You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
to see whether I could reuse it.

I'll try something with that.

2012-06-14 11:16:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless


* Frederic Weisbecker <[email protected]> wrote:

> On Thu, Jun 14, 2012 at 11:07:09AM +0200, Peter Zijlstra wrote:
> > On Wed, 2012-06-13 at 18:19 +0200, Frederic Weisbecker wrote:
> > > Ingo, Thomas,
> > >
> > > This starts the basic code that allow accounting of cputime spent tickless
> > > outside idle, which is a first step to prepare for the adaptive nohz
> > > infrastructure.
> > >
> > > I hope we can set a tree in -tip for that. If you're fine with it
> > > this is pullable from:
> >
> > There's a number of architecture that already does fine grained
> > user/kernel time accounting on syscall boundaries etc.. s390, powerpc
> > and ia64.
> >
> > You're now adding a 3rd way of accounting user/kernel time.. I'm not
> > much looking fwd to that..
>
> You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> and see if I can reuse it.
>
> I'll try something with that.

Maybe sanitize all the variants under a single set of
wrappers/callbacks?

Thanks,

Ingo

2012-06-14 11:21:34

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, 14 Jun 2012, Ingo Molnar wrote:
> * Frederic Weisbecker <[email protected]> wrote:
> > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > and see if I can reuse it.
> >
> > I'll try something with that.
>
> Maybe sanitize all the variants under a single set of
> wrappers/callbacks?

Yes, please!

2012-06-14 11:22:52

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, Jun 14, 2012 at 01:21:23PM +0200, Thomas Gleixner wrote:
> On Thu, 14 Jun 2012, Ingo Molnar wrote:
> > * Frederic Weisbecker <[email protected]> wrote:
> > > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > > and see if I can reuse it.
> > >
> > > I'll try something with that.
> >
> > Maybe sanitize all the variants under a single set of
> > wrappers/callbacks?
>
> Yes, please!

Sure, I'm working on it.

2012-06-14 12:50:26

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, 14 Jun 2012 13:22:45 +0200
Frederic Weisbecker <[email protected]> wrote:

> On Thu, Jun 14, 2012 at 01:21:23PM +0200, Thomas Gleixner wrote:
> > On Thu, 14 Jun 2012, Ingo Molnar wrote:
> > > * Frederic Weisbecker <[email protected]> wrote:
> > > > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > > > and see if I can reuse it.
> > > >
> > > > I'll try something with that.
> > >
> > > Maybe sanitize all the variants under a single set of
> > > wrappers/callbacks?
> >
> > Yes, please!
>
> Sure, I'm working in it.

Please keep me in the loop, I want to avoid that things break on s390. Thanks.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2012-06-14 13:04:35

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, Jun 14, 2012 at 02:48:15PM +0200, Martin Schwidefsky wrote:
> On Thu, 14 Jun 2012 13:22:45 +0200
> Frederic Weisbecker <[email protected]> wrote:
>
> > On Thu, Jun 14, 2012 at 01:21:23PM +0200, Thomas Gleixner wrote:
> > > On Thu, 14 Jun 2012, Ingo Molnar wrote:
> > > > * Frederic Weisbecker <[email protected]> wrote:
> > > > > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > > > > and see if I can reuse it.
> > > > >
> > > > > I'll try something with that.
> > > >
> > > > Maybe sanitize all the variants under a single set of
> > > > wrappers/callbacks?
> > >
> > > Yes, please!
> >
> > Sure, I'm working in it.
>
> Please keep me in the loop, I want to avoid that things break on s390. Thanks.

Well, I realize I can't consolidate much between ia64, s390 and ppc because they
all handle virtual cpu time accounting very differently. I'm also not sure what the
virtual timer is for.

Also, it seems only powerpc flushes the time when a task is descheduled. Maybe
I'm missing something.

2012-06-14 13:42:53

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, Jun 14, 2012 at 02:48:15PM +0200, Martin Schwidefsky wrote:
> On Thu, 14 Jun 2012 13:22:45 +0200
> Frederic Weisbecker <[email protected]> wrote:
>
> > On Thu, Jun 14, 2012 at 01:21:23PM +0200, Thomas Gleixner wrote:
> > > On Thu, 14 Jun 2012, Ingo Molnar wrote:
> > > > * Frederic Weisbecker <[email protected]> wrote:
> > > > > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > > > > and see if I can reuse it.
> > > > >
> > > > > I'll try something with that.
> > > >
> > > > Maybe sanitize all the variants under a single set of
> > > > wrappers/callbacks?
> > >
> > > Yes, please!
> >
> > Sure, I'm working in it.
>
> Please keep me in the loop, I want to avoid that things break on s390. Thanks.

Do you have any idea why s390 counts idle time from asm deep in the idle code
rather than just hooking in account_system_vtime() like ppc or ia64?

2012-06-14 14:36:40

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless


* Frederic Weisbecker <[email protected]> wrote:

> > > > > > I'll try something with that.
> > > > >
> > > > > Maybe sanitize all the variants under a single set of
> > > > > wrappers/callbacks?
> > > >
> > > > Yes, please!
> > >
> > > Sure, I'm working in it.
> >
> > Please keep me in the loop, I want to avoid that things
> > break on s390. Thanks.
>
> Well, I realize I can't consolidate much between ia64, s390
> and ppc because they all handle virtual cpu time accounting
> very differently. I'm also not what the virtual timer is for.

As a first step I'd suggest to create a superset of all existing
and relied-upon wrappers/callbacks, into a single obvious
sched_*() or time_*() namespace, without breaking functionality.

Once that is done we can eliminate individual, conceptually
redundant callbacks, by carefully morphing the affected arches
step by step.

No arch will be left behind.

Thanks,

Ingo

2012-06-14 15:18:13

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, 14 Jun 2012 15:42:44 +0200
Frederic Weisbecker <[email protected]> wrote:

> On Thu, Jun 14, 2012 at 02:48:15PM +0200, Martin Schwidefsky wrote:
> > On Thu, 14 Jun 2012 13:22:45 +0200
> > Frederic Weisbecker <[email protected]> wrote:
> >
> > > On Thu, Jun 14, 2012 at 01:21:23PM +0200, Thomas Gleixner wrote:
> > > > On Thu, 14 Jun 2012, Ingo Molnar wrote:
> > > > > * Frederic Weisbecker <[email protected]> wrote:
> > > > > > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > > > > > and see if I can reuse it.
> > > > > >
> > > > > > I'll try something with that.
> > > > >
> > > > > Maybe sanitize all the variants under a single set of
> > > > > wrappers/callbacks?
> > > >
> > > > Yes, please!
> > >
> > > Sure, I'm working in it.
> >
> > Please keep me in the loop, I want to avoid that things break on s390. Thanks.
>
> Do you have any idea why s390 counts idle time from asm deep in the idle code
> rather than just hooking in account_system_vtime() like ppc or ia64?

Well what is idle time? For s390 it is the difference in the TOD clock between
the instruction that loaded the enabled-wait-PSW and the first instruction on
the interrupt handler. To get the best precision you need to get the TOD time
stamps as close to these two instructions as possible. For s390 it is the
following sequence:

STCK __IDLE_ENTER(%r2) # idle enter time stamp
ltr %r5,%r5
stpt __VQ_IDLE_ENTER(%r3)
jz psw_idle_lpsw
spt 0(%r1)
psw_idle_lpsw:
lpswe __SF_EMPTY(%r15)

<<< sleeping >>>

int_handler:
STCK __LC_INT_CLOCK # idle exit time stamp

There are at maximum 5 instructions between the STCK for the idle
enter time stamp and the lpswe that puts the cpu to sleep.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.

2012-06-14 17:35:08

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, Jun 14, 2012 at 04:36:33PM +0200, Ingo Molnar wrote:
>
> * Frederic Weisbecker <[email protected]> wrote:
>
> > > > > > > I'll try something with that.
> > > > > >
> > > > > > Maybe sanitize all the variants under a single set of
> > > > > > wrappers/callbacks?
> > > > >
> > > > > Yes, please!
> > > >
> > > > Sure, I'm working in it.
> > >
> > > Please keep me in the loop, I want to avoid that things
> > > break on s390. Thanks.
> >
> > Well, I realize I can't consolidate much between ia64, s390
> > and ppc because they all handle virtual cpu time accounting
> > very differently. I'm also not what the virtual timer is for.
>
> As a first step I'd suggest to create a superset of all existing
> and relied-upon wrappers/callbacks, into a single obvious
> sched_*() or time_*() namespace, without breaking functionality.

But the API is already well defined. The arch just needs to implement
account_system_vtime() and account_process_tick() and record the time
on the kernel boundaries. This is pretty well contained in ppc entry.S where
it is implemented through ACCOUNT_CPU_USER_ENTRY/EXIT macros (although
I see the time accounted on syscall boundaries but not in exceptions),
it's more complicated in ia64 as the virt accounting is spread here and there
in entry.S and it's always on in s390.

Maybe we could standardize a bit the way we save and account the time.
This requires some non-trivial asm surgery on archs I don't know much about
though.

>
> Once that is done we can eliminate individual, conceptually
> redundant callbacks, by carefully morphing the affected arches
> step by step.
>
> No arch will be left behind.
>
> Thanks,
>
> Ingo

2012-06-15 12:13:32

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless


* Frederic Weisbecker <[email protected]> wrote:

> On Thu, Jun 14, 2012 at 04:36:33PM +0200, Ingo Molnar wrote:
> >
> > * Frederic Weisbecker <[email protected]> wrote:
> >
> > > > > > > > I'll try something with that.
> > > > > > >
> > > > > > > Maybe sanitize all the variants under a single set of
> > > > > > > wrappers/callbacks?
> > > > > >
> > > > > > Yes, please!
> > > > >
> > > > > Sure, I'm working in it.
> > > >
> > > > Please keep me in the loop, I want to avoid that things
> > > > break on s390. Thanks.
> > >
> > > Well, I realize I can't consolidate much between ia64, s390
> > > and ppc because they all handle virtual cpu time accounting
> > > very differently. I'm also not what the virtual timer is for.
> >
> > As a first step I'd suggest to create a superset of all existing
> > and relied-upon wrappers/callbacks, into a single obvious
> > sched_*() or time_*() namespace, without breaking functionality.
>
> But the API is already well defined. The arch just need to
> implement account_system_vtime() and account_process_tick()
> and record the time on the kernel boundaries. This is pretty
> well contained in ppc entry.S where it is implemented through
> ACCOUNT_CPU_USER_ENTRY/EXIT macros (although I see the time
> accounted on syscall boundaries but not in exceptions), it's
> more complicated in ia64 as the virt accounting is spread here
> and there in entry.S and it's always on in s390.
>
> May be we could standardize a bit the way we save and account
> the time. This require some non-trivial asm surgery on archs I
> don't know much about though.

Yeah, account_*() is a fine API too - as long as it's a
unification of all time accounting functionality.

Thanks,

Ingo

2012-06-15 17:37:29

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Thu, Jun 14, 2012 at 05:18:00PM +0200, Martin Schwidefsky wrote:
> On Thu, 14 Jun 2012 15:42:44 +0200
> Frederic Weisbecker <[email protected]> wrote:
>
> > On Thu, Jun 14, 2012 at 02:48:15PM +0200, Martin Schwidefsky wrote:
> > > On Thu, 14 Jun 2012 13:22:45 +0200
> > > Frederic Weisbecker <[email protected]> wrote:
> > >
> > > > On Thu, Jun 14, 2012 at 01:21:23PM +0200, Thomas Gleixner wrote:
> > > > > On Thu, 14 Jun 2012, Ingo Molnar wrote:
> > > > > > * Frederic Weisbecker <[email protected]> wrote:
> > > > > > > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > > > > > > and see if I can reuse it.
> > > > > > >
> > > > > > > I'll try something with that.
> > > > > >
> > > > > > Maybe sanitize all the variants under a single set of
> > > > > > wrappers/callbacks?
> > > > >
> > > > > Yes, please!
> > > >
> > > > Sure, I'm working in it.
> > >
> > > Please keep me in the loop, I want to avoid that things break on s390. Thanks.
> >
> > Do you have any idea why s390 counts idle time from asm deep in the idle code
> > rather than just hooking in account_system_vtime() like ppc or ia64?
>
> Well what is idle time? For s390 it is the difference in the TOD clock between
> the instruction that loaded the enabled-wait-PSW and the first instruction on
> the interrupt handler. To get the best precision you need to get the TOD time
> stamps as close to these two instructions as possible. For s390 it is the
> following sequence:
>
> STCK __IDLE_ENTER(%r2) # idle enter time stamp
> ltr %r5,%r5
> stpt __VQ_IDLE_ENTER(%r3)
> jz psw_idle_lpsw
> spt 0(%r1)
> psw_idle_lpsw:
> lpswe __SF_EMPTY(%r15)
>
> <<< sleeping >>>
>
> int_handler:
> STCK __LC_INT_CLOCK # idle exit time stamp
>
> There are at maximum 5 instructions between the STCK for the idle
> enter time stamp and the lpswe that puts the cpu to sleep.

I see. So s390 accounts only the time spent in low power mode whereas
ppc/ia64 accounts everything that happens in the idle task.

I don't know which one has chosen the right semantics but this complicates
any possible unification.

BTW, aren't you accounting the idle time as system time as well with
account_sys_vtime()?

2012-06-18 10:46:49

by Martin Schwidefsky

[permalink] [raw]
Subject: Re: [RFC GIT PULL] nohz: Basic cputime accounting for adaptive tickless

On Fri, 15 Jun 2012 19:37:18 +0200
Frederic Weisbecker <[email protected]> wrote:

> On Thu, Jun 14, 2012 at 05:18:00PM +0200, Martin Schwidefsky wrote:
> > On Thu, 14 Jun 2012 15:42:44 +0200
> > Frederic Weisbecker <[email protected]> wrote:
> >
> > > On Thu, Jun 14, 2012 at 02:48:15PM +0200, Martin Schwidefsky wrote:
> > > > On Thu, 14 Jun 2012 13:22:45 +0200
> > > > Frederic Weisbecker <[email protected]> wrote:
> > > >
> > > > > On Thu, Jun 14, 2012 at 01:21:23PM +0200, Thomas Gleixner wrote:
> > > > > > On Thu, 14 Jun 2012, Ingo Molnar wrote:
> > > > > > > * Frederic Weisbecker <[email protected]> wrote:
> > > > > > > > You're right, I should have looked into CONFIG_VIRT_CPU_ACCOUNTING sooner
> > > > > > > > and see if I can reuse it.
> > > > > > > >
> > > > > > > > I'll try something with that.
> > > > > > >
> > > > > > > Maybe sanitize all the variants under a single set of
> > > > > > > wrappers/callbacks?
> > > > > >
> > > > > > Yes, please!
> > > > >
> > > > > Sure, I'm working in it.
> > > >
> > > > Please keep me in the loop, I want to avoid that things break on s390. Thanks.
> > >
> > > Do you have any idea why s390 counts idle time from asm deep in the idle code
> > > rather than just hooking in account_system_vtime() like ppc or ia64?
> >
> > Well what is idle time? For s390 it is the difference in the TOD clock between
> > the instruction that loaded the enabled-wait-PSW and the first instruction on
> > the interrupt handler. To get the best precision you need to get the TOD time
> > stamps as close to these two instructions as possible. For s390 it is the
> > following sequence:
> >
> > STCK __IDLE_ENTER(%r2) # idle enter time stamp
> > ltr %r5,%r5
> > stpt __VQ_IDLE_ENTER(%r3)
> > jz psw_idle_lpsw
> > spt 0(%r1)
> > psw_idle_lpsw:
> > lpswe __SF_EMPTY(%r15)
> >
> > <<< sleeping >>>
> >
> > int_handler:
> > STCK __LC_INT_CLOCK # idle exit time stamp
> >
> > There are at maximum 5 instructions between the STCK for the idle
> > enter time stamp and the lpswe that puts the cpu to sleep.
>
> I see. So s390 accounts only the time spent in low power mode whereas
> ppc/ia64 accounts everything that happens in the idle task.
>
> I don't know which one has chosen the right semantics but this complicates
> any possible unification.
>
> BTW, aren't you accounting the idle time as system time as well with
> account_sys_vtime()?

No, the account_sys_vtime call is there to account for the time spent going
in and out of idle.

--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.