2015-06-24 14:59:35

by Rik van Riel

[permalink] [raw]
Subject: [RFC INCOMPLETE] tick based timekeeping from a housekeeping CPU

This series seems to make basic tick based time sampling from a
housekeeping CPU work, allowing us to have tick based accounting
on a nohz_full CPU, and no longer doing vtime accounting on those
CPUs.

It still needs a major cleanup, and steal time accounting and irq
accounting are still missing.

Just posting this to get a sense of whether I am headed in the right
direction, or whether we need some major overhaul/cleanup of part of
the code first.


2015-06-24 14:58:43

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 01/11] nohz,time: make account_process_tick work on the task's CPU

From: Rik van Riel <[email protected]>

Teach account_process_tick to work on the CPU of the task
specified in the function argument. This allows us to do
remote tick based sampling of a nohz_full cpu from a
housekeeping CPU.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/cputime.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..97077c282626 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -463,8 +463,14 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
void account_process_tick(struct task_struct *p, int user_tick)
{
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- struct rq *rq = this_rq();
+ int cpu = task_cpu(p);
+ struct rq *rq = cpu_rq(cpu);

+ /*
+ * Tests current CPU, not "cpu", to see whether account_process_tick()
+ * should do work on this invocation, or whether time keeping for
+ * this CPU is done in some other way.
+ */
if (vtime_accounting_enabled())
return;

--
2.1.0

2015-06-24 14:59:39

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 02/11] time,nohz: rename vtime_accounting_enabled to tick_accounting_disabled

From: Rik van Riel <[email protected]>

Rename vtime_accounting_enabled to tick_accounting_disabled, because it
can mean either that vtime accounting is enabled, or that the system
is doing tick based sampling from a housekeeping CPU for nohz_full CPUs.

Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/context_tracking.h | 4 ++--
include/linux/vtime.h | 17 ++++++++++-------
kernel/sched/cputime.c | 2 +-
kernel/time/tick-sched.c | 2 +-
4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index dc3b169b2b70..d7ee7eb9e0bc 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -90,7 +90,7 @@ static inline void context_tracking_init(void) { }
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static inline void guest_enter(void)
{
- if (vtime_accounting_enabled())
+ if (tick_accounting_disabled())
vtime_guest_enter(current);
else
current->flags |= PF_VCPU;
@@ -104,7 +104,7 @@ static inline void guest_exit(void)
if (context_tracking_is_enabled())
__context_tracking_exit(CONTEXT_GUEST);

- if (vtime_accounting_enabled())
+ if (tick_accounting_disabled())
vtime_guest_exit(current);
else
current->flags &= ~PF_VCPU;
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index c5165fd256f9..4f5c1a3712e7 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -10,14 +10,17 @@
struct task_struct;

/*
- * vtime_accounting_enabled() definitions/declarations
+ * tick_accounting_disabled() definitions/declarations
+ *
+ * This indicates that either vtime accounting is used, or that tick
+ * based sampling is done from a housekeeping CPU for nohz_full CPUs.
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-static inline bool vtime_accounting_enabled(void) { return true; }
+static inline bool tick_accounting_disabled(void) { return true; }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static inline bool vtime_accounting_enabled(void)
+static inline bool tick_accounting_disabled(void)
{
if (context_tracking_is_enabled()) {
if (context_tracking_cpu_is_enabled())
@@ -29,7 +32,7 @@ static inline bool vtime_accounting_enabled(void)
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-static inline bool vtime_accounting_enabled(void) { return false; }
+static inline bool tick_accounting_disabled(void) { return false; }
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */


@@ -44,7 +47,7 @@ extern void vtime_task_switch(struct task_struct *prev);
extern void vtime_common_task_switch(struct task_struct *prev);
static inline void vtime_task_switch(struct task_struct *prev)
{
- if (vtime_accounting_enabled())
+ if (tick_accounting_disabled())
vtime_common_task_switch(prev);
}
#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
@@ -59,7 +62,7 @@ extern void vtime_account_irq_enter(struct task_struct *tsk);
extern void vtime_common_account_irq_enter(struct task_struct *tsk);
static inline void vtime_account_irq_enter(struct task_struct *tsk)
{
- if (vtime_accounting_enabled())
+ if (tick_accounting_disabled())
vtime_common_account_irq_enter(tsk);
}
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -78,7 +81,7 @@ extern void vtime_gen_account_irq_exit(struct task_struct *tsk);

static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
- if (vtime_accounting_enabled())
+ if (tick_accounting_disabled())
vtime_gen_account_irq_exit(tsk);
}

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 97077c282626..84b2d24a2238 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -471,7 +471,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
* should do work on this invocation, or whether time keeping for
* this CPU is done in some other way.
*/
- if (vtime_accounting_enabled())
+ if (tick_accounting_disabled())
return;

if (sched_clock_irqtime) {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 914259128145..3bb5a7accc9f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -924,7 +924,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;

- if (vtime_accounting_enabled())
+ if (tick_accounting_disabled())
return;
/*
* We stopped the tick in idle. Update process times would miss the
--
2.1.0

2015-06-24 14:58:21

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 03/11] time,nohz: add cpu parameter to irqtime_account_process_tick

From: Rik van Riel <[email protected]>

Add a cpu parameter to irqtime_account_process_tick, to specify what
cpu to run the statistics for.

In order for this to actually work on a different cpu, all the functions
called by irqtime_account_process_tick need to be able to handle working
for another CPU.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/cputime.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 84b2d24a2238..7df761cd6dfc 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -336,12 +336,14 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* p->stime and friends are only updated on system time and not on irq
* softirq as those do not count in task exec_runtime any more.
*/
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static void irqtime_account_process_tick(struct task_struct *p, int cpu,
+ int user_tick,
struct rq *rq, int ticks)
{
cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ struct kernel_cpustat *kstat = &kcpustat_cpu(cpu);
u64 cputime = (__force u64) cputime_one_jiffy;
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 *cpustat = kstat->cpustat;

if (steal_account_process_tick())
return;
@@ -374,12 +376,14 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
static void irqtime_account_idle_ticks(int ticks)
{
struct rq *rq = this_rq();
+ int cpu = smp_processor_id();

- irqtime_account_process_tick(current, 0, rq, ticks);
+ irqtime_account_process_tick(current, cpu, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
-static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static inline void irqtime_account_process_tick(struct task_struct *p, int cpu,
+ int user_tick,
struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

@@ -475,7 +479,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;

if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq, 1);
+ irqtime_account_process_tick(p, cpu, user_tick, rq, 1);
return;
}

--
2.1.0

2015-06-24 14:59:52

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 04/11] time,nohz: add cpu parameter to steal_account_process_tick

From: Rik van Riel <[email protected]>

Add a cpu parameter to steal_account_process_tick, so it can
be used to do CPU time accounting for another CPU.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/cputime.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7df761cd6dfc..9717d86cf2ab 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -254,15 +254,15 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}

-static __always_inline bool steal_account_process_tick(void)
+static __always_inline bool steal_account_process_tick(int cpu)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;
cputime_t steal_ct;

- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
+ steal = paravirt_steal_clock(cpu);
+ steal -= cpu_rq(cpu)->prev_steal_time;

/*
* cputime_t may be less precise than nsecs (eg: if it's
@@ -270,7 +270,7 @@ static __always_inline bool steal_account_process_tick(void)
* granularity and account the rest on the next rounds.
*/
steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+ cpu_rq(cpu)->prev_steal_time += cputime_to_nsecs(steal_ct);

account_steal_time(steal_ct);
return steal_ct;
@@ -345,7 +345,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int cpu,
u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kstat->cpustat;

- if (steal_account_process_tick())
+ if (steal_account_process_tick(cpu))
return;

cputime *= ticks;
@@ -483,7 +483,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}

- if (steal_account_process_tick())
+ if (steal_account_process_tick(cpu))
return;

if (user_tick)
--
2.1.0

2015-06-24 15:00:25

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 05/11] time,nohz: add cpu parameter to account_steal_time

From: Rik van Riel <[email protected]>

Simple transformation to allow tick based sampling from a remote
cpu. Additional changes may be needed to actually acquire the
steal time info for remote cpus from the host/hypervisor.

Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/kernel_stat.h | 2 +-
kernel/sched/cputime.c | 9 +++++----
2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 25a822f6f000..4490aef2f149 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -80,7 +80,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)

extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
-extern void account_steal_time(cputime_t);
+extern void account_steal_time(int cpu, cputime_t);
extern void account_idle_time(cputime_t);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9717d86cf2ab..b684c48ad954 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -232,9 +232,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
* Account for involuntary wait time.
* @cputime: the cpu time spent in involuntary wait
*/
-void account_steal_time(cputime_t cputime)
+void account_steal_time(int cpu, cputime_t cputime)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ struct kernel_cpustat *kstat = &kcpustat_cpu(cpu);
+ u64 *cpustat = kstat->cpustat;

cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}
@@ -272,7 +273,7 @@ static __always_inline bool steal_account_process_tick(int cpu)
steal_ct = nsecs_to_cputime(steal);
cpu_rq(cpu)->prev_steal_time += cputime_to_nsecs(steal_ct);

- account_steal_time(steal_ct);
+ account_steal_time(cpu, steal_ct);
return steal_ct;
}
#endif
@@ -502,7 +503,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
*/
void account_steal_ticks(unsigned long ticks)
{
- account_steal_time(jiffies_to_cputime(ticks));
+ account_steal_time(smp_processor_id(), jiffies_to_cputime(ticks));
}

/*
--
2.1.0

2015-06-24 15:00:05

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 06/11] time,nohz: add cpu parameter to account_idle_time

From: Rik van Riel <[email protected]>

Simple transformation to allow account_idle_time to account the
idle time for another CPU.

Signed-off-by: Rik van Riel <[email protected]>
---
arch/ia64/kernel/time.c | 2 +-
arch/powerpc/kernel/time.c | 2 +-
arch/s390/kernel/idle.c | 2 +-
include/linux/kernel_stat.h | 2 +-
kernel/sched/cputime.c | 15 ++++++++-------
5 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 9a0104a38cd3..61928b01d548 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -140,7 +140,7 @@ EXPORT_SYMBOL_GPL(vtime_account_system);

void vtime_account_idle(struct task_struct *tsk)
{
- account_idle_time(vtime_delta(tsk));
+ account_idle_time(task_cpu(tsk), vtime_delta(tsk));
}

#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 56f44848b044..f7c4cfdf0157 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -356,7 +356,7 @@ void vtime_account_idle(struct task_struct *tsk)
u64 delta, sys_scaled, stolen;

delta = vtime_delta(tsk, &sys_scaled, &stolen);
- account_idle_time(delta + stolen);
+ account_idle_time(task_cpu(tsk), delta + stolen);
}

/*
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 7a55c29b0b33..fc3945e3dc18 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -43,7 +43,7 @@ void enabled_wait(void)
idle->clock_idle_enter = idle->clock_idle_exit = 0ULL;
idle->idle_time += idle_time;
idle->idle_count++;
- account_idle_time(idle_time);
+ account_idle_time(smp_processor_id(), idle_time);
write_seqcount_end(&idle->seqcount);
}
NOKPROBE_SYMBOL(enabled_wait);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4490aef2f149..0d6e07406fd0 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,7 +81,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
extern void account_steal_time(int cpu, cputime_t);
-extern void account_idle_time(cputime_t);
+extern void account_idle_time(int cpu, cputime_t);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline void account_process_tick(struct task_struct *tsk, int user)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b684c48ad954..593f97b0fe3c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -244,10 +244,11 @@ void account_steal_time(int cpu, cputime_t cputime)
* Account for idle time.
* @cputime: the cpu time spent in idle wait
*/
-void account_idle_time(cputime_t cputime)
+void account_idle_time(int cpu, cputime_t cputime)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
+ struct kernel_cpustat *kstat = &kcpustat_cpu(cpu);
+ u64 *cpustat = kstat->cpustat;
+ struct rq *rq = cpu_rq(cpu);

if (atomic_read(&rq->nr_iowait) > 0)
cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
@@ -366,7 +367,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int cpu,
} else if (user_tick) {
account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime);
+ account_idle_time(cpu, cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime, scaled);
} else {
@@ -493,7 +494,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
one_jiffy_scaled);
else
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cpu, cputime_one_jiffy);
}

/*
@@ -518,7 +519,7 @@ void account_idle_ticks(unsigned long ticks)
return;
}

- account_idle_time(jiffies_to_cputime(ticks));
+ account_idle_time(smp_processor_id(), jiffies_to_cputime(ticks));
}

/*
@@ -748,7 +749,7 @@ void vtime_account_idle(struct task_struct *tsk)
{
cputime_t delta_cpu = get_vtime_delta(tsk);

- account_idle_time(delta_cpu);
+ account_idle_time(task_cpu(tsk), delta_cpu);
}

void arch_vtime_task_switch(struct task_struct *prev)
--
2.1.0

2015-06-24 15:00:18

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 07/11] nohz,timer: designate timer housekeeping cpu

From: Rik van Riel <[email protected]>

The timer housekeeping CPU can do tick based sampling for remote
CPUs. For now this is the first CPU in the housekeeping_mask.

Eventually we could move to having one timer housekeeping cpu per
socket, if needed.

Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/tick.h | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index f8492da57ad3..2fb51030587b 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -158,6 +158,15 @@ static inline bool is_housekeeping_cpu(int cpu)
return true;
}

+static inline bool is_timer_housekeeping_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_running)
+ return (cpumask_first(housekeeping_mask) == cpu);
+#endif
+ return false;
+}
+
static inline void housekeeping_affine(struct task_struct *t)
{
#ifdef CONFIG_NO_HZ_FULL
--
2.1.0

2015-06-24 14:58:16

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 08/11] nohz,timer: have housekeeper call account_process_tick for nohz cpus

From: Rik van Riel <[email protected]>

Have the housekeeper CPU call account_process_tick to do tick based
accounting for remote nohz_full CPUs.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/time/timer.c | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2ece3aa5069c..6adebb373317 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -42,6 +42,7 @@
#include <linux/sched/sysctl.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include "../sched/sched.h"

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -1382,6 +1383,29 @@ unsigned long get_next_timer_interrupt(unsigned long now)
}
#endif

+#ifdef CONFIG_NO_HZ_FULL
+static void account_remote_process_ticks(void)
+{
+ int cpu;
+
+ /*
+ * The current task on another CPU can get rescheduled while
+ * we are updating the statistics. The rcu read lock ensures
+ * the task does not get freed, so at worst the statistics will
+ * be off a little bit, which is expected with tick based sampling.
+ */
+ rcu_read_lock();
+ for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) {
+ struct task_struct *p = cpu_curr(cpu);
+ int user_tick = (per_cpu(context_tracking.state, cpu) ==
+ CONTEXT_USER);
+
+ account_process_tick(p, user_tick);
+ }
+ rcu_read_unlock();
+}
+#endif
+
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1392,6 +1416,10 @@ void update_process_times(int user_tick)

/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
+#ifdef CONFIG_NO_HZ_FULL
+ if (is_timer_housekeeping_cpu(smp_processor_id()))
+ account_remote_process_ticks();
+#endif
run_local_timers();
rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
--
2.1.0

2015-06-24 14:59:30

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 09/11] nohz,time: add tick_accounting_remote macro

From: Rik van Riel <[email protected]>

With the introduction of remote tick based sampling, we now have
three ways of gathering time statistics:
- local tick based sampling
- vtime accounting (used natively on some architectures)
- remote tick based sampling

On a system with remote tick based sampling, the housekeeping
CPUs will still do local tick based sampling. This results in
needing two macros for switching the timekeeping code.

Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/vtime.h | 3 +++
1 file changed, 3 insertions(+)

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 4f5c1a3712e7..a587058c7967 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -17,6 +17,7 @@ struct task_struct;
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline bool tick_accounting_disabled(void) { return true; }
+static inline bool tick_accounting_remote(void) { return false; }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -29,10 +30,12 @@ static inline bool tick_accounting_disabled(void)

return false;
}
+static inline bool tick_accounting_remote(void) { return true; }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

#ifndef CONFIG_VIRT_CPU_ACCOUNTING
static inline bool tick_accounting_disabled(void) { return false; }
+static inline bool tick_accounting_remote(void) { return false; }
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */


--
2.1.0

2015-06-24 15:00:08

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 10/11] nohz,kvm,time: skip vtime accounting at kernel entry & exit

From: Rik van Riel <[email protected]>

When timer statistics are sampled from a remote CPU, vtime calculations
at the kernel/user and kernel/guest boundary are no longer necessary.
Skip them.

Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/context_tracking.h | 4 ++--
kernel/context_tracking.c | 6 ++++--
2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index d7ee7eb9e0bc..e3e7c674543f 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -90,7 +90,7 @@ static inline void context_tracking_init(void) { }
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static inline void guest_enter(void)
{
- if (tick_accounting_disabled())
+ if (tick_accounting_disabled() && !tick_accounting_remote())
vtime_guest_enter(current);
else
current->flags |= PF_VCPU;
@@ -104,7 +104,7 @@ static inline void guest_exit(void)
if (context_tracking_is_enabled())
__context_tracking_exit(CONTEXT_GUEST);

- if (tick_accounting_disabled())
+ if (tick_accounting_disabled() && !tick_accounting_remote())
vtime_guest_exit(current);
else
current->flags &= ~PF_VCPU;
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6da4205c3184..a58cbed13ebd 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -63,7 +63,8 @@ void __context_tracking_enter(enum ctx_state state)
*/
if (state == CONTEXT_USER) {
trace_user_enter(0);
- vtime_user_enter(current);
+ if (!tick_accounting_remote())
+ vtime_user_enter(current);
}
rcu_user_enter();
}
@@ -135,7 +136,8 @@ void __context_tracking_exit(enum ctx_state state)
*/
rcu_user_exit();
if (state == CONTEXT_USER) {
- vtime_user_exit(current);
+ if (!tick_accounting_remote())
+ vtime_user_exit(current);
trace_user_exit(0);
}
}
--
2.1.0

2015-06-24 14:58:32

by Rik van Riel

[permalink] [raw]
Subject: [RFC PATCH 11/11] nohz,kvm,time: teach account_process_tick about guest time

From: Rik van Riel <[email protected]>

When tick based accounting is run from a remote CPU, it is actually
possible to encounter a task with PF_VCPU set. Make sure to account
those as guest time.

Signed-off-by: Rik van Riel <[email protected]>
---
kernel/sched/cputime.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 593f97b0fe3c..6295679fe5f5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -488,7 +488,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (steal_account_process_tick(cpu))
return;

- if (user_tick)
+ if (p->flags & PF_VCPU)
+ account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ else if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
--
2.1.0

2015-06-24 15:07:12

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH 11/11] nohz,kvm,time: teach account_process_tick about guest time

On Wed, Jun 24, 2015 at 7:58 AM, <[email protected]> wrote:
> From: Rik van Riel <[email protected]>
>
> When tick based accounting is run from a remote CPU, it is actually
> possible to encounter a task with PF_VCPU set. Make sure to account
> those as guest time.

Why do we have PF_VCPU and CONTEXT_GUEST? What's the difference
between them (other than the fact that one is per-task and one is
per-cpu)? It would be a bit easier to understand if there were fewer
of these things.

If the issue is that remote sampling would otherwise have a race that
could account guest time to the wrong task, then maybe PF_VCPU makes
sense. Hmm.

--Andy

2015-06-24 15:08:32

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH 10/11] nohz,kvm,time: skip vtime accounting at kernel entry & exit

On Wed, Jun 24, 2015 at 7:57 AM, <[email protected]> wrote:
> From: Rik van Riel <[email protected]>
>
> When timer statistics are sampled from a remote CPU, vtime calculations
> at the kernel/user and kernel/guest boundary are no longer necessary.
> Skip them.

I plan to do my very best to clean up x86's calls into this code, but
I will try to avoid touching the implementations. I'm happy to leave
an eventual cleanup to you :)

--Andy

2015-06-30 14:24:05

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [RFC PATCH 08/11] nohz,timer: have housekeeper call account_process_tick for nohz cpus

2015-06-24 17:57 GMT+03:00 <[email protected]>:
> From: Rik van Riel <[email protected]>
>
> Have the housekeeper CPU call account_process_tick to do tick based
> accounting for remote nohz_full CPUs.
>
> Signed-off-by: Rik van Riel <[email protected]>
> ---
> kernel/time/timer.c | 28 ++++++++++++++++++++++++++++
> 1 file changed, 28 insertions(+)
>
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index 2ece3aa5069c..6adebb373317 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -42,6 +42,7 @@
> #include <linux/sched/sysctl.h>
> #include <linux/slab.h>
> #include <linux/compat.h>
> +#include "../sched/sched.h"
>
> #include <asm/uaccess.h>
> #include <asm/unistd.h>
> @@ -1382,6 +1383,29 @@ unsigned long get_next_timer_interrupt(unsigned long now)
> }
> #endif
>
> +#ifdef CONFIG_NO_HZ_FULL
> +static void account_remote_process_ticks(void)
> +{
> + int cpu;
> +
> + /*
> + * The current task on another CPU can get rescheduled while
> + * we are updating the statistics. The rcu read lock ensures
> + * the task does not get freed, so at worst the statistics will
> + * be off a little bit, which is expected with tick based sampling.
> + */
> + rcu_read_lock();
> + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) {
> + struct task_struct *p = cpu_curr(cpu);
> + int user_tick = (per_cpu(context_tracking.state, cpu) ==
> + CONTEXT_USER);
> +
> + account_process_tick(p, user_tick);
> + }
> + rcu_read_unlock();
> +}
> +#endif
> +
> /*
> * Called from the timer interrupt handler to charge one tick to the current
> * process. user_tick is 1 if the tick is user time, 0 for system.
> @@ -1392,6 +1416,10 @@ void update_process_times(int user_tick)
>
> /* Note: this timer irq context must be accounted for as well. */
> account_process_tick(p, user_tick);
> +#ifdef CONFIG_NO_HZ_FULL
> + if (is_timer_housekeeping_cpu(smp_processor_id()))
> + account_remote_process_ticks();
> +#endif

I like the idea overall. I just think that it should be done from a
process rather than an IRQ because the amount of CPUs to walk can be
high.

Perhaps a kthread? Or workqueue since we can now affine the unbound
ones to housekeepers.

Thanks.

> run_local_timers();
> rcu_check_callbacks(user_tick);
> #ifdef CONFIG_IRQ_WORK
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/