2013-08-09 00:54:45

by Frederic Weisbecker

Subject: [PATCH 0/4] nohz: Fix racy sleeptime stats

Hi,

Here is a proposal to fix the bug described here:

http://lkml.kernel.org/r/1363660703.4993.3.camel@nexus
http://lkml.kernel.org/r/[email protected]

You can try the following branch:

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
timers/nohz-sleeptime-fix

Thanks,
Frederic
---

Frederic Weisbecker (4):
nohz: Only update sleeptime stats locally
nohz: Synchronize sleep time stats with seqlock
nohz: Consolidate sleep time stats read code
nohz: Convert a few places to use local per cpu accesses


include/linux/tick.h | 2 +
kernel/time/tick-sched.c | 124 +++++++++++++++++++--------------------------
2 files changed, 54 insertions(+), 72 deletions(-)


2013-08-09 00:54:50

by Frederic Weisbecker

Subject: [PATCH 2/4] nohz: Synchronize sleep time stats with seqlock

When a call site uses get_cpu_*_time_us() to read a sleeptime
stat, it computes the total sleeptime by adding the pending idle time
to the last sleeptime snapshot if the target CPU is idle.

Namely this sums up to:

	sleeptime = ts($CPU)->idle_sleeptime;
	if (ts($CPU)->idle_active)
		sleeptime += NOW() - ts($CPU)->idle_entrytime

But this only works if idle_sleeptime, idle_entrytime and idle_active are
read and updated in a disciplined order.

Let's consider the following scenario:

         CPU 0                                          CPU 1

(seq 1)  ts(CPU 0)->idle_active = 1
         ts(CPU 0)->idle_entrytime = NOW()

(seq 2)  sleeptime = NOW() - ts(CPU 0)->idle_entrytime
         ts(CPU 0)->idle_sleeptime += sleeptime         sleeptime = ts(CPU 0)->idle_sleeptime;
                                                        if (ts(CPU 0)->idle_active)
         ts(CPU 0)->idle_entrytime = NOW()                  sleeptime += NOW() - ts(CPU 0)->idle_entrytime

The resulting value of sleeptime on CPU 1 can vary depending on the
ordering scenario:

* If it sees the value of idle_entrytime after seq 1 and the value of idle_sleeptime
  after seq 2, the resulting sleeptime is buggy because it accounts for the delta twice,
  so it is too high.

* If it sees the value of idle_entrytime after seq 2 and the value of idle_sleeptime
  after seq 1, the resulting sleeptime is buggy because it misses the delta, so it
  is too low.

* If it sees the values of idle_entrytime and idle_sleeptime both as they are after
  seq 1, or both as they are after seq 2, the value is correct.

More tricky scenarios can also happen if the idle_active value is read from an older sequence.

Hence we must honour the following constraints:

- idle_sleeptime, idle_active and idle_entrytime must be updated and read
  under correctly enforced SMP ordering

- The three values as read by CPU 1 must belong to the same update
  sequence from CPU 0. The update sequences must be delimited such that the
  three values left behind by a completed sequence produce a coherent result
  when read together from CPU 1.

- Readers must be prevented from fetching middle-of-sequence values.

The ideal solution to implement this synchronization is a seqcount. Let's
use one here around these three values to enforce sequence synchronization between
updates and reads.
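
For illustration only, here is a minimal sketch of the seqcount pattern this
patch applies. It uses the same fields as the hunks below, but the helper
names sleeptime_update() and sleeptime_read() are hypothetical, and the
fragment is schematic kernel-style code rather than a standalone build:

	#include <linux/seqlock.h>
	#include <linux/ktime.h>

	/* Writer side: idle exit, always runs on the local CPU */
	static void sleeptime_update(struct tick_sched *ts, ktime_t now, ktime_t delta)
	{
		write_seqcount_begin(&ts->sleeptime_seq);
		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
		ts->idle_entrytime = now;
		ts->idle_active = 0;
		write_seqcount_end(&ts->sleeptime_seq);
	}

	/* Reader side: possibly remote, retries if a writer raced with it */
	static ktime_t sleeptime_read(struct tick_sched *ts, ktime_t now)
	{
		unsigned int seq;
		ktime_t sleeptime;

		do {
			seq = read_seqcount_begin(&ts->sleeptime_seq);
			sleeptime = ts->idle_sleeptime;
			if (ts->idle_active)
				sleeptime = ktime_add(sleeptime,
						      ktime_sub(now, ts->idle_entrytime));
		} while (read_seqcount_retry(&ts->sleeptime_seq, seq));

		return sleeptime;
	}

A writer that updates the three fields between the reader's two seqcount
samples makes the retry loop discard the torn snapshot and read again, which
rules out the mixed-sequence cases described above.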

This fixes a reported bug where non-monotonic sleeptime stats are returned by /proc/stat
when it is frequently read, as well as potential cpufreq governor bugs.
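
As a side note, the symptom can be observed from userspace with something like
the following hypothetical reproducer (not part of this series): it polls the
idle field of the cpu0 line in /proc/stat and reports any decrease, which
should never happen for a cumulative counter.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long user, nice, system, idle, iowait;
		unsigned long long prev_idle = 0;
		char line[256];

		for (;;) {
			FILE *f = fopen("/proc/stat", "r");

			if (!f)
				return 1;
			while (fgets(line, sizeof(line), f)) {
				/* cpu0: user nice system idle iowait ... */
				if (sscanf(line, "cpu0 %llu %llu %llu %llu %llu",
					   &user, &nice, &system, &idle, &iowait) == 5) {
					if (prev_idle && idle < prev_idle)
						printf("idle went backwards: %llu -> %llu\n",
						       prev_idle, idle);
					prev_idle = idle;
					break;
				}
			}
			fclose(f);
		}
		return 0;
	}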

Reported-by: Fernando Luis Vazquez Cao <[email protected]>
Reported-by: Tetsuo Handa <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Fernando Luis Vazquez Cao <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Arjan van de Ven <[email protected]>
---
include/linux/tick.h | 2 ++
kernel/time/tick-sched.c | 37 +++++++++++++++++++++++++------------
2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 62bd8b7..49f9720 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -10,6 +10,7 @@
#include <linux/irqflags.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
+#include <linux/seqlock.h>

#ifdef CONFIG_GENERIC_CLOCKEVENTS

@@ -70,6 +71,7 @@ struct tick_sched {
unsigned long next_jiffies;
ktime_t idle_expires;
int do_timer_last;
+ seqcount_t sleeptime_seq;
};

extern void __init tick_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ede0405..f7fc27b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -414,6 +414,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
ktime_t delta;

/* Updates the per cpu time idle statistics counters */
+ write_seqcount_begin(&ts->sleeptime_seq);
delta = ktime_sub(now, ts->idle_entrytime);
if (nr_iowait_cpu(cpu) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
@@ -421,6 +422,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
ts->idle_active = 0;
+ write_seqcount_end(&ts->sleeptime_seq);

sched_clock_idle_wakeup_event(0);
}
@@ -429,8 +431,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
{
ktime_t now = ktime_get();

+ write_seqcount_begin(&ts->sleeptime_seq);
ts->idle_entrytime = now;
ts->idle_active = 1;
+ write_seqcount_end(&ts->sleeptime_seq);
+
sched_clock_idle_sleep_event();
return now;
}
@@ -453,6 +458,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, idle;
+ unsigned int seq;

if (!tick_nohz_enabled)
return -1;
@@ -461,12 +467,15 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
if (last_update_time)
*last_update_time = ktime_to_us(now);

- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
+ do {
+ seq = read_seqcount_begin(&ts->sleeptime_seq);
+ if (ts->idle_active && !nr_iowait_cpu(cpu)) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+ idle = ktime_add(ts->idle_sleeptime, delta);
+ } else {
+ idle = ts->idle_sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->sleeptime_seq, seq));

return ktime_to_us(idle);

@@ -491,6 +500,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, iowait;
+ unsigned int seq;

if (!tick_nohz_enabled)
return -1;
@@ -499,12 +509,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
if (last_update_time)
*last_update_time = ktime_to_us(now);

- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
+ do {
+ seq = read_seqcount_begin(&ts->sleeptime_seq);
+ if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+ iowait = ktime_add(ts->iowait_sleeptime, delta);
+ } else {
+ iowait = ts->iowait_sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->sleeptime_seq, seq));

return ktime_to_us(iowait);
}
--
1.7.5.4

2013-08-09 00:54:55

by Frederic Weisbecker

Subject: [PATCH 4/4] nohz: Convert a few places to use local per cpu accesses

A few functions use remote per-CPU access APIs when they
deal with local values.

Just do the right conversion to improve performance, code
readability and debug checks.
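
For reference, here is a schematic kernel-style fragment (not a standalone
build) contrasting both access styles; the foo variable and do_work()
function are hypothetical:

	#include <linux/percpu.h>
	#include <linux/smp.h>

	static DEFINE_PER_CPU(int, foo);

	static void do_work(void)
	{
		/* Remote-style accessor: valid for any CPU, but more verbose
		 * and more expensive when we only ever touch the local copy. */
		int cpu = smp_processor_id();
		per_cpu(foo, cpu) = 1;

		/* Local accessor: shorter, can use cheaper addressing and
		 * allows the per-cpu debug checks to kick in. */
		__this_cpu_write(foo, 1);

		/* Local pointer to a whole per-cpu structure, as done here
		 * for tick_cpu_sched:
		 *
		 *	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
		 */
	}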

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Fernando Luis Vazquez Cao <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Arjan van de Ven <[email protected]>
---
kernel/time/tick-sched.c | 10 ++++------
1 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 017bec2..11d64e2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -395,11 +395,9 @@ __setup("nohz=", setup_tick_nohz);
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
- int cpu = smp_processor_id();
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
unsigned long flags;

- ts->idle_waketime = now;
+ __this_cpu_write(tick_cpu_sched.idle_waketime, now);

local_irq_save(flags);
tick_do_update_jiffies64(now);
@@ -410,7 +408,7 @@ static void tick_nohz_update_jiffies(ktime_t now)

static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t delta;

/* Updates the per cpu time idle statistics counters */
@@ -901,7 +899,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
void tick_nohz_idle_exit(void)
{
int cpu = smp_processor_id();
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now;

local_irq_disable();
@@ -1020,7 +1018,7 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)

static inline void tick_check_nohz(int cpu)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now;

if (!ts->idle_active && !ts->tick_stopped)
--
1.7.5.4

2013-08-09 00:54:47

by Frederic Weisbecker

Subject: [PATCH 1/4] nohz: Only update sleeptime stats locally

The idle and io sleeptime stats can be updated concurrently from callers
of get_cpu_idle_time_us(), get_cpu_iowait_time_us() and tick_nohz_stop_idle().

Updaters can easily race and mess up the internal data coherency, for example
when a governor calls a get_cpu_*_time_us() API and the target CPU exits idle at
the same time, because there is no locking whatsoever to protect against
this concurrency.

To fix this, let's only update the sleeptime stats locally when the CPU
exits idle. This is the only place where a real update is truly
needed. The callers of get_cpu_*_time_us() can simply add the pending
sleep time delta to the last sleeptime snapshot in order to get a coherent
result. There is no need for them to also update the stats.
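
Schematically, the division of labor after this patch looks like the
following sketch (kernel-style fragment mirroring the hunks below; the
helper names are hypothetical):

	/* Idle exit, local CPU only: the one place that writes the counters */
	static void sleeptime_account_idle_exit(struct tick_sched *ts, int cpu, ktime_t now)
	{
		ktime_t delta = ktime_sub(now, ts->idle_entrytime);

		if (nr_iowait_cpu(cpu) > 0)
			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
		else
			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
		ts->idle_entrytime = now;
		ts->idle_active = 0;
	}

	/* Readers, possibly remote: pure computation, no store to the counters */
	static u64 sleeptime_read_idle_us(struct tick_sched *ts, int cpu, ktime_t now)
	{
		ktime_t idle = ts->idle_sleeptime;

		if (ts->idle_active && !nr_iowait_cpu(cpu))
			idle = ktime_add(idle, ktime_sub(now, ts->idle_entrytime));

		return ktime_to_us(idle);
	}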

Reported-by: Fernando Luis Vazquez Cao <[email protected]>
Reported-by: Tetsuo Handa <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Fernando Luis Vazquez Cao <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Arjan van de Ven <[email protected]>
---
kernel/time/tick-sched.c | 65 +++++++++++++++------------------------------
1 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e77edc9..ede0405 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -408,33 +408,18 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog();
}

-/*
- * Updates the per cpu time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
-{
- ktime_t delta;
-
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(cpu) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- ts->idle_entrytime = now;
- }
-
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
-
-}
-
static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t delta;

- update_ts_time_stats(cpu, ts, now, NULL);
+ /* Updates the per cpu time idle statistics counters */
+ delta = ktime_sub(now, ts->idle_entrytime);
+ if (nr_iowait_cpu(cpu) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ ts->idle_entrytime = now;
ts->idle_active = 0;

sched_clock_idle_wakeup_event(0);
@@ -473,17 +458,14 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
return -1;

now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- idle = ts->idle_sleeptime;
- } else {
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);

- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
+ if (ts->idle_active && !nr_iowait_cpu(cpu)) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+ idle = ktime_add(ts->idle_sleeptime, delta);
+ } else {
+ idle = ts->idle_sleeptime;
}

return ktime_to_us(idle);
@@ -514,17 +496,14 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
return -1;

now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- iowait = ts->iowait_sleeptime;
- } else {
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);

- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
+ if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+ iowait = ktime_add(ts->iowait_sleeptime, delta);
+ } else {
+ iowait = ts->iowait_sleeptime;
}

return ktime_to_us(iowait);
--
1.7.5.4

2013-08-09 00:55:28

by Frederic Weisbecker

Subject: [PATCH 3/4] nohz: Consolidate sleep time stats read code

get_cpu_idle_time_us() and get_cpu_iowait_time_us() mostly share
the same code. Let's consolidate both implementations.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Fernando Luis Vazquez Cao <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Arjan van de Ven <[email protected]>
---
kernel/time/tick-sched.c | 76 ++++++++++++++++++++--------------------------
1 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f7fc27b..017bec2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -440,24 +440,10 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
return now;
}

-/**
- * get_cpu_idle_time_us - get the total idle time of a cpu
- * @cpu: CPU number to query
- * @last_update_time: variable to store update time in. Do not update
- * counters if NULL.
- *
- * Return the cummulative idle time (since boot) for a given
- * CPU, in microseconds.
- *
- * This time is measured via accounting rather than sampling,
- * and is as accurate as ktime_get() is.
- *
- * This function returns -1 if NOHZ is not enabled.
- */
-u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+static u64 get_cpu_sleep_time_us(int cpu, bool io, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, idle;
+ ktime_t now, sleep;
unsigned int seq;

if (!tick_nohz_enabled)
@@ -469,16 +455,41 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)

do {
seq = read_seqcount_begin(&ts->sleeptime_seq);
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
+ if (io)
+ sleep = ts->iowait_sleeptime;
+ else
+ sleep = ts->idle_sleeptime;
+
+ if (!ts->idle_active)
+ continue;
+
+ if ((io && nr_iowait_cpu(cpu)) || (!io && !nr_iowait_cpu(cpu))) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
+ sleep = ktime_add(sleep, delta);
}
} while (read_seqcount_retry(&ts->sleeptime_seq, seq));

- return ktime_to_us(idle);
+ return ktime_to_us(sleep);
+
+}

+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cummulative idle time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+ return get_cpu_sleep_time_us(cpu, false, last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

@@ -498,28 +509,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, iowait;
- unsigned int seq;
-
- if (!tick_nohz_enabled)
- return -1;
-
- now = ktime_get();
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
-
- do {
- seq = read_seqcount_begin(&ts->sleeptime_seq);
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
- } while (read_seqcount_retry(&ts->sleeptime_seq, seq));
-
- return ktime_to_us(iowait);
+ return get_cpu_sleep_time_us(cpu, true, last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

--
1.7.5.4