From: Denys Vlasenko
To: linux-kernel@vger.kernel.org
Cc: Denys Vlasenko, Frederic Weisbecker, Hidetoshi Seto,
    Fernando Luis Vazquez Cao, Tetsuo Handa, Thomas Gleixner,
    Ingo Molnar, Peter Zijlstra, Andrew Morton, Arjan van de Ven,
    Oleg Nesterov
Subject: [PATCH 2/3] nohz: Synchronize sleep time stats with memory barriers
Date: Wed, 23 Apr 2014 21:00:35 +0200
Message-Id: <1398279636-21503-2-git-send-email-dvlasenk@redhat.com>
In-Reply-To: <1398279636-21503-1-git-send-email-dvlasenk@redhat.com>
References: <1398279636-21503-1-git-send-email-dvlasenk@redhat.com>

When a call site uses get_cpu_*_time_us() to read a sleeptime stat,
it deduces the total sleeptime by adding the pending time to the last
sleeptime snapshot if the target CPU is idle. But this only works if
idle_sleeptime, idle_entrytime and idle_active are read and updated
in a disciplined order.

This patch changes updaters to modify idle_entrytime,
{idle,iowait}_sleeptime, and idle_active exactly in this order, with
write barriers on SMP to ensure other CPUs see them in this order too.
Readers are changed to read them in the opposite order, with read
barriers. When readers detect a race by seeing a cleared
idle_entrytime, they retry the reads.

The "iowait-ness" of every idle period is decided at the moment it
starts: if this CPU's run queue had tasks waiting on I/O, then this
idle period's duration will be added to iowait_sleeptime.

This, along with proper SMP synchronization, fixes the bug where
iowait counts could go backwards.

Signed-off-by: Denys Vlasenko
Cc: Frederic Weisbecker
Cc: Hidetoshi Seto
Cc: Fernando Luis Vazquez Cao
Cc: Tetsuo Handa
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Andrew Morton
Cc: Arjan van de Ven
Cc: Oleg Nesterov
---
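Not part of the commit: for reviewers who want to experiment with the
ordering scheme, below is a stand-alone user-space sketch of the
update/read protocol the patch implements. It is an illustration under
stated assumptions, not kernel code: plain uint64_t stands in for
ktime_t, GCC's __sync_synchronize() full barrier stands in for
smp_wmb()/smp_rmb(), and the struct and function names (fake_stats,
stop_idle, read_idle) are invented for this demo.

#include <stdint.h>
#include <stdio.h>

struct fake_stats {
	volatile uint64_t idle_entrytime;  /* 0 means "update in progress" */
	volatile uint64_t idle_sleeptime;
	volatile int idle_active;          /* 0 = running, 1 = idle, 2 = iowait-idle */
};

static struct fake_stats st;

/* Writer side: same field order as tick_nohz_stop_idle() in the patch. */
static void stop_idle(uint64_t now)
{
	uint64_t delta = now - st.idle_entrytime;

	st.idle_entrytime = 0;       /* 1. mark the counters as in flux */
	__sync_synchronize();        /* smp_wmb() in the patch */
	st.idle_sleeptime += delta;  /* 2. fold the pending period in */
	__sync_synchronize();        /* smp_wmb() in the patch */
	st.idle_active = 0;          /* 3. publish "no longer idle" */
}

/* Reader side: the opposite order, as in get_cpu_idle_time_us(). */
static uint64_t read_idle(uint64_t now)
{
	int active;
	uint64_t count, start;

again:
	active = st.idle_active;     /* 1. idle_active first */
	__sync_synchronize();        /* smp_rmb() in the patch */
	count = st.idle_sleeptime;   /* 2. then the accumulated count */
	if (active == 1) {
		__sync_synchronize();        /* smp_rmb() in the patch */
		start = st.idle_entrytime;   /* 3. the entry time last */
		if (start == 0)              /* writer raced with us: count may be stale */
			goto again;
		count += now - start;        /* add the still-pending idle period */
	}
	return count;
}

int main(void)
{
	/* Sequential demo; in the kernel the two sides run on different CPUs. */
	st.idle_entrytime = 100;
	st.idle_active = 1;
	printf("while idle: %llu\n", (unsigned long long)read_idle(150));   /* 50 */
	stop_idle(160);
	printf("after wakeup: %llu\n", (unsigned long long)read_idle(170)); /* 60 */
	return 0;
}

The invariant is the one the readers in the patch rely on:
idle_entrytime == 0 marks the window in which a sleeptime counter is
being folded up, so a reader that saw idle_active set but then sees a
zero entrytime knows the count it fetched may be stale and retries.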
 kernel/time/tick-sched.c | 83 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 17 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 73ced0c4..a99770e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -409,10 +409,15 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 
 	/* Updates the per cpu time idle statistics counters */
 	delta = ktime_sub(now, ts->idle_entrytime);
-	if (nr_iowait_cpu(smp_processor_id()) > 0)
+	ts->idle_entrytime.tv64 = 0;
+	smp_wmb();
+
+	if (ts->idle_active == 2)
 		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 	else
 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+
+	smp_wmb();
 	ts->idle_active = 0;
 
 	sched_clock_idle_wakeup_event(0);
@@ -423,7 +428,8 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 	ktime_t now = ktime_get();
 
 	ts->idle_entrytime = now;
-	ts->idle_active = 1;
+	smp_wmb();
+	ts->idle_active = nr_iowait_cpu(smp_processor_id()) ? 2 : 1;
 	sched_clock_idle_sleep_event();
 	return now;
 }
@@ -444,25 +450,46 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
  */
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t now, idle;
+	struct tick_sched *ts;
+	ktime_t now, count;
+	int active;
 
 	if (!tick_nohz_active)
 		return -1;
 
+	ts = &per_cpu(tick_cpu_sched, cpu);
+
 	now = ktime_get();
 	if (last_update_time)
 		*last_update_time = ktime_to_us(now);
 
-	if (ts->idle_active && !nr_iowait_cpu(cpu)) {
-		ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-		idle = ktime_add(ts->idle_sleeptime, delta);
+ again:
+	active = ACCESS_ONCE(ts->idle_active);
+	smp_rmb();
+	count = ACCESS_ONCE(ts->idle_sleeptime);
+	if (active == 1) {
+		ktime_t delta, start;
+
+		smp_rmb();
+		start = ACCESS_ONCE(ts->idle_entrytime);
+		if (start.tv64 == 0) {
+			/*
+			 * Other CPU is updating the count.
+			 * We don't know whether fetched count is valid.
+			 */
+			goto again;
+		}
+		delta = ktime_sub(now, start);
+		count = ktime_add(count, delta);
 	} else {
-		idle = ts->idle_sleeptime;
+		/*
+		 * Possible concurrent tick_nohz_stop_idle() already
+		 * cleared idle_active. We fetched count *after*
+		 * we fetched idle_active, so count must be valid.
+		 */
 	}
 
-	return ktime_to_us(idle);
-
+	return ktime_to_us(count);
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
@@ -482,24 +509,46 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
  */
 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t now, iowait;
+	struct tick_sched *ts;
+	ktime_t now, count;
+	int active;
 
 	if (!tick_nohz_active)
 		return -1;
 
+	ts = &per_cpu(tick_cpu_sched, cpu);
+
 	now = ktime_get();
 	if (last_update_time)
 		*last_update_time = ktime_to_us(now);
 
-	if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
-		ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-		iowait = ktime_add(ts->iowait_sleeptime, delta);
+ again:
+	active = ACCESS_ONCE(ts->idle_active);
+	smp_rmb();
+	count = ACCESS_ONCE(ts->iowait_sleeptime);
+	if (active == 2) {
+		ktime_t delta, start;
+
+		smp_rmb();
+		start = ACCESS_ONCE(ts->idle_entrytime);
+		if (start.tv64 == 0) {
+			/*
+			 * Other CPU is updating the count.
+			 * We don't know whether fetched count is valid.
+			 */
+			goto again;
+		}
+		delta = ktime_sub(now, start);
+		count = ktime_add(count, delta);
 	} else {
-		iowait = ts->iowait_sleeptime;
+		/*
+		 * Possible concurrent tick_nohz_stop_idle() already
+		 * cleared idle_active. We fetched count *after*
+		 * we fetched idle_active, so count must be valid.
+		 */
 	}
 
-	return ktime_to_us(iowait);
+	return ktime_to_us(count);
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-- 
1.8.1.4