Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1753391AbbFLI5t (ORCPT );
	Fri, 12 Jun 2015 04:57:49 -0400
Received: from mail-la0-f54.google.com ([209.85.215.54]:33751 "EHLO
	mail-la0-f54.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751335AbbFLI5m (ORCPT );
	Fri, 12 Jun 2015 04:57:42 -0400
From: Fredrik Markstrom
To: mingo@redhat.com, peterz@infradead.org
Cc: linux-kernel@vger.kernel.org, Fredrik Markstrom
Subject: [PATCH 1/1] cputime: Make the reported utime+stime correspond to the
	actual runtime.
Date: Fri, 12 Jun 2015 10:55:16 +0200
Message-Id: <1434099316-29749-2-git-send-email-fredrik.markstrom@gmail.com>
X-Mailer: git-send-email 1.9.1
In-Reply-To: <1434099316-29749-1-git-send-email-fredrik.markstrom@gmail.com>
References: <1434099316-29749-1-git-send-email-fredrik.markstrom@gmail.com>
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 2998
Lines: 94

The scaling mechanism might sometimes cause top to report >100% (sometimes
>1000%) cpu usage for a single thread. This patch makes sure that
stime+utime corresponds to the actual runtime of the thread.

Signed-off-by: Fredrik Markstrom
---
 kernel/sched/cputime.c | 46 +++++++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ff..2d168c8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -554,22 +554,7 @@ drop_precision:
 	return (__force cputime_t) scaled;
 }
 
-/*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
- *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-	cputime_t old;
-
-	while (new > (old = READ_ONCE(*counter)))
-		cmpxchg_cputime(counter, old, new);
-}
+static DEFINE_SPINLOCK(prev_time_lock);
 
 /*
  * Adjust tick based cputime random precision against scheduler
@@ -590,17 +575,11 @@ static void cputime_adjust(struct task_cputime *curr,
 	 *
 	 * Fix this by scaling these tick based values against the total
 	 * runtime accounted by the CFS scheduler.
+	 * In addition make sure the reported stime+utime equals rtime
+	 * so that the total runtime reported is correct.
 	 */
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
-	/*
-	 * Update userspace visible utime/stime values only if actual execution
-	 * time is bigger than already exported. Note that can happen, that we
-	 * provided bigger values due to scaling inaccuracy on big numbers.
-	 */
-	if (prev->stime + prev->utime >= rtime)
-		goto out;
-
 	stime = curr->stime;
 	utime = curr->utime;
 
@@ -616,12 +595,25 @@ static void cputime_adjust(struct task_cputime *curr,
 		utime = rtime - stime;
 	}
 
-	cputime_advance(&prev->stime, stime);
-	cputime_advance(&prev->utime, utime);
+	spin_lock(&prev_time_lock);
+	if (stime < prev->stime) {
+		stime = prev->stime;
+		utime = rtime - stime;
+	} else if (utime < prev->utime) {
+		utime = prev->utime;
+		stime = rtime - utime;
+	}
+	WARN_ON(stime < prev->stime);
+	WARN_ON(utime < prev->utime);
+	WARN_ON(stime + utime != rtime);
 
-out:
+	if (prev->stime + prev->utime < rtime) {
+		prev->stime = stime;
+		prev->utime = utime;
+	}
 	*ut = prev->utime;
 	*st = prev->stime;
+	spin_unlock(&prev_time_lock);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-- 
1.9.1