Date: Mon, 29 Jun 2015 16:58:37 +0200
From: Peter Zijlstra <peterz@infradead.org>
To: Fredrik =?iso-8859-1?Q?Markstr=F6m?= <fredrik.markstrom@gmail.com>
Cc: mingo@redhat.com, linux-kernel@vger.kernel.org,
        Rik van Riel <riel@redhat.com>, jason.low2@hp.com, fweisbec@gmail.com
Subject: Re: [PATCH 1/1] cputime: Make the reported utime+stime correspond to
 the actual runtime.
Message-ID: <20150629145837.GE3644@twins.programming.kicks-ass.net>
References: <1434099316-29749-1-git-send-email-fredrik.markstrom@gmail.com>
 <1434099316-29749-2-git-send-email-fredrik.markstrom@gmail.com>
 <1434104217.1495.74.camel@twins>
 <20150612110158.GA18673@twins.programming.kicks-ass.net>
 <CAKdL+dQWGtzURUe+4cB4XtsoaEPh0p0O1ejzbgxN0AUsbvUi0g@mail.gmail.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=iso-8859-1
Content-Disposition: inline
Content-Transfer-Encoding: 8bit
In-Reply-To: <CAKdL+dQWGtzURUe+4cB4XtsoaEPh0p0O1ejzbgxN0AUsbvUi0g@mail.gmail.com>
User-Agent: Mutt/1.5.21 (2012-12-30)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 7990
Lines: 238

Sorry for the delay, I seem to have gotten distracted...

On Mon, Jun 15, 2015 at 05:34:11PM +0200, Fredrik Markstr?m wrote:
> Hello Peter, your patch helps with some of the cases but not all:

Indeed, and barring cmpxchg_double(), which is not available on all
platforms, the thing needs a lock indeed.

Now, while you're probably right in that contention is unlikely for sane
behaviour, I could imagine some perverted apps hammering
getrusage() just because they can.

Therefore, find attached a version that has a per task/signal lock.

---
Subject: sched,cputime: Serialize cputime_adjust()

Fredrik reports that top and other tools can occasionally observe >100%
cpu usage and reports that this is because cputime_adjust() callers are
not serialized.

This means that when the s/u-time sample values are small, and change
can shift the balance quickly, concurrent updaters can race such that
one advances stime while the other advances utime such that the sum will
be vastly larger than the initial rtime.

There is also an issue with calculating utime as rtime - stime, where is
if the computed stime is smaller then the previously returned stime,
our utime will be larger than it should be, making the above problem
worse.

Cc: Jason Low <jason.low2@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Suggested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/init_task.h | 10 ++++++++++
 include/linux/sched.h     | 23 +++++++++++++++--------
 kernel/fork.c             |  7 ++++---
 kernel/sched/cputime.c    | 34 +++++++++-------------------------
 4 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bb9b075f0eb0..e38681f4912d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -39,6 +39,14 @@ extern struct fs_struct init_fs;
 #define INIT_CPUSET_SEQ(tsk)
 #endif
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#define INIT_PREV_CPUTIME(x)	.prev_cputime = {			\
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock),		\
+},
+#else
+#define INIT_PREV_CPUTIME(x)
+#endif
+
 #define INIT_SIGNALS(sig) {						\
 	.nr_threads	= 1,						\
 	.thread_head	= LIST_HEAD_INIT(init_task.thread_node),	\
@@ -53,6 +61,7 @@ extern struct fs_struct init_fs;
 		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
 		.running	= 0,					\
 	},								\
+	INIT_PREV_CPUTIME(sig)						\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
 	INIT_GROUP_RWSEM(sig)						\
@@ -254,6 +263,7 @@ extern struct task_group root_task_group;
 	INIT_TASK_RCU_TASKS(tsk)					\
 	INIT_CPUSET_SEQ(tsk)						\
 	INIT_RT_MUTEXES(tsk)						\
+	INIT_PREV_CPUTIME(tsk)						\
 	INIT_VTIME(tsk)							\
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6633e83e608a..5dfbe92ce04e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -531,17 +531,28 @@ struct cpu_itimer {
 };
 
 /**
- * struct cputime - snaphsot of system and user cputime
+ * struct prev_cputime - snaphsot of system and user cputime
  * @utime: time spent in user mode
  * @stime: time spent in system mode
  *
  * Gathers a generic snapshot of user and system time.
  */
-struct cputime {
+struct prev_cputime {
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	cputime_t utime;
 	cputime_t stime;
+	raw_spinlock_t lock;
+#endif
 };
 
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	prev->utime = prev->stime = 0;
+	raw_spin_lock_init(&prev->lock);
+#endif
+}
+
 /**
  * struct task_cputime - collected CPU time counts
  * @utime:		time spent in user mode, in &cputime_t units
@@ -716,9 +727,7 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	struct cputime prev_cputime;
-#endif
+	struct prev_cputime prev_cputime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1494,9 +1503,7 @@ struct task_struct {
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	struct cputime prev_cputime;
-#endif
+	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqlock_t vtime_seqlock;
 	unsigned long long vtime_snap;
diff --git a/kernel/fork.c b/kernel/fork.c
index c9507afd4a7d..306481d3a0e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
+
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
@@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 	seqlock_init(&sig->stats_lock);
+	prev_cputime_init(&sig->prev_cputime);
 
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->real_timer.function = it_real_fn;
@@ -1338,9 +1340,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->utime = p->stime = p->gtime = 0;
 	p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+	prev_cputime_init(&p->prev_cputime);
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqlock_init(&p->vtime_seqlock);
 	p->vtime_snap = 0;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..3acfab426e4f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,32 +555,16 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 }
 
 /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
- *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-	cputime_t old;
-
-	while (new > (old = READ_ONCE(*counter)))
-		cmpxchg_cputime(counter, old, new);
-}
-
-/*
  * Adjust tick based cputime random precision against scheduler
  * runtime accounting.
  */
 static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
+			   struct prev_cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, stime, utime;
 
+	raw_spin_lock(&prev->lock);
 	/*
 	 * Tick based cputime accounting depend on random scheduling
 	 * timeslices of a task to be interrupted or not by the timer.
@@ -609,19 +593,19 @@ static void cputime_adjust(struct task_cputime *curr,
 	} else if (stime == 0) {
 		utime = rtime;
 	} else {
-		cputime_t total = stime + utime;
-
-		stime = scale_stime((__force u64)stime,
-				    (__force u64)rtime, (__force u64)total);
+		stime = scale_stime((__force u64)stime, (__force u64)rtime,
+				    (__force u64)(stime + utime));
+		if (stime < prev->stime)
+			stime = prev->stime;
 		utime = rtime - stime;
 	}
 
-	cputime_advance(&prev->stime, stime);
-	cputime_advance(&prev->utime, utime);
-
+	prev->stime = max(prev->stime, stime);
+	prev->utime = max(prev->utime, utime);
 out:
 	*ut = prev->utime;
 	*st = prev->stime;
+	raw_spin_unlock(&prev->lock);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/