2008-12-26 06:01:22

by Hidetoshi Seto

Subject: [PATCH] posix-cpu-timers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward

posix-cpu-timers, clock_gettime(CLOCK_THREAD_CPUTIME_ID) and
clock_gettime(CLOCK_PROCESS_CPUTIME_ID) occasionally go backward.

From user land (@HZ=250), it looks like:
prev call current call diff
(0.191195713) > (0.191200456) : 4743
(0.191200456) > (0.191205198) : 4742
(0.191205198) > (0.187213693) : -3991505
(0.187213693) > (0.191218688) : 4004995
(0.191218688) > (0.191223436) : 4748
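
As a reference, here is a minimal userspace reproducer sketch (an
editor's illustration, not the original test program; the busy loop
and the output format are assumptions):

/* cputime-mono.c - watch CLOCK_PROCESS_CPUTIME_ID for backward steps.
 * Hypothetical reproducer sketch, not the original test program.
 * Build: gcc -O2 -o cputime-mono cputime-mono.c -lrt
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	long long prev, cur;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
	prev = ts.tv_sec * 1000000000LL + ts.tv_nsec;
	for (;;) {
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		cur = ts.tv_sec * 1000000000LL + ts.tv_nsec;
		if (cur < prev)	/* the CPU clock went backward */
			printf("(%lld.%09lld) > (%lld.%09lld) : %lld\n",
			       prev / 1000000000, prev % 1000000000,
			       cur / 1000000000, cur % 1000000000,
			       cur - prev);
		prev = cur;
	}
}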

The reasons are:

- Timer tick (and task_tick) can interrupt between getting
  sum_exec_runtime and task_delta_exec(p). Since the task's runtime
  is updated in the tick, this makes the value of cpu->sched about
  a tick less than it should be (see the sketch after this list).
  So, interrupts should be disabled while sampling runtime and
  delta.

- Queueing a task also updates the runtime of the task currently
  running on the rq on which the task is being enqueued/dequeued.
  So, the runqueue should be locked while sampling runtime and delta.
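
To illustrate the first race (an editor's sketch using the pre-patch
code shape; the timeline comment is illustrative, not from the
original mail):

/*
 * ns  = p->se.sum_exec_runtime;   <- reads the banked total T
 *
 *      ... timer tick fires here: it banks the pending delta D
 *      into sum_exec_runtime and restarts the delta ...
 *
 * ns += task_delta_exec(p);       <- the delta is now ~0
 *
 * This racing call returns ~T, while an earlier call that did not
 * race could have returned up to T + D, with D just short of one
 * tick. At HZ=250 one tick is 4ms, matching the -3991505ns jump
 * in the trace above.
 */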

For the thread timer (CLOCK_THREAD_CPUTIME_ID), this is easy to solve
by making a single function do everything in a block locked by
task_rq_lock().

# clock_gettime(CLOCK_THREAD_CPUTIME_ID)
before:
cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
after:
cpu->sched = task_total_exec(p);

For the process timer (CLOCK_PROCESS_CPUTIME_ID), the fix proposed
here is to move thread_group_cputime() from kernel/posix-cpu-timers.c
to kernel/sched.c, and to let it use rq-related operations.

# clock_gettime(CLOCK_PROCESS_CPUTIME_ID)
before:
thread_group_cputime(p,&cputime); /* returns total */
cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
after:
thread_group_cputime(p,&cputime); /* total + local delta */
cpu->sched = cputime.sum_exec_runtime;

Note: According to the current code, the process timer returns
"((total runtime of thread group) + (delta of current thread))."
The former total is the total of "banked runtime" in the signal
structure, so technically speaking the latter delta should be the
sum of the deltas of all running threads in the group. However,
requesting the delta from every cpu is quite heavy and makes no
sense, so I took a realistic approach - keep it as is, i.e. return
the banked total plus the local delta. In other words, be careful
that the accuracy of the process timer is not so good; it can be
stale by about tick*(max(num_cpu, num_thread)-1).
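
For scale (editor's arithmetic from the formula above): at HZ=250 one
tick is 4ms, so with max(num_cpu, num_thread) = 8 the process timer
can be stale by up to 4ms * (8-1) = 28ms.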

Signed-off-by: Hidetoshi Seto <[email protected]>
---
include/linux/kernel_stat.h | 2 +-
kernel/posix-cpu-timers.c | 38 +----------------------------
kernel/sched.c | 54 +++++++++++++++++++++++++++++++++++++++++-
3 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145ca..e142222 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -66,7 +66,7 @@ static inline unsigned int kstat_irqs(unsigned int irq)
return sum;
}

-extern unsigned long long task_delta_exec(struct task_struct *);
+extern unsigned long long task_total_exec(struct task_struct *);
extern void account_user_time(struct task_struct *, cputime_t);
extern void account_user_time_scaled(struct task_struct *, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a..c53cea4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -45,40 +45,6 @@ int thread_group_cputime_alloc(struct task_struct *tsk)
return 0;
}

-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * @tsk: The task we use to identify the thread group.
- * @times: task_cputime structure in which we return the summed fields.
- *
- * Walk the list of CPUs to sum the per-CPU time fields in the thread group
- * time structure.
- */
-void thread_group_cputime(
- struct task_struct *tsk,
- struct task_cputime *times)
-{
- struct signal_struct *sig;
- int i;
- struct task_cputime *tot;
-
- sig = tsk->signal;
- if (unlikely(!sig) || !sig->cputime.totals) {
- times->utime = tsk->utime;
- times->stime = tsk->stime;
- times->sum_exec_runtime = tsk->se.sum_exec_runtime;
- return;
- }
- times->stime = times->utime = cputime_zero;
- times->sum_exec_runtime = 0;
- for_each_possible_cpu(i) {
- tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
- times->utime = cputime_add(times->utime, tot->utime);
- times->stime = cputime_add(times->stime, tot->stime);
- times->sum_exec_runtime += tot->sum_exec_runtime;
- }
-}
-
/*
* Called after updating RLIMIT_CPU to set timer expiration if necessary.
*/
@@ -294,7 +260,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_total_exec(p);
break;
}
return 0;
@@ -321,7 +287,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = cputime.sum_exec_runtime;
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index e4bb1dd..13bd162 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4064,10 +4064,10 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return total of runtime which already banked and which not yet
* @p in case that task is currently running.
*/
-unsigned long long task_delta_exec(struct task_struct *p)
+unsigned long long task_total_exec(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
@@ -4083,12 +4083,62 @@ unsigned long long task_delta_exec(struct task_struct *p)
if ((s64)delta_exec > 0)
ns = delta_exec;
}
+ ns += p->se.sum_exec_runtime;

task_rq_unlock(rq, &flags);

return ns;
}

+/**
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
+ *
+ * @p: The task we use to identify the thread group.
+ * @times: task_cputime structure in which we return the summed fields.
+ *
+ * Walk the list of CPUs to sum the per-CPU time fields in the thread group
+ * time structure.
+ */
+void thread_group_cputime(struct task_struct *p, struct task_cputime *times)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 delta_exec = 0;
+ struct task_cputime *tot;
+ struct signal_struct *sig;
+ int i;
+
+ sig = p->signal;
+ if (unlikely(!sig) || !sig->cputime.totals) {
+ times->utime = p->utime;
+ times->stime = p->stime;
+ times->sum_exec_runtime = task_total_exec(p);
+ return;
+ }
+
+ times->stime = times->utime = cputime_zero;
+ times->sum_exec_runtime = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ if (task_current(rq, p)) {
+ update_rq_clock(rq);
+ delta_exec = rq->clock - p->se.exec_start;
+ if ((s64)delta_exec < 0)
+ delta_exec = 0;
+ }
+
+ for_each_possible_cpu(i) {
+ tot = per_cpu_ptr(p->signal->cputime.totals, i);
+ times->utime = cputime_add(times->utime, tot->utime);
+ times->stime = cputime_add(times->stime, tot->stime);
+ times->sum_exec_runtime += tot->sum_exec_runtime;
+ }
+ times->sum_exec_runtime += delta_exec;
+
+ task_rq_unlock(rq, &flags);
+}
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
--
1.6.1.rc4


2008-12-26 08:43:46

by Ingo Molnar

Subject: Re: [PATCH] posix-cpu-timers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward


* Hidetoshi Seto <[email protected]> wrote:

> @@ -321,7 +287,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
> cpu->cpu = cputime.utime;
> break;
> case CPUCLOCK_SCHED:
> - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
> + cpu->sched = cputime.sum_exec_runtime;
> break;
> }

hm, doesnt this regress precision?

Ingo

2008-12-26 09:03:38

by Peter Zijlstra

Subject: Re: [PATCH] posix-cpu-timers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward

On Fri, 2008-12-26 at 09:43 +0100, Ingo Molnar wrote:
> * Hidetoshi Seto <[email protected]> wrote:
>
> > @@ -321,7 +287,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
> > cpu->cpu = cputime.utime;
> > break;
> > case CPUCLOCK_SCHED:
> > - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
> > + cpu->sched = cputime.sum_exec_runtime;
> > break;
> > }
>
> hm, doesnt this regress precision?

No, he folds it into:

> +void thread_group_cputime(struct task_struct *p, struct task_cputime *times)
> +{
> + unsigned long flags;
> + struct rq *rq;
> + u64 delta_exec = 0;
> + struct task_cputime *tot;
> + struct signal_struct *sig;
> + int i;
> +
> + sig = p->signal;
> + if (unlikely(!sig) || !sig->cputime.totals) {
> + times->utime = p->utime;
> + times->stime = p->stime;
> + times->sum_exec_runtime = task_total_exec(p);
> + return;
> + }
> +
> + times->stime = times->utime = cputime_zero;
> + times->sum_exec_runtime = 0;
> +
> + rq = task_rq_lock(p, &flags);
> +
> + if (task_current(rq, p)) {
> + update_rq_clock(rq);
> + delta_exec = rq->clock - p->se.exec_start;
> + if ((s64)delta_exec < 0)
> + delta_exec = 0;
> + }
> +
> + for_each_possible_cpu(i) {
> + tot = per_cpu_ptr(p->signal->cputime.totals, i);
> + times->utime = cputime_add(times->utime, tot->utime);
> + times->stime = cputime_add(times->stime, tot->stime);
> + times->sum_exec_runtime += tot->sum_exec_runtime;
> + }
> + times->sum_exec_runtime += delta_exec;
> +
> + task_rq_unlock(rq, &flags);
> +}

Which reminds me, why do we still have this crap in the kernel? I
thought we pretty much showed the per-cpu itimer thing was utter crap?
-- can we pretty please either revert that or apply
http://lkml.org/lkml/2008/11/24/183 ?

Also, I really don't like the above, we now do the per-cpu loop with the
RQ lock held...

2008-12-26 09:08:37

by Ingo Molnar

Subject: Re: [PATCH] posix-cpu-timers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward


* Peter Zijlstra <[email protected]> wrote:

> On Fri, 2008-12-26 at 09:43 +0100, Ingo Molnar wrote:
> > * Hidetoshi Seto <[email protected]> wrote:
> >
> > > @@ -321,7 +287,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
> > > cpu->cpu = cputime.utime;
> > > break;
> > > case CPUCLOCK_SCHED:
> > > - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
> > > + cpu->sched = cputime.sum_exec_runtime;
> > > break;
> > > }
> >
> > hm, doesnt this regress precision?
>
> No, he folds it into:
>
> > +void thread_group_cputime(struct task_struct *p, struct task_cputime *times)
> > +{
> > + unsigned long flags;
> > + struct rq *rq;
> > + u64 delta_exec = 0;
> > + struct task_cputime *tot;
> > + struct signal_struct *sig;
> > + int i;
> > +
> > + sig = p->signal;
> > + if (unlikely(!sig) || !sig->cputime.totals) {
> > + times->utime = p->utime;
> > + times->stime = p->stime;
> > + times->sum_exec_runtime = task_total_exec(p);
> > + return;
> > + }
> > +
> > + times->stime = times->utime = cputime_zero;
> > + times->sum_exec_runtime = 0;
> > +
> > + rq = task_rq_lock(p, &flags);
> > +
> > + if (task_current(rq, p)) {
> > + update_rq_clock(rq);
> > + delta_exec = rq->clock - p->se.exec_start;
> > + if ((s64)delta_exec < 0)
> > + delta_exec = 0;
> > + }
> > +
> > + for_each_possible_cpu(i) {
> > + tot = per_cpu_ptr(p->signal->cputime.totals, i);
> > + times->utime = cputime_add(times->utime, tot->utime);
> > + times->stime = cputime_add(times->stime, tot->stime);
> > + times->sum_exec_runtime += tot->sum_exec_runtime;
> > + }
> > + times->sum_exec_runtime += delta_exec;
> > +
> > + task_rq_unlock(rq, &flags);
> > +}
>
> Which reminds me, why do we still have this crap in the kernel? I
> thought we pretty much showed the per-cpu itimer thing was utter crap?
> -- can we pretty please either revert that or apply
> http://lkml.org/lkml/2008/11/24/183 ?

yes, i agree with that patch - could you please send it with a signoff?

> Also, I really don't like the above, we now do the per-cpu loop with the
> RQ lock held...

yes.

Ingo

2009-01-05 07:00:20

by Hidetoshi Seto

Subject: Re: [PATCH] posix-cpu-timers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward

OK, this patch is meant to follow Peter's "remove the per-cpu-ish-ness"
patch. How about this?

Thanks,
H.Seto


posix-cpu-timers, clock_gettime(CLOCK_THREAD_CPUTIME_ID) and
clock_gettime(CLOCK_PROCESS_CPUTIME_ID) occasionally go backward.

From user land (@HZ=250), it looks like:
prev call current call diff
(0.191195713) > (0.191200456) : 4743
(0.191200456) > (0.191205198) : 4742
(0.191205198) > (0.187213693) : -3991505
(0.187213693) > (0.191218688) : 4004995
(0.191218688) > (0.191223436) : 4748

The reasons are:

- Timer tick (and task_tick) can interrupt between getting
  sum_exec_runtime and task_delta_exec(p). Since the task's runtime
  is updated in the tick, this makes the value of cpu->sched about
  a tick less than it should be.
  So, interrupts should be disabled while sampling runtime and
  delta.

- Queueing a task also updates the runtime of the task currently
  running on the rq on which the task is being enqueued/dequeued.
  So, the runqueue should be locked while sampling runtime and delta.

For the thread timer (CLOCK_THREAD_CPUTIME_ID), this is easy to solve
by making a single function do everything in a block locked by
task_rq_lock().

# clock_gettime(CLOCK_THREAD_CPUTIME_ID)
before:
cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
after:
cpu->sched = task_total_exec(p);

For the process timer (CLOCK_PROCESS_CPUTIME_ID), the fix proposed
here is to move thread_group_cputime() from include/linux/sched.h to
kernel/sched.c, and to let it use rq-related operations.

# clock_gettime(CLOCK_PROCESS_CPUTIME_ID)
before:
thread_group_cputime(p,&cputime); /* returns total */
cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
after:
thread_group_cputime(p,&cputime); /* total + local delta */
cpu->sched = cputime.sum_exec_runtime;

[Note]: According to the current code, the process timer returns
"((total runtime of thread group) + (delta of current thread))."
The former total is the total of "banked runtime" in the signal
structure, so technically speaking the latter delta should be the
sum of the deltas of all running threads in the group. However,
requesting the delta from every cpu is quite heavy and makes no
sense, so I took a realistic approach - keep it as is, i.e. return
the banked total plus the local delta. In other words, be careful
that the accuracy of the process timer is not so good (while
clock_getres() returns a resolution of 1ns); it can be stale by
about tick*(max(NR_CPUS, num_thread)-1).

Signed-off-by: Hidetoshi Seto <[email protected]>
---
include/linux/kernel_stat.h | 2 +-
include/linux/sched.h | 11 +----------
kernel/posix-cpu-timers.c | 4 ++--
kernel/sched.c | 29 +++++++++++++++++++++++++++--
4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 570d204..50fabb3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,7 +78,7 @@ static inline unsigned int kstat_irqs(unsigned int irq)
return sum;
}

-extern unsigned long long task_delta_exec(struct task_struct *);
+extern unsigned long long task_total_exec(struct task_struct *);
extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
extern void account_steal_time(cputime_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29c4b8d..c6a07bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2179,16 +2179,7 @@ static inline int spin_needbreak(spinlock_t *lock)
* Thread group CPU time accounting.
*/

-static inline
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct task_cputime *totals = &tsk->signal->cputime.totals;
- unsigned long flags;
-
- spin_lock_irqsave(&totals->lock, flags);
- *times = *totals;
- spin_unlock_irqrestore(&totals->lock, flags);
-}
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);

static inline void thread_group_cputime_init(struct signal_struct *sig)
{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da9..56c91b9 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_total_exec(p);
break;
}
return 0;
@@ -251,7 +251,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = cputime.sum_exec_runtime;
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 545c6fc..8315cd5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4121,10 +4121,10 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return total of runtime which already banked and which not yet
* @p in case that task is currently running.
*/
-unsigned long long task_delta_exec(struct task_struct *p)
+unsigned long long task_total_exec(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
@@ -4140,12 +4140,37 @@ unsigned long long task_delta_exec(struct task_struct *p)
if ((s64)delta_exec > 0)
ns = delta_exec;
}
+ ns += p->se.sum_exec_runtime;

task_rq_unlock(rq, &flags);

return ns;
}

+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct task_cputime *totals = &tsk->signal->cputime.totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 delta_exec = 0;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ spin_lock(&totals->lock);
+ *times = *totals;
+ spin_unlock(&totals->lock);
+
+ if (task_current(rq, tsk)) {
+ update_rq_clock(rq);
+ delta_exec = rq->clock - tsk->se.exec_start;
+ if ((s64)delta_exec < 0)
+ delta_exec = 0;
+ }
+ times->sum_exec_runtime += delta_exec;
+
+ task_rq_unlock(rq, &flags);
+}
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
--
1.6.1

2009-01-07 17:59:29

by Ingo Molnar

Subject: Re: [PATCH] posix-cpu-timers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward


* Peter Zijlstra <[email protected]> wrote:

> Which reminds me, why do we still have this crap in the kernel? I
> thought we pretty much showed the per-cpu itimer thing was utter crap?
> -- can we pretty please either revert that or apply
> http://lkml.org/lkml/2008/11/24/183 ?
>
> Also, I really don't like the above, we now do the per-cpu loop with the
> RQ lock held...

ok, i've applied it to tip/timers/urgent. This should solve the original
itimer testcase as well, correct?

Ingo

2009-01-08 08:01:29

by Peter Zijlstra

Subject: Re: [PATCH] posix-cpu-timers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward

On Wed, 2009-01-07 at 18:59 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > Which reminds me, why do we still have this crap in the kernel? I
> > thought we pretty much showed the per-cpu itimer thing was utter crap?
> > -- can we pretty please either revert that or apply
> > http://lkml.org/lkml/2008/11/24/183 ?
> >
> > Also, I really don't like the above, we now do the per-cpu loop with the
> > RQ lock held...
>
> ok, i've applied it to tip/timers/urgent. This should solve the original
> itimer testcase as well, correct?

Yeah, but what would need to happen to make itimers not lock up large
machines is to reduce their granularity depending on machine size.

The whole (process wide) itimer idea just plain sucks -- sadly it's part
of unix so we can't just drop it; the best we can do is make it useless
(but within spec) so that nobody sane will use it ;-)

2009-01-21 05:55:09

by Hidetoshi Seto

Subject: [PATCH] posixtimers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward

(resend, rebased on today's linux-next 20090121)

posix-cpu-timers, clock_gettime(CLOCK_THREAD_CPUTIME_ID) and
clock_gettime(CLOCK_PROCESS_CPUTIME_ID) occasionally go backward.

From user land (@HZ=250), it looks like:
prev call current call diff
(0.191195713) > (0.191200456) : 4743
(0.191200456) > (0.191205198) : 4742
(0.191205198) > (0.187213693) : -3991505
(0.187213693) > (0.191218688) : 4004995
(0.191218688) > (0.191223436) : 4748

The reasons are:

- Timer tick (and task_tick) can interrupt between getting
  sum_exec_runtime and task_delta_exec(p). Since the task's runtime
  is updated in the tick, this makes the value of cpu->sched about
  a tick less than it should be.
  So, interrupts should be disabled while sampling runtime and
  delta.

- Queuing a task also updates the runtime of the task currently
  running on the rq on which the task is being enqueued/dequeued.
  So, the runqueue should be locked while sampling runtime and delta.

For thread time (CLOCK_THREAD_CPUTIME_ID), this is easy to solve
by making a single function do everything in a block locked by
task_rq_lock().

# clock_gettime(CLOCK_THREAD_CPUTIME_ID)
before:
cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
after:
cpu->sched = task_total_exec(p);

For process time (CLOCK_PROCESS_CPUTIME_ID), the fix proposed here
is to move thread_group_cputime() from include/linux/sched.h to
kernel/sched.c, and to let it use rq-related operations.

# clock_gettime(CLOCK_PROCESS_CPUTIME_ID)
before:
thread_group_cputime(p,&cputime); /* returns total */
cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
after:
thread_group_cputime(p,&cputime); /* total + local delta */
cpu->sched = cputime.sum_exec_runtime;

[Note]: According to the current code, the process timer returns
"((total runtime of thread group) + (delta of current thread))."
The former total is the total of "banked runtime" in the signal
structure, so technically speaking the latter delta should be the
sum of the deltas of all running threads in the group. However,
requesting the delta from every cpu is quite heavy and makes no
sense, so I took a realistic approach - keep it as is, i.e. return
the banked total plus the local delta. In other words, people should
be careful that the accuracy of the process timer is not so good
(while clock_getres() returns a resolution of 1ns); it can be stale
by about tick*(max(NR_CPUS, num_thread)-1).

Signed-off-by: Hidetoshi Seto <[email protected]>
---
include/linux/kernel_stat.h | 2 +-
include/linux/sched.h | 11 +----------
kernel/posix-cpu-timers.c | 4 ++--
kernel/sched.c | 29 +++++++++++++++++++++++++++--
4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 570d204..50fabb3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,7 +78,7 @@ static inline unsigned int kstat_irqs(unsigned int irq)
return sum;
}

-extern unsigned long long task_delta_exec(struct task_struct *);
+extern unsigned long long task_total_exec(struct task_struct *);
extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
extern void account_steal_time(cputime_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8cbd02c..a689b69 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2209,16 +2209,7 @@ static inline int spin_needbreak(spinlock_t *lock)
* Thread group CPU time accounting.
*/

-static inline
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct task_cputime *totals = &tsk->signal->cputime.totals;
- unsigned long flags;
-
- spin_lock_irqsave(&totals->lock, flags);
- *times = *totals;
- spin_unlock_irqrestore(&totals->lock, flags);
-}
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);

static inline void thread_group_cputime_init(struct signal_struct *sig)
{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da9..56c91b9 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_total_exec(p);
break;
}
return 0;
@@ -251,7 +251,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = cputime.sum_exec_runtime;
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7aba647..bb3e806 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4220,10 +4220,10 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return total of runtime which already banked and which not yet
* @p in case that task is currently running.
*/
-unsigned long long task_delta_exec(struct task_struct *p)
+unsigned long long task_total_exec(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
@@ -4239,12 +4239,37 @@ unsigned long long task_delta_exec(struct task_struct *p)
if ((s64)delta_exec > 0)
ns = delta_exec;
}
+ ns += p->se.sum_exec_runtime;

task_rq_unlock(rq, &flags);

return ns;
}

+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct task_cputime *totals = &tsk->signal->cputime.totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 delta_exec = 0;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ spin_lock(&totals->lock);
+ *times = *totals;
+ spin_unlock(&totals->lock);
+
+ if (task_current(rq, tsk)) {
+ update_rq_clock(rq);
+ delta_exec = rq->clock - tsk->se.exec_start;
+ if ((s64)delta_exec < 0)
+ delta_exec = 0;
+ }
+ times->sum_exec_runtime += delta_exec;
+
+ task_rq_unlock(rq, &flags);
+}
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
--
1.6.0.3

2009-01-21 13:18:51

by Peter Zijlstra

Subject: Re: [PATCH] posixtimers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward

On Wed, 2009-01-21 at 14:54 +0900, Hidetoshi Seto wrote:
> (resend, rebased on today's linux-next 20090121)
>
> posix-cpu-timers, clock_gettime(CLOCK_THREAD_CPUTIME_ID) and
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID) occasionally go backward.
>
> From user land (@HZ=250), it looks like:
> prev call current call diff
> (0.191195713) > (0.191200456) : 4743
> (0.191200456) > (0.191205198) : 4742
> (0.191205198) > (0.187213693) : -3991505
> (0.187213693) > (0.191218688) : 4004995
> (0.191218688) > (0.191223436) : 4748
>
> The reasons are:
>
> - Timer tick (and task_tick) can interrupt between getting
> sum_exec_runtime and task_delta_exec(p). Since the task's runtime
> is updated in the tick, this makes the value of cpu->sched about
> a tick less than it should be.
> So, interrupts should be disabled while sampling runtime and
> delta.
>
> - Queuing a task also updates the runtime of the task currently
> running on the rq on which the task is being enqueued/dequeued.
> So, the runqueue should be locked while sampling runtime and delta.

Correct.

> For thread time (CLOCK_THREAD_CPUTIME_ID), this is easy to solve
> by making a single function do everything in a block locked by
> task_rq_lock().
>
> # clock_gettime(CLOCK_THREAD_CPUTIME_ID)
> before:
> cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
> after:
> cpu->sched = task_total_exec(p);

OK.

> For process time (CLOCK_PROCESS_CPUTIME_ID), the fix proposed here
> is to move thread_group_cputime() from include/linux/sched.h to
> kernel/sched.c, and to let it use rq-related operations.
>
> # clock_gettime(CLOCK_PROCESS_CPUTIME_ID)
> before:
> thread_group_cputime(p,&cputime); /* returns total */
> cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
> after:
> thread_group_cputime(p,&cputime); /* total + local delta */
> cpu->sched = cputime.sum_exec_runtime;
>
> [Note]: According to the current code, the process timer returns
> "((total runtime of thread group) + (delta of current thread))."
> The former total is the total of "banked runtime" in the signal
> structure, so technically speaking the latter delta should be the
> sum of the deltas of all running threads in the group. However,
> requesting the delta from every cpu is quite heavy and makes no
> sense, so I took a realistic approach - keep it as is, i.e. return
> the banked total plus the local delta. In other words, people should
> be careful that the accuracy of the process timer is not so good
> (while clock_getres() returns a resolution of 1ns); it can be stale
> by about tick*(max(NR_CPUS, num_thread)-1).

IMO process wide clocks and timers should be discouraged anyway. The
proposed restriction in resolution looks perfectly fine (esp. as it's
what it already was).

However I cannot quite see how it's different from before.

We used to do:

thread_group_cputime(p, &cputime);
*cputime = *totals
cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p)

Which is equal to totals.sum_exec_runtime + task_delta_exec(p).

The new code does exactly that, but in a more correct way.

> Signed-off-by: Hidetoshi Seto <[email protected]>

Acked-by: Peter Zijlstra <[email protected]>

> ---
> include/linux/kernel_stat.h | 2 +-
> include/linux/sched.h | 11 +----------
> kernel/posix-cpu-timers.c | 4 ++--
> kernel/sched.c | 29 +++++++++++++++++++++++++++--
> 4 files changed, 31 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
> index 570d204..50fabb3 100644
> --- a/include/linux/kernel_stat.h
> +++ b/include/linux/kernel_stat.h
> @@ -78,7 +78,7 @@ static inline unsigned int kstat_irqs(unsigned int irq)
> return sum;
> }
>
> -extern unsigned long long task_delta_exec(struct task_struct *);
> +extern unsigned long long task_total_exec(struct task_struct *);
> extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
> extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
> extern void account_steal_time(cputime_t);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8cbd02c..a689b69 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2209,16 +2209,7 @@ static inline int spin_needbreak(spinlock_t *lock)
> * Thread group CPU time accounting.
> */
>
> -static inline
> -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
> -{
> - struct task_cputime *totals = &tsk->signal->cputime.totals;
> - unsigned long flags;
> -
> - spin_lock_irqsave(&totals->lock, flags);
> - *times = *totals;
> - spin_unlock_irqrestore(&totals->lock, flags);
> -}
> +extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
>
> static inline void thread_group_cputime_init(struct signal_struct *sig)
> {
> diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
> index fa07da9..56c91b9 100644
> --- a/kernel/posix-cpu-timers.c
> +++ b/kernel/posix-cpu-timers.c
> @@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
> cpu->cpu = virt_ticks(p);
> break;
> case CPUCLOCK_SCHED:
> - cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
> + cpu->sched = task_total_exec(p);
> break;
> }
> return 0;
> @@ -251,7 +251,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
> cpu->cpu = cputime.utime;
> break;
> case CPUCLOCK_SCHED:
> - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
> + cpu->sched = cputime.sum_exec_runtime;
> break;
> }
> return 0;
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 7aba647..bb3e806 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -4220,10 +4220,10 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
> EXPORT_PER_CPU_SYMBOL(kstat);
>
> /*
> - * Return any ns on the sched_clock that have not yet been banked in
> + * Return total of runtime which already banked and which not yet
> * @p in case that task is currently running.
> */
> -unsigned long long task_delta_exec(struct task_struct *p)
> +unsigned long long task_total_exec(struct task_struct *p)
> {
> unsigned long flags;
> struct rq *rq;
> @@ -4239,12 +4239,37 @@ unsigned long long task_delta_exec(struct task_struct *p)
> if ((s64)delta_exec > 0)
> ns = delta_exec;
> }
> + ns += p->se.sum_exec_runtime;
>
> task_rq_unlock(rq, &flags);
>
> return ns;
> }
>
> +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
> +{
> + struct task_cputime *totals = &tsk->signal->cputime.totals;
> + unsigned long flags;
> + struct rq *rq;
> + u64 delta_exec = 0;
> +
> + rq = task_rq_lock(tsk, &flags);
> +
> + spin_lock(&totals->lock);
> + *times = *totals;
> + spin_unlock(&totals->lock);
> +
> + if (task_current(rq, tsk)) {
> + update_rq_clock(rq);
> + delta_exec = rq->clock - tsk->se.exec_start;
> + if ((s64)delta_exec < 0)
> + delta_exec = 0;
> + }
> + times->sum_exec_runtime += delta_exec;
> +
> + task_rq_unlock(rq, &flags);
> +}
> +
> /*
> * Account user cpu time to a process.
> * @p: the process that the cpu time gets accounted to

2009-01-27 05:52:20

by Hidetoshi Seto

Subject: [RESEND][PATCH] posixtimers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward

Thank you for providing the ack, Peter!
I've revised the description a bit, reflecting your comment.

Ingo (or Thomas?), could you pick this up?

Thanks,
H.Seto


posix-cpu-timers, clock_gettime(CLOCK_THREAD_CPUTIME_ID) and
clock_gettime(CLOCK_PROCESS_CPUTIME_ID) occasionally go backward.

From user land (@HZ=250), it looks like:
prev call current call diff
(0.191195713) > (0.191200456) : 4743
(0.191200456) > (0.191205198) : 4742
(0.191205198) > (0.187213693) : -3991505
(0.187213693) > (0.191218688) : 4004995
(0.191218688) > (0.191223436) : 4748

The reasons are:

- Timer tick (and task_tick) can interrupt between getting
  sum_exec_runtime and task_delta_exec(p). Since the task's runtime
  is updated in the tick, this makes the value of cpu->sched about
  a tick less than it should be.
  So, interrupts should be disabled while sampling runtime and
  delta.

- Queuing a task also updates the runtime of the task currently
  running on the rq on which the task is being enqueued/dequeued.
  So, the runqueue should be locked while sampling runtime and delta.

For thread time (CLOCK_THREAD_CPUTIME_ID), this is easy to solve
by making a single function do everything in a block locked by
task_rq_lock().

# clock_gettime(CLOCK_THREAD_CPUTIME_ID)
before:
cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
after:
cpu->sched = task_total_exec(p);

For process time (CLOCK_PROCESS_CPUTIME_ID), thread_group_cputime()
must do proper lock operations. The fix proposed here realizes this
by moving thread_group_cputime() from include/linux/sched.h to
kernel/sched.c, enabling it to use rq-related operations.

# clock_gettime(CLOCK_PROCESS_CPUTIME_ID)
before:
thread_group_cputime(p, &cputime);
/* cputime has total only, sampled w/o any locks */
cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
after:
thread_group_cputime(p, &cputime); /* new */
/* cputime has total + delta, sampled w/ proper locks */
cpu->sched = cputime.sum_exec_runtime;

[Note]: Process wide clocks and timers should be discouraged.
In both the previous and the new implementation, the process timer
returns "((total runtime of thread group) + (delta of current
thread))." The former total is the total of "banked runtime" in the
signal structure, so technically speaking the latter delta should be
the sum of the deltas of all running threads in the group. However,
requesting the delta from every processor is quite heavy and makes
no sense, so we took a realistic approach - keep it as is, i.e.
return the banked total plus the local delta. In other words, people
should not expect the accuracy of the process timer to be usefully
good (while clock_getres() returns a resolution of 1ns). In fact, it
can be stale by about tick*(max(NR_CPUS, num_thread)-1).

Signed-off-by: Hidetoshi Seto <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
---
include/linux/kernel_stat.h | 2 +-
include/linux/sched.h | 11 +----------
kernel/posix-cpu-timers.c | 4 ++--
kernel/sched.c | 29 +++++++++++++++++++++++++++--
4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 570d204..50fabb3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,7 +78,7 @@ static inline unsigned int kstat_irqs(unsigned int irq)
return sum;
}

-extern unsigned long long task_delta_exec(struct task_struct *);
+extern unsigned long long task_total_exec(struct task_struct *);
extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
extern void account_steal_time(cputime_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8cbd02c..a689b69 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2209,16 +2209,7 @@ static inline int spin_needbreak(spinlock_t *lock)
* Thread group CPU time accounting.
*/

-static inline
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct task_cputime *totals = &tsk->signal->cputime.totals;
- unsigned long flags;
-
- spin_lock_irqsave(&totals->lock, flags);
- *times = *totals;
- spin_unlock_irqrestore(&totals->lock, flags);
-}
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);

static inline void thread_group_cputime_init(struct signal_struct *sig)
{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da9..56c91b9 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_total_exec(p);
break;
}
return 0;
@@ -251,7 +251,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = cputime.sum_exec_runtime;
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7aba647..bb3e806 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4220,10 +4220,10 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return total of runtime which already banked and which not yet
* @p in case that task is currently running.
*/
-unsigned long long task_delta_exec(struct task_struct *p)
+unsigned long long task_total_exec(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
@@ -4239,12 +4239,37 @@ unsigned long long task_delta_exec(struct task_struct *p)
if ((s64)delta_exec > 0)
ns = delta_exec;
}
+ ns += p->se.sum_exec_runtime;

task_rq_unlock(rq, &flags);

return ns;
}

+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct task_cputime *totals = &tsk->signal->cputime.totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 delta_exec = 0;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ spin_lock(&totals->lock);
+ *times = *totals;
+ spin_unlock(&totals->lock);
+
+ if (task_current(rq, tsk)) {
+ update_rq_clock(rq);
+ delta_exec = rq->clock - tsk->se.exec_start;
+ if ((s64)delta_exec < 0)
+ delta_exec = 0;
+ }
+ times->sum_exec_runtime += delta_exec;
+
+ task_rq_unlock(rq, &flags);
+}
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
--
1.6.0.3