2009-03-17 06:13:27

by Hidetoshi Seto

Subject: [PATCH] posixtimers: Fix posix clock monotonicity

This patch rehires task_sched_runtime() and thread_group_sched_runtime(),
which were removed at the time of 2.6.28-rc1.

These functions protect the sampling of the clock with the rq lock,
which is required so that rq->clock is not updated during the sampling.
Without it you may get ((banked runtime before update) + (delta after update)),
which is less than what the clock should read.
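
For illustration, a schematic of the interleaving the rq lock prevents
(the ordering is hypothetical; field names follow the scheduler code):

/*
 * sampler (clock_gettime)                 scheduler accounting
 * ------------------------------------    ----------------------------
 * sum = p->se.sum_exec_runtime;           |
 *                                         | update_curr():
 *                                         |   sum_exec_runtime += delta;
 *                                         |   exec_start = rq->clock;
 * delta = rq->clock - p->se.exec_start;   | (delta is now tiny,
 *                                         |  measured from the reset)
 * return sum + delta;   ->  less than a previous sample: clock regressed
 */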

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: [email protected] [2.6.28.x]

---
kernel/posix-cpu-timers.c | 7 +++--
kernel/sched.c | 58 +++++++++++++++++++++++++++++++++++++++-----
2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a..a65641a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -294,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_sched_runtime(p);
break;
}
return 0;
@@ -310,18 +310,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
{
struct task_cputime cputime;

- thread_group_cputime(p, &cputime);
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime_add(cputime.utime, cputime.stime);
break;
case CPUCLOCK_VIRT:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = thread_group_sched_runtime(p);
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index db66874..617d1b8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4066,7 +4066,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
/*
* Return any ns on the sched_clock that have not yet been banked in
* @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
*/
+static u64 __task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ if (task_current(rq, p)) {
+ update_rq_clock(rq);
+ ns = rq->clock - p->se.exec_start;
+ if ((s64)ns < 0)
+ ns = 0;
+ }
+
+ return ns;
+}
+
unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
@@ -4074,16 +4090,44 @@ unsigned long long task_delta_exec(struct task_struct *p)
u64 ns = 0;

rq = task_rq_lock(p, &flags);
+ ns = __task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);

- if (task_current(rq, p)) {
- u64 delta_exec;
+ return ns;
+}

- update_rq_clock(rq);
- delta_exec = rq->clock - p->se.exec_start;
- if ((s64)delta_exec > 0)
- ns = delta_exec;
- }
+/*
+ * Return p->sum_exec_runtime plus any more ns on the sched_clock
+ * that have not yet been banked in case the task is currently running.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns = 0;
+
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + __task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
+
+ return ns;
+}

+/*
+ * Return sum_exec_runtime for the thread group plus any more ns on the
+ * sched_clock that have not yet been banked in case the task is currently
+ * running.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+ struct task_cputime totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+ rq = task_rq_lock(p, &flags);
+ thread_group_cputime(p, &totals);
+ ns = totals.sum_exec_runtime + __task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);

return ns;
--
1.6.2.1


2009-03-17 06:18:17

by Kamezawa Hiroyuki

Subject: Re: [PATCH] posixtimers: Fix posix clock monotonicity

On Tue, 17 Mar 2009 15:13:07 +0900
Hidetoshi Seto <[email protected]> wrote:

> This patch rehires task_sched_runtime() and thread_group_sched_runtime(),
> which were removed at the time of 2.6.28-rc1.
>
> These functions protect the sampling of the clock with the rq lock,
> which is required so that rq->clock is not updated during the sampling.
> Without it you may get ((banked runtime before update) + (delta after update)),
> which is less than what the clock should read.
>
Does clock_gettime() go backward without the lock?

Thanks,
-Kame



2009-03-17 06:31:28

by Hidetoshi Seto

Subject: Re: [PATCH] posixtimers: Fix posix clock monotonicity

KAMEZAWA Hiroyuki wrote:
> Hidetoshi Seto <[email protected]> wrote:
>
>> This patch rehires task_sched_runtime() and thread_group_sched_runtime(),
>> which were removed at the time of 2.6.28-rc1.
>>
>> These functions protect the sampling of the clock with the rq lock,
>> which is required so that rq->clock is not updated during the sampling.
>> Without it you may get ((banked runtime before update) + (delta after update)),
>> which is less than what the clock should read.
>>
> Does clock_gettime() go backward without the lock?

Yes, that's right.

You can find the ancestor of this patch here:
[RESEND][PATCH] posixtimers: clock_gettime(CLOCK_*_CPUTIME_ID) goes backward
http://lkml.org/lkml/2009/1/27/18

After the last post, I dug through the git log and found that there were
functions, task_sched_runtime() and thread_group_sched_runtime(), that
worked fine at the time of 2.6.27. I think it is better to reintroduce
these functions than to make almost the same functions with different names.
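
For reference, a minimal userspace check of the kind that exposes the
regression could look like this (an illustrative sketch, not the actual
reproducer from that post):

#include <stdio.h>
#include <time.h>

static long long ns_of(const struct timespec *ts)
{
	return (long long)ts->tv_sec * 1000000000LL + ts->tv_nsec;
}

int main(void)
{
	struct timespec prev, cur;
	long long i;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &prev);
	for (i = 0; i < 100000000LL; i++) {
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cur);
		if (ns_of(&cur) < ns_of(&prev)) {
			printf("clock went backward: %lld -> %lld\n",
			       ns_of(&prev), ns_of(&cur));
			return 1;
		}
		prev = cur;
	}
	return 0;
}

(On glibc of that era this needs -lrt; the backward step only shows up
when a sample races an accounting update, so it can take many iterations
to trigger.)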

Thanks,
H.Seto

2009-03-18 10:41:30

by Hidetoshi Seto

Subject: [PATCH] posixtimers: Fix posix clock monotonicity v2

Impact: Regression fix (against the clock_gettime() going-backward bug)

This patch re-introduces a couple of functions, task_sched_runtime
and thread_group_sched_runtime, which were once removed at the
time of 2.6.28-rc1.

These functions protect the sampling of the thread/process clock with
the rq lock, which is required so that rq->clock is not updated during
the sampling.
i.e.
Without it, clock_gettime() may return
((accounted runtime before update) + (delta after update)),
which is less than what it should be.

v1 -> v2:
- Revise comments for functions and the patch description.
- Add a note about the accuracy of the thread group's runtime.

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: [email protected] [2.6.28.x]
---
kernel/posix-cpu-timers.c | 7 +++--
kernel/sched.c | 65 +++++++++++++++++++++++++++++++++++++++-----
2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a..a65641a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -294,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_sched_runtime(p);
break;
}
return 0;
@@ -310,18 +310,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
{
struct task_cputime cputime;

- thread_group_cputime(p, &cputime);
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime_add(cputime.utime, cputime.stime);
break;
case CPUCLOCK_VIRT:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = thread_group_sched_runtime(p);
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index db66874..1d7dacf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4064,9 +4064,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
*/
+static u64 __task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ if (task_current(rq, p)) {
+ update_rq_clock(rq);
+ ns = rq->clock - p->se.exec_start;
+ if ((s64)ns < 0)
+ ns = 0;
+ }
+
+ return ns;
+}
+
unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
@@ -4074,16 +4090,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
u64 ns = 0;

rq = task_rq_lock(p, &flags);
+ ns = __task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);

- if (task_current(rq, p)) {
- u64 delta_exec;
+ return ns;
+}

- update_rq_clock(rq);
- delta_exec = rq->clock - p->se.exec_start;
- if ((s64)delta_exec > 0)
- ns = delta_exec;
- }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that has not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns = 0;
+
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + __task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
+
+ return ns;
+}

+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+ struct task_cputime totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+ rq = task_rq_lock(p, &flags);
+ thread_group_cputime(p, &totals);
+ ns = totals.sum_exec_runtime + __task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);

return ns;
--
1.6.2.1

2009-03-18 10:58:37

by Peter Zijlstra

Subject: Re: [PATCH] posixtimers: Fix posix clock monotonicity v2

On Wed, 2009-03-18 at 19:41 +0900, Hidetoshi Seto wrote:
> Impact: Regression fix (against the clock_gettime() going-backward bug)
>
> This patch re-introduces a couple of functions, task_sched_runtime
> and thread_group_sched_runtime, which were once removed at the
> time of 2.6.28-rc1.
>
> These functions protect the sampling of the thread/process clock with
> the rq lock, which is required so that rq->clock is not updated during
> the sampling.
> i.e.
> Without it, clock_gettime() may return
> ((accounted runtime before update) + (delta after update)),
> which is less than what it should be.
>
> v1 -> v2:
> - Revise comments for functions and the patch description.
> - Add a note about the accuracy of the thread group's runtime.
>
> Signed-off-by: Hidetoshi Seto <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: [email protected] [2.6.28.x]

Acked-by: Peter Zijlstra <[email protected]>

Thanks Hidetoshi-san.

2009-03-18 11:35:08

by Ingo Molnar

Subject: Re: [PATCH] posixtimers: Fix posix clock monotonicity v2


* Hidetoshi Seto <[email protected]> wrote:

> Impact: Regression fix (against the clock_gettime() going-backward bug)

Looks good, thanks! Would you mind posting a version against -tip?

http://people.redhat.com/mingo/tip.git/README

It already has a __task_delta_exec() (a different version).

Thanks,

Ingo

2009-03-23 05:07:53

by Hidetoshi Seto

Subject: Re: [PATCH] posixtimers: Fix posix clock monotonicity v2

Ingo Molnar wrote:
> * Hidetoshi Seto <[email protected]> wrote:
>
>> Impact: Regression fix (against the clock_gettime() going-backward bug)
>
> Looks good, thanks! Would you mind posting a version against -tip?
>
> http://people.redhat.com/mingo/tip.git/README
>
> It already has a __task_delta_exec() (a different version).

Sorry, I was AFK for the last few days.

Well, I confirmed that the current version (v2) is good for Linus's
tree and the 28-stable tree, but it has trouble on -tip.
So I made patches for -tip; they will follow this post.

[PATCH 1/2] x86: Rename __task_delta_exec() to task_delta_exec_locked()
[PATCH 2/2] posixtimers: Fix posix clock monotonicity v2

The [2/2] is essentially no different from the previous version; it
just resolves a fuzz against -tip.


Thanks,
H.Seto

2009-03-23 05:12:21

by Hidetoshi Seto

Subject: [PATCH 1/2] x86: Rename __task_delta_exec() to task_delta_exec_locked()

Externing a function with the "__" prefix is unpleasant.
This patch renames the function and fixes the callers.
This change is desirable for another fix against posix clocks,
since that fix introduces its own __task_delta_exec(), which is static.

Signed-off-by: Hidetoshi Seto <[email protected]>
---
include/linux/kernel_stat.h | 2 +-
kernel/perf_counter.c | 6 +++---
kernel/sched.c | 6 +++---
3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b6d2887..459bc1e 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -87,7 +87,7 @@ static inline unsigned int kstat_irqs(unsigned int irq)
*/
extern void curr_rq_lock_irq_save(unsigned long *flags);
extern void curr_rq_unlock_irq_restore(unsigned long *flags);
-extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
+extern unsigned long long task_delta_exec_locked(struct task_struct *tsk, int update);
extern unsigned long long task_delta_exec(struct task_struct *);

extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f054b8c..66ddb2f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -881,7 +881,7 @@ int perf_counter_task_disable(void)
cpu = smp_processor_id();

/* force the update of the task clock: */
- __task_delta_exec(curr, 1);
+ task_delta_exec_locked(curr, 1);

perf_counter_task_sched_out(curr, cpu);

@@ -922,7 +922,7 @@ int perf_counter_task_enable(void)
cpu = smp_processor_id();

/* force the update of the task clock: */
- __task_delta_exec(curr, 1);
+ task_delta_exec_locked(curr, 1);

perf_counter_task_sched_out(curr, cpu);

@@ -1635,7 +1635,7 @@ static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
struct task_struct *curr = counter->task;
u64 delta;

- delta = __task_delta_exec(curr, update);
+ delta = task_delta_exec_locked(curr, update);

return curr->se.sum_exec_runtime + delta;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 3e827b8..5ee5a06 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4304,10 +4304,10 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * Return any ns on the sched_clock that have not yet been banked in
- * @p in case that task is currently running.
+ * Return any ns on the sched_clock that have not yet been accounted.
+ * @p must be a task currently running.
*/
-unsigned long long __task_delta_exec(struct task_struct *p, int update)
+unsigned long long task_delta_exec_locked(struct task_struct *p, int update)
{
s64 delta_exec;
struct rq *rq;
--
1.6.2.1

2009-03-23 05:13:42

by Hidetoshi Seto

Subject: [PATCH 2/2] posixtimers: Fix posix clock monotonicity v2

Impact: Regression fix (against the clock_gettime() going-backward bug)

This patch re-introduces a couple of functions, task_sched_runtime
and thread_group_sched_runtime, which were once removed at the
time of 2.6.28-rc1.

These functions protect the sampling of the thread/process clock with
the rq lock, which is required so that rq->clock is not updated during
the sampling.
i.e.
Without it, clock_gettime() may return
((accounted runtime before update) + (delta after update)),
which is less than what it should be.

v1 -> v2:
- Revise comments for functions and the patch description.
- Add a note about the accuracy of the thread group's runtime.

Signed-off-by: Hidetoshi Seto <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: [email protected] [2.6.28.x]
---
kernel/posix-cpu-timers.c | 7 +++--
kernel/sched.c | 65 +++++++++++++++++++++++++++++++++++++++-----
2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e976e50..476607f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_sched_runtime(p);
break;
}
return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
{
struct task_cputime cputime;

- thread_group_cputime(p, &cputime);
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime_add(cputime.utime, cputime.stime);
break;
case CPUCLOCK_VIRT:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = thread_group_sched_runtime(p);
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5ee5a06..b3c0b1f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4327,9 +4327,25 @@ unsigned long long task_delta_exec_locked(struct task_struct *p, int update)
}

/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
*/
+static u64 __task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ if (task_current(rq, p)) {
+ update_rq_clock(rq);
+ ns = rq->clock - p->se.exec_start;
+ if ((s64)ns < 0)
+ ns = 0;
+ }
+
+ return ns;
+}
+
unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
@@ -4337,16 +4353,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
u64 ns = 0;

rq = task_rq_lock(p, &flags);
+ ns = __task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);

- if (task_current(rq, p)) {
- u64 delta_exec;
+ return ns;
+}

- update_rq_clock(rq);
- delta_exec = rq->clock - p->se.exec_start;
- if ((s64)delta_exec > 0)
- ns = delta_exec;
- }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns = 0;
+
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + __task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
+
+ return ns;
+}

+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+ struct task_cputime totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+ rq = task_rq_lock(p, &flags);
+ thread_group_cputime(p, &totals);
+ ns = totals.sum_exec_runtime + __task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);

return ns;
--
1.6.2.1

2009-03-23 07:58:23

by Peter Zijlstra

Subject: Re: [PATCH 1/2] x86: Rename __task_delta_exec() to task_delta_exec_locked()

On Mon, 2009-03-23 at 14:11 +0900, Hidetoshi Seto wrote:
> Externing a function with the "__" prefix is unpleasant.
> This patch renames the function and fixes the callers.
> This change is desirable for another fix against posix clocks,
> since that fix introduces its own __task_delta_exec(), which is static.

There's nothing wrong with __-prefixed functions; the core kernel is
stuffed with them, and it's a well-known prefix for functions that need
special care. Furthermore, your argument seems contradictory in that it
introduces one.

2009-03-23 09:13:46

by Hidetoshi Seto

Subject: Re: [PATCH 1/2] x86: Rename __task_delta_exec() to task_delta_exec_locked()

Peter Zijlstra wrote:
> On Mon, 2009-03-23 at 14:11 +0900, Hidetoshi Seto wrote:
>> Externing a function with the "__" prefix is unpleasant.
>> This patch renames the function and fixes the callers.
>> This change is desirable for another fix against posix clocks,
>> since that fix introduces its own __task_delta_exec(), which is static.
>
> There's nothing wrong with __-prefixed functions; the core kernel is
> stuffed with them, and it's a well-known prefix for functions that need
> special care.

I often suppose that a function with the __ prefix is used as a kind of
(local) subroutine that is mostly used internally in the *.c file, and
that it is therefore often defined as static (or static inline).

I also suppose that a __Func() accompanying a Func() is called from Func()
(and its relatives, like Func_foo() or Func_bar() etc., if they exist)
as the common core part of Func().
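
As a generic sketch of that convention (hypothetical names, not taken
from this patch):

#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;
	int count;
};

/* __foo_count() assumes the caller already holds f->lock. */
static int __foo_count(struct foo *f)
{
	return f->count;
}

/* foo_count() is the public entry: it takes the lock, then calls the
 * common core __foo_count(). */
int foo_count(struct foo *f)
{
	int ret;

	spin_lock(&f->lock);
	ret = __foo_count(f);
	spin_unlock(&f->lock);
	return ret;
}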

> Furthermore, your argument seems contradictory in that it
> introduces one.

I thought it was acceptable since the introduced one is static.

Still, I feel uneasy about the non-static __Func(); however, it is
easy to fix with a little patience.
Wait a moment for the next patch (it will be a single patch, v3).


Thanks,
H.Seto

2009-03-23 09:41:01

by Hidetoshi Seto

Subject: [PATCH] posixtimers: Fix posix clock monotonicity v3

Impact: Regression fix (against the clock_gettime() going-backward bug)

This patch re-introduces a couple of functions, task_sched_runtime
and thread_group_sched_runtime, which were once removed at the
time of 2.6.28-rc1.

These functions protect the sampling of the thread/process clock with
the rq lock, which is required so that rq->clock is not updated during
the sampling.
i.e.
Without it, clock_gettime() may return
((accounted runtime before update) + (delta after update)),
which is less than what it should be.

v2 -> v3:
- Rename the static helper function __task_delta_exec()
  to do_task_delta_exec(), since the -tip tree already has
  a __task_delta_exec() of a different version.

v1 -> v2:
- Revise comments for functions and the patch description.
- Add a note about the accuracy of the thread group's runtime.

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: [email protected] [2.6.28.x]
---
kernel/posix-cpu-timers.c | 7 +++--
kernel/sched.c | 65 +++++++++++++++++++++++++++++++++++++++-----
2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a..a65641a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -294,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_sched_runtime(p);
break;
}
return 0;
@@ -310,18 +310,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
{
struct task_cputime cputime;

- thread_group_cputime(p, &cputime);
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime_add(cputime.utime, cputime.stime);
break;
case CPUCLOCK_VIRT:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = thread_group_sched_runtime(p);
break;
}
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index db66874..2674597 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4064,9 +4064,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
*/
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ if (task_current(rq, p)) {
+ update_rq_clock(rq);
+ ns = rq->clock - p->se.exec_start;
+ if ((s64)ns < 0)
+ ns = 0;
+ }
+
+ return ns;
+}
+
unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
@@ -4074,16 +4090,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
u64 ns = 0;

rq = task_rq_lock(p, &flags);
+ ns = do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);

- if (task_current(rq, p)) {
- u64 delta_exec;
+ return ns;
+}

- update_rq_clock(rq);
- delta_exec = rq->clock - p->se.exec_start;
- if ((s64)delta_exec > 0)
- ns = delta_exec;
- }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that has not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns = 0;
+
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
+
+ return ns;
+}

+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+ struct task_cputime totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+ rq = task_rq_lock(p, &flags);
+ thread_group_cputime(p, &totals);
+ ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);

return ns;
--
1.6.2.1

2009-03-23 09:43:17

by Peter Zijlstra

Subject: Re: [PATCH] posixtimers: Fix posix clock monotonicity v3

On Mon, 2009-03-23 at 18:40 +0900, Hidetoshi Seto wrote:
> Impact: Regression fix (against the clock_gettime() going-backward bug)
>
> This patch re-introduces a couple of functions, task_sched_runtime
> and thread_group_sched_runtime, which were once removed at the
> time of 2.6.28-rc1.
>
> These functions protect the sampling of the thread/process clock with
> the rq lock, which is required so that rq->clock is not updated during
> the sampling.
> i.e.
> Without it, clock_gettime() may return
> ((accounted runtime before update) + (delta after update)),
> which is less than what it should be.
>
> v2 -> v3:
> - Rename the static helper function __task_delta_exec()
>   to do_task_delta_exec(), since the -tip tree already has
>   a __task_delta_exec() of a different version.
>
> v1 -> v2:
> - Revise comments for functions and the patch description.
> - Add a note about the accuracy of the thread group's runtime.
>
> Signed-off-by: Hidetoshi Seto <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: [email protected] [2.6.28.x]


Looks good to me, thanks!

Acked-by: Peter Zijlstra <[email protected]>
