2009-11-20 04:44:52

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH tip/sched/core] introduce task_times() to replace task_[us]time() pair

Function task_[us]times() are called consecutively in almost all
cases. However task_stime() is implemented to call task_utime()
from its inside, so such paired calls run task_utime() twice.

It means we do heavy divisions (div_u64 + do_div) twice to get
stime and utime which can be obtained at same time by one set
of divisions.

This patch introduces task_times(*tsk, *utime, *stime) to get
stime and utime at once, in better, optimized way.

Signed-off-by: Hidetoshi Seto <[email protected]>
---
fs/proc/array.c | 3 +-
include/linux/sched.h | 1 +
kernel/exit.c | 7 ++++-
kernel/sched.c | 55 +++++++++++++++++++++++++++++++-----------------
kernel/sys.c | 3 +-
5 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e209f64..330deda 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -535,8 +535,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- utime = task_utime(task);
- stime = task_stime(task);
+ task_times(task, &utime, &stime);
gtime = task_gtime(task);
}

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78ba664..fe6ae15 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1723,6 +1723,7 @@ static inline void put_task_struct(struct task_struct *t)
extern cputime_t task_utime(struct task_struct *p);
extern cputime_t task_stime(struct task_struct *p);
extern cputime_t task_gtime(struct task_struct *p);
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);

/*
* Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f..a19e429 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,6 +91,8 @@ static void __exit_signal(struct task_struct *tsk)
if (atomic_dec_and_test(&sig->count))
posix_cpu_timers_exit_group(tsk);
else {
+ cputime_t utime, stime;
+
/*
* If there is any task waiting for the group exit
* then notify it:
@@ -110,8 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, task_utime(tsk));
- sig->stime = cputime_add(sig->stime, task_stime(tsk));
+ task_times(tsk, &utime, &stime);
+ sig->utime = cputime_add(sig->utime, utime);
+ sig->stime = cputime_add(sig->stime, stime);
sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
diff --git a/kernel/sched.c b/kernel/sched.c
index ab9a034..18c89e0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5155,6 +5155,14 @@ cputime_t task_stime(struct task_struct *p)
{
return p->stime;
}
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ if (ut)
+ *ut = task_utime(p);
+ if (st)
+ *st = task_stime(p);
+}
#else

#ifndef nsecs_to_cputime
@@ -5162,41 +5170,48 @@ cputime_t task_stime(struct task_struct *p)
msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
#endif

-cputime_t task_utime(struct task_struct *p)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
- cputime_t utime = p->utime, total = utime + p->stime;
- u64 temp;
+ cputime_t rtime, utime = p->utime, total = utime + p->stime;

/*
* Use CFS's precise accounting:
*/
- temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
+ rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

if (total) {
- temp *= utime;
+ u64 temp;
+
+ temp = (u64)(rtime * utime);
do_div(temp, total);
- }
- utime = (cputime_t)temp;
+ utime = (cputime_t)temp;
+ } else
+ utime = rtime;

+ /*
+ * Compare with previous values, to keep monotonicity:
+ */
p->prev_utime = max(p->prev_utime, utime);
- return p->prev_utime;
+ p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+ if (ut)
+ *ut = p->prev_utime;
+ if (st)
+ *st = p->prev_stime;
+}
+
+cputime_t task_utime(struct task_struct *p)
+{
+ cputime_t utime;
+ task_times(p, &utime, NULL);
+ return utime;
}

cputime_t task_stime(struct task_struct *p)
{
cputime_t stime;
-
- /*
- * Use CFS's precise accounting. (we subtract utime from
- * the total, to make sure the total observed by userspace
- * grows monotonically - apps rely on that):
- */
- stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
-
- if (stime >= 0)
- p->prev_stime = max(p->prev_stime, stime);
-
- return p->prev_stime;
+ task_times(p, NULL, &stime);
+ return stime;
}
#endif

diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d..b6caf1e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1344,8 +1344,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
utime = stime = cputime_zero;

if (who == RUSAGE_THREAD) {
- utime = task_utime(current);
- stime = task_stime(current);
+ task_times(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = p->signal->maxrss;
goto out;
--
1.6.5.3


2009-11-23 10:30:35

by Stanislaw Gruszka

[permalink] [raw]
Subject: Re: [PATCH tip/sched/core] introduce task_times() to replace task_[us]time() pair

On Fri, Nov 20, 2009 at 01:44:10PM +0900, Hidetoshi Seto wrote:
> Function task_[us]times() are called consecutively in almost all
> cases. However task_stime() is implemented to call task_utime()
> from its inside, so such paired calls run task_utime() twice.
>
> It means we do heavy divisions (div_u64 + do_div) twice to get
> stime and utime which can be obtained at same time by one set
> of divisions.
>
> This patch introduces task_times(*tsk, *utime, *stime) to get
> stime and utime at once, in better, optimized way.
>
> Signed-off-by: Hidetoshi Seto <[email protected]>

[snip]

> @@ -5155,6 +5155,14 @@ cputime_t task_stime(struct task_struct *p)
> {
> return p->stime;
> }
> +
> +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
> +{
> + if (ut)
> + *ut = task_utime(p);
> + if (st)
> + *st = task_stime(p);
> +}
> #else

I think task_{u,s}time are not needed anymore. Can we just fully get
rid of them and only use task_times() ?

> #ifndef nsecs_to_cputime
> @@ -5162,41 +5170,48 @@ cputime_t task_stime(struct task_struct *p)
> msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
> #endif

Could we furhter optimize this? Perhaps we can use below code
(taken from timespec_to_jiffies()):

cputime = (nsec * NSEC_CONVERSION) >>
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;


Stanislaw

2009-11-24 05:38:36

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH tip/sched/core] introduce task_times() to replace task_[us]time() pair

Stanislaw Gruszka wrote:
> On Fri, Nov 20, 2009 at 01:44:10PM +0900, Hidetoshi Seto wrote:
>> Function task_[us]times() are called consecutively in almost all
>> cases. However task_stime() is implemented to call task_utime()
>> from its inside, so such paired calls run task_utime() twice.
>>
>> It means we do heavy divisions (div_u64 + do_div) twice to get
>> stime and utime which can be obtained at same time by one set
>> of divisions.
>>
>> This patch introduces task_times(*tsk, *utime, *stime) to get
>> stime and utime at once, in better, optimized way.
>>
>> Signed-off-by: Hidetoshi Seto <[email protected]>
>
> [snip]
>
>> @@ -5155,6 +5155,14 @@ cputime_t task_stime(struct task_struct *p)
>> {
>> return p->stime;
>> }
>> +
>> +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
>> +{
>> + if (ut)
>> + *ut = task_utime(p);
>> + if (st)
>> + *st = task_stime(p);
>> +}
>> #else
>
> I think task_{u,s}time are not needed anymore. Can we just fully get
> rid of them and only use task_times() ?

Yes, we can :-)

I was just afraid that there were other task_{u,s}time users I could
not find. So I separated it in another patch to remove the API, to be
posted later. But if it is OK, I can put them together in one patch.
(Or it is still better to be separated and incremental one?)

>> #ifndef nsecs_to_cputime
>> @@ -5162,41 +5170,48 @@ cputime_t task_stime(struct task_struct *p)
>> msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
>> #endif
>
> Could we furhter optimize this? Perhaps we can use below code
> (taken from timespec_to_jiffies()):
>
> cputime = (nsec * NSEC_CONVERSION) >>
> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;

I hope there were nsecs_to_jiffies().
It will be complex than:

cputime = (nsec * NSEC_CONVERSION) >> NSEC_JIFFIE_SC;

In timespec_to_jiffies(), nsec is never greater than NSEC_PER_SEC.
So above will work without any overflow (I confirmed it becomes wrong
if nsec > (LLONG_MAX / NSEC_CONVERSION) = about 8190ms).

But here in task_timers() the nsec can be greater than hours (or days),
we must be careful...

And just now I noticed that using msecs_to_cputime() is problematic,
since the type of its return value is "unsigned long" so not 64bit.
I'll make and post a patch to fix this asap.


...BTW, could anyone explain what the following (line 661) is doing?:

[kernel/time.c]
649 u64 nsec_to_clock_t(u64 x)
650 {
651 #if (NSEC_PER_SEC % USER_HZ) == 0
652 return div_u64(x, NSEC_PER_SEC / USER_HZ);
653 #elif (USER_HZ % 512) == 0
654 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
655 #else
656 /*
657 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
658 * overflow after 64.99 years.
659 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
660 */
661 return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
662 #endif
663 }


Thanks,
H.Seto

2009-11-24 07:09:05

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH tip/sched/core] introduce task_times() to replace task_[us]time() pair

Hidetoshi Seto wrote:
> And just now I noticed that using msecs_to_cputime() is problematic,
> since the type of its return value is "unsigned long" so not 64bit.
> I'll make and post a patch to fix this asap.

Sorry, I was confused. Please ignore the above.

The problem is that since the type of msecs_to_cputime() argument is
unsigned int, it would be wrong if calculated msecs is greater than
UINT_MAX.

Another problem is that msecs_to_jiffies() returns MAX_JIFFY_OFFSET
if msecs > INT_MAX.

5168 #ifndef nsecs_to_cputime
5169 # define nsecs_to_cputime(__nsecs) \
5170 msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
5171 #endif

452 unsigned long msecs_to_jiffies(const unsigned int m)
453 {
454 /*
455 * Negative value, means infinite timeout:
456 */
457 if ((int)m < 0)
458 return MAX_JIFFY_OFFSET;
:

What I need here is a simple function can convert 64bit nsecs to
cputime_t, which is unsigned long in asm-generic/cputime.h.
Humm...


Thanks,
H.Seto

2009-11-26 05:49:02

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH -tip 1/3] introduce task_times() to replace task_{u,s}time() pair

Functions task_{u,s}time() are called in pair in almost all
cases. However task_stime() is implemented to call task_utime()
from its inside, so such paired calls run task_utime() twice.

It means we do heavy divisions (div_u64 + do_div) twice to get
utime and stime which can be obtained at same time by one set
of divisions.

This patch introduces a function task_times(*tsk, *utime, *stime)
to retrieve utime and stime at once in better, optimized way.

Signed-off-by: Hidetoshi Seto <[email protected]>
---
fs/proc/array.c | 3 +-
include/linux/sched.h | 1 +
kernel/exit.c | 7 ++++-
kernel/sched.c | 55 +++++++++++++++++++++++++++++++-----------------
kernel/sys.c | 3 +-
5 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e209f64..330deda 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -535,8 +535,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- utime = task_utime(task);
- stime = task_stime(task);
+ task_times(task, &utime, &stime);
gtime = task_gtime(task);
}

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78ba664..fe6ae15 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1723,6 +1723,7 @@ static inline void put_task_struct(struct task_struct *t)
extern cputime_t task_utime(struct task_struct *p);
extern cputime_t task_stime(struct task_struct *p);
extern cputime_t task_gtime(struct task_struct *p);
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);

/*
* Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f..a19e429 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,6 +91,8 @@ static void __exit_signal(struct task_struct *tsk)
if (atomic_dec_and_test(&sig->count))
posix_cpu_timers_exit_group(tsk);
else {
+ cputime_t utime, stime;
+
/*
* If there is any task waiting for the group exit
* then notify it:
@@ -110,8 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, task_utime(tsk));
- sig->stime = cputime_add(sig->stime, task_stime(tsk));
+ task_times(tsk, &utime, &stime);
+ sig->utime = cputime_add(sig->utime, utime);
+ sig->stime = cputime_add(sig->stime, stime);
sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
diff --git a/kernel/sched.c b/kernel/sched.c
index a57c6ae..4d5e1a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5155,6 +5155,14 @@ cputime_t task_stime(struct task_struct *p)
{
return p->stime;
}
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ if (ut)
+ *ut = task_utime(p);
+ if (st)
+ *st = task_stime(p);
+}
#else

#ifndef nsecs_to_cputime
@@ -5162,41 +5170,48 @@ cputime_t task_stime(struct task_struct *p)
msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
#endif

-cputime_t task_utime(struct task_struct *p)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
- cputime_t utime = p->utime, total = utime + p->stime;
- u64 temp;
+ cputime_t rtime, utime = p->utime, total = utime + p->stime;

/*
* Use CFS's precise accounting:
*/
- temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
+ rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

if (total) {
- temp *= utime;
+ u64 temp;
+
+ temp = (u64)(rtime * utime);
do_div(temp, total);
- }
- utime = (cputime_t)temp;
+ utime = (cputime_t)temp;
+ } else
+ utime = rtime;

+ /*
+ * Compare with previous values, to keep monotonicity:
+ */
p->prev_utime = max(p->prev_utime, utime);
- return p->prev_utime;
+ p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+ if (ut)
+ *ut = p->prev_utime;
+ if (st)
+ *st = p->prev_stime;
+}
+
+cputime_t task_utime(struct task_struct *p)
+{
+ cputime_t utime;
+ task_times(p, &utime, NULL);
+ return utime;
}

cputime_t task_stime(struct task_struct *p)
{
cputime_t stime;
-
- /*
- * Use CFS's precise accounting. (we subtract utime from
- * the total, to make sure the total observed by userspace
- * grows monotonically - apps rely on that):
- */
- stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
-
- if (stime >= 0)
- p->prev_stime = max(p->prev_stime, stime);
-
- return p->prev_stime;
+ task_times(p, NULL, &stime);
+ return stime;
}
#endif

diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d..b6caf1e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1344,8 +1344,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
utime = stime = cputime_zero;

if (who == RUSAGE_THREAD) {
- utime = task_utime(current);
- stime = task_stime(current);
+ task_times(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = p->signal->maxrss;
goto out;
--
1.6.5.3

2009-11-26 05:49:22

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH -tip 2/3] remove task_{u,s,g}time()

Now all task_{u,s}time() pairs are replaced by task_times().
And task_gtime() is too simple to be an inline function.

Cleanup them all.

Signed-off-by: Hidetoshi Seto <[email protected]>
---
fs/proc/array.c | 4 ++--
include/linux/sched.h | 3 ---
kernel/exit.c | 2 +-
kernel/sched.c | 33 ++-------------------------------
4 files changed, 5 insertions(+), 37 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 330deda..ca61a88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- gtime = cputime_add(gtime, task_gtime(t));
+ gtime = cputime_add(gtime, t->gtime);
t = next_thread(t);
} while (t != task);

@@ -536,7 +536,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt = task->min_flt;
maj_flt = task->maj_flt;
task_times(task, &utime, &stime);
- gtime = task_gtime(task);
+ gtime = task->gtime;
}

/* scale priority and nice values from timeslices to -20..20 */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6ae15..0395b0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1720,9 +1720,6 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}

-extern cputime_t task_utime(struct task_struct *p);
-extern cputime_t task_stime(struct task_struct *p);
-extern cputime_t task_gtime(struct task_struct *p);
extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);

/*
diff --git a/kernel/exit.c b/kernel/exit.c
index a19e429..480de5b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,7 +115,7 @@ static void __exit_signal(struct task_struct *tsk)
task_times(tsk, &utime, &stime);
sig->utime = cputime_add(sig->utime, utime);
sig->stime = cputime_add(sig->stime, stime);
- sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+ sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
diff --git a/kernel/sched.c b/kernel/sched.c
index 4d5e1a2..a7093c1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5146,22 +5146,12 @@ void account_idle_ticks(unsigned long ticks)
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-cputime_t task_utime(struct task_struct *p)
-{
- return p->utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
- return p->stime;
-}
-
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
if (ut)
- *ut = task_utime(p);
+ *ut = p->utime;
if (st)
- *st = task_stime(p);
+ *st = p->stime;
}
#else

@@ -5199,27 +5189,8 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
if (st)
*st = p->prev_stime;
}
-
-cputime_t task_utime(struct task_struct *p)
-{
- cputime_t utime;
- task_times(p, &utime, NULL);
- return utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
- cputime_t stime;
- task_times(p, NULL, &stime);
- return stime;
-}
#endif

-inline cputime_t task_gtime(struct task_struct *p)
-{
- return p->gtime;
-}
-
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
--
1.6.5.3

2009-11-26 05:49:50

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH -tip 3/3] define nsecs_to_jiffies()

Use of msecs_to_jiffies() for nsecs_to_cputime() have some problems:

- The type of msecs_to_jiffies()'s argument is unsigned int, so
it cannot convert msecs greater than UINT_MAX = about 49.7 days.
- msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument
is set, assuming that input was negative value. So it cannot
convert msecs greater than INT_MAX = about 24.8 days too.

This patch defines a new function nsecs_to_jiffies() that can deal
greater values, and that can deal all incoming values as unsigned.

Signed-off-by: Hidetoshi Seto <[email protected]>
---
include/linux/jiffies.h | 1 +
kernel/sched.c | 3 +--
kernel/time.c | 30 ++++++++++++++++++++++++++++++
3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 1a9cf78..6811f4b 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
extern unsigned long clock_t_to_jiffies(unsigned long x);
extern u64 jiffies_64_to_clock_t(u64 x);
extern u64 nsec_to_clock_t(u64 x);
+extern unsigned long nsecs_to_jiffies(u64 n);

#define TIMESTAMP_SIZE 30

diff --git a/kernel/sched.c b/kernel/sched.c
index a7093c1..792b25e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5156,8 +5156,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
#else

#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) \
- msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif

void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469..b30016c 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
#endif
}

+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+ /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+ return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+ /* overflow after 292 years if HZ = 1024 */
+ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+ /*
+ * Generic case - optimized for cases where HZ is a multiple of 3.
+ * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+ */
+ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
+
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{
--
1.6.5.3

2009-11-26 10:25:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] introduce task_times() to replace task_{u,s}time() pair

On Thu, 2009-11-26 at 14:48 +0900, Hidetoshi Seto wrote:
> Functions task_{u,s}time() are called in pair in almost all
> cases. However task_stime() is implemented to call task_utime()
> from its inside, so such paired calls run task_utime() twice.
>
> It means we do heavy divisions (div_u64 + do_div) twice to get
> utime and stime which can be obtained at same time by one set
> of divisions.
>
> This patch introduces a function task_times(*tsk, *utime, *stime)
> to retrieve utime and stime at once in better, optimized way.
>
> Signed-off-by: Hidetoshi Seto <[email protected]>

OK, the patch looks good, but it does not solve those non monotonic
times funnies that started all this, right?

2009-11-26 10:26:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] remove task_{u,s,g}time()

On Thu, 2009-11-26 at 14:49 +0900, Hidetoshi Seto wrote:
> Now all task_{u,s}time() pairs are replaced by task_times().
> And task_gtime() is too simple to be an inline function.
>
> Cleanup them all.

Cleanups are always nice!

2009-11-26 10:27:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] define nsecs_to_jiffies()

On Thu, 2009-11-26 at 14:49 +0900, Hidetoshi Seto wrote:
> Use of msecs_to_jiffies() for nsecs_to_cputime() have some problems:
>
> - The type of msecs_to_jiffies()'s argument is unsigned int, so
> it cannot convert msecs greater than UINT_MAX = about 49.7 days.
> - msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument
> is set, assuming that input was negative value. So it cannot
> convert msecs greater than INT_MAX = about 24.8 days too.
>
> This patch defines a new function nsecs_to_jiffies() that can deal
> greater values, and that can deal all incoming values as unsigned.

This looks good too,

Thanks Hidetoshi-san!

2009-11-26 12:34:31

by Hidetoshi Seto

[permalink] [raw]
Subject: [tip:sched/core] sched: Introduce task_times() to replace task_{u,s}time() pair

Commit-ID: d180c5bccec02612256fd8076ff3c1fac3429553
Gitweb: http://git.kernel.org/tip/d180c5bccec02612256fd8076ff3c1fac3429553
Author: Hidetoshi Seto <[email protected]>
AuthorDate: Thu, 26 Nov 2009 14:48:30 +0900
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 26 Nov 2009 12:59:19 +0100

sched: Introduce task_times() to replace task_{u,s}time() pair

Functions task_{u,s}time() are called in pair in almost all
cases. However task_stime() is implemented to call task_utime()
from its inside, so such paired calls run task_utime() twice.

It means we do heavy divisions (div_u64 + do_div) twice to get
utime and stime which can be obtained at same time by one set
of divisions.

This patch introduces a function task_times(*tsk, *utime,
*stime) to retrieve utime and stime at once in better, optimized
way.

Signed-off-by: Hidetoshi Seto <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Cc: Stanislaw Gruszka <[email protected]>
Cc: Spencer Candland <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Americo Wang <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
fs/proc/array.c | 3 +-
include/linux/sched.h | 1 +
kernel/exit.c | 7 ++++-
kernel/sched.c | 55 +++++++++++++++++++++++++++++++-----------------
kernel/sys.c | 3 +-
5 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e209f64..330deda 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -535,8 +535,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- utime = task_utime(task);
- stime = task_stime(task);
+ task_times(task, &utime, &stime);
gtime = task_gtime(task);
}

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78ba664..fe6ae15 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1723,6 +1723,7 @@ static inline void put_task_struct(struct task_struct *t)
extern cputime_t task_utime(struct task_struct *p);
extern cputime_t task_stime(struct task_struct *p);
extern cputime_t task_gtime(struct task_struct *p);
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);

/*
* Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac..29068ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,6 +91,8 @@ static void __exit_signal(struct task_struct *tsk)
if (atomic_dec_and_test(&sig->count))
posix_cpu_timers_exit_group(tsk);
else {
+ cputime_t utime, stime;
+
/*
* If there is any task waiting for the group exit
* then notify it:
@@ -110,8 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, task_utime(tsk));
- sig->stime = cputime_add(sig->stime, task_stime(tsk));
+ task_times(tsk, &utime, &stime);
+ sig->utime = cputime_add(sig->utime, utime);
+ sig->stime = cputime_add(sig->stime, stime);
sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
diff --git a/kernel/sched.c b/kernel/sched.c
index 315ba40..475a6f2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5191,6 +5191,14 @@ cputime_t task_stime(struct task_struct *p)
{
return p->stime;
}
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ if (ut)
+ *ut = task_utime(p);
+ if (st)
+ *st = task_stime(p);
+}
#else

#ifndef nsecs_to_cputime
@@ -5198,41 +5206,48 @@ cputime_t task_stime(struct task_struct *p)
msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
#endif

-cputime_t task_utime(struct task_struct *p)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
- cputime_t utime = p->utime, total = utime + p->stime;
- u64 temp;
+ cputime_t rtime, utime = p->utime, total = utime + p->stime;

/*
* Use CFS's precise accounting:
*/
- temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
+ rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

if (total) {
- temp *= utime;
+ u64 temp;
+
+ temp = (u64)(rtime * utime);
do_div(temp, total);
- }
- utime = (cputime_t)temp;
+ utime = (cputime_t)temp;
+ } else
+ utime = rtime;

+ /*
+ * Compare with previous values, to keep monotonicity:
+ */
p->prev_utime = max(p->prev_utime, utime);
- return p->prev_utime;
+ p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+ if (ut)
+ *ut = p->prev_utime;
+ if (st)
+ *st = p->prev_stime;
+}
+
+cputime_t task_utime(struct task_struct *p)
+{
+ cputime_t utime;
+ task_times(p, &utime, NULL);
+ return utime;
}

cputime_t task_stime(struct task_struct *p)
{
cputime_t stime;
-
- /*
- * Use CFS's precise accounting. (we subtract utime from
- * the total, to make sure the total observed by userspace
- * grows monotonically - apps rely on that):
- */
- stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
-
- if (stime >= 0)
- p->prev_stime = max(p->prev_stime, stime);
-
- return p->prev_stime;
+ task_times(p, NULL, &stime);
+ return stime;
}
#endif

diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760..bbdfce0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1346,8 +1346,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
utime = stime = cputime_zero;

if (who == RUSAGE_THREAD) {
- utime = task_utime(current);
- stime = task_stime(current);
+ task_times(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = p->signal->maxrss;
goto out;

2009-11-26 12:34:27

by Hidetoshi Seto

[permalink] [raw]
Subject: [tip:sched/core] sched: Remove task_{u,s,g}time()

Commit-ID: d5b7c78e975302a1bab28263266c39ecb71acad4
Gitweb: http://git.kernel.org/tip/d5b7c78e975302a1bab28263266c39ecb71acad4
Author: Hidetoshi Seto <[email protected]>
AuthorDate: Thu, 26 Nov 2009 14:49:05 +0900
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 26 Nov 2009 12:59:20 +0100

sched: Remove task_{u,s,g}time()

Now all task_{u,s}time() pairs are replaced by task_times().
And task_gtime() is too simple to be an inline function.

Cleanup them all.

Signed-off-by: Hidetoshi Seto <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Cc: Stanislaw Gruszka <[email protected]>
Cc: Spencer Candland <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Americo Wang <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
fs/proc/array.c | 4 ++--
include/linux/sched.h | 3 ---
kernel/exit.c | 2 +-
kernel/sched.c | 33 ++-------------------------------
4 files changed, 5 insertions(+), 37 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 330deda..ca61a88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- gtime = cputime_add(gtime, task_gtime(t));
+ gtime = cputime_add(gtime, t->gtime);
t = next_thread(t);
} while (t != task);

@@ -536,7 +536,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt = task->min_flt;
maj_flt = task->maj_flt;
task_times(task, &utime, &stime);
- gtime = task_gtime(task);
+ gtime = task->gtime;
}

/* scale priority and nice values from timeslices to -20..20 */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6ae15..0395b0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1720,9 +1720,6 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}

-extern cputime_t task_utime(struct task_struct *p);
-extern cputime_t task_stime(struct task_struct *p);
-extern cputime_t task_gtime(struct task_struct *p);
extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);

/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 29068ab..2eaf68b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,7 +115,7 @@ static void __exit_signal(struct task_struct *tsk)
task_times(tsk, &utime, &stime);
sig->utime = cputime_add(sig->utime, utime);
sig->stime = cputime_add(sig->stime, stime);
- sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+ sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
diff --git a/kernel/sched.c b/kernel/sched.c
index 475a6f2..82251c2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5182,22 +5182,12 @@ void account_idle_ticks(unsigned long ticks)
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-cputime_t task_utime(struct task_struct *p)
-{
- return p->utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
- return p->stime;
-}
-
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
if (ut)
- *ut = task_utime(p);
+ *ut = p->utime;
if (st)
- *st = task_stime(p);
+ *st = p->stime;
}
#else

@@ -5235,27 +5225,8 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
if (st)
*st = p->prev_stime;
}
-
-cputime_t task_utime(struct task_struct *p)
-{
- cputime_t utime;
- task_times(p, &utime, NULL);
- return utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
- cputime_t stime;
- task_times(p, NULL, &stime);
- return stime;
-}
#endif

-inline cputime_t task_gtime(struct task_struct *p)
-{
- return p->gtime;
-}
-
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.

2009-11-26 12:34:39

by Hidetoshi Seto

[permalink] [raw]
Subject: [tip:sched/core] sched, time: Define nsecs_to_jiffies()

Commit-ID: b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5
Gitweb: http://git.kernel.org/tip/b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5
Author: Hidetoshi Seto <[email protected]>
AuthorDate: Thu, 26 Nov 2009 14:49:27 +0900
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 26 Nov 2009 12:59:20 +0100

sched, time: Define nsecs_to_jiffies()

Use of msecs_to_jiffies() for nsecs_to_cputime() have some
problems:

- The type of msecs_to_jiffies()'s argument is unsigned int, so
it cannot convert msecs greater than UINT_MAX = about 49.7 days.

- msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument
is set, assuming that input was negative value. So it cannot
convert msecs greater than INT_MAX = about 24.8 days too.

This patch defines a new function nsecs_to_jiffies() that can
deal greater values, and that can deal all incoming values as
unsigned.

Signed-off-by: Hidetoshi Seto <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Cc: Stanislaw Gruszka <[email protected]>
Cc: Spencer Candland <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Amrico Wang <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: John Stultz <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/jiffies.h | 1 +
kernel/sched.c | 3 +--
kernel/time.c | 30 ++++++++++++++++++++++++++++++
3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 1a9cf78..6811f4b 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
extern unsigned long clock_t_to_jiffies(unsigned long x);
extern u64 jiffies_64_to_clock_t(u64 x);
extern u64 nsec_to_clock_t(u64 x);
+extern unsigned long nsecs_to_jiffies(u64 n);

#define TIMESTAMP_SIZE 30

diff --git a/kernel/sched.c b/kernel/sched.c
index 82251c2..b3d4e2b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5192,8 +5192,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
#else

#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) \
- msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif

void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469..8047980 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
#endif
}

+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+ /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+ return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+ /* overflow after 292 years if HZ = 1024 */
+ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+ /*
+ * Generic case - optimized for cases where HZ is a multiple of 3.
+ * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+ */
+ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
+
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{

2009-11-27 00:37:53

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] introduce task_times() to replace task_{u,s}time() pair

Peter Zijlstra wrote:
> On Thu, 2009-11-26 at 14:48 +0900, Hidetoshi Seto wrote:
>> This patch introduces a function task_times(*tsk, *utime, *stime)
>> to retrieve utime and stime at once in better, optimized way.
>>
>> Signed-off-by: Hidetoshi Seto <[email protected]>
>
> OK, the patch looks good, but it does not solve those non monotonic
> times funnies that started all this, right?

Yes, you are right.
These three patches are cleanup, not bugfix.

I don't have any clear idea to solve the original issue yet...


Thanks,
H.Seto