Remove conversion of nice load to microseconds which caused addition
of times measured in different units and thus unreasonable behaviour
with both governors.
Signed-off-by: Alexander Miller <[email protected]>
---
diff -uprN linux-2.6.git/drivers/cpufreq/cpufreq_conservative.c linux/drivers/cpufreq/cpufreq_conservative.c
--- linux-2.6.git/drivers/cpufreq/cpufreq_conservative.c 2009-10-10 15:56:58.010595257 +0200
+++ linux/drivers/cpufreq/cpufreq_conservative.c 2009-10-10 20:56:52.194598889 +0200
@@ -400,20 +400,10 @@ static void dbs_check_cpu(struct cpu_dbs
j_dbs_info->prev_cpu_idle = cur_idle_time;
if (dbs_tuners_ins.ignore_nice) {
- cputime64_t cur_nice;
- unsigned long cur_nice_jiffies;
-
- cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
+ idle_time += (unsigned int) cputime64_sub(
+ kstat_cpu(j).cpustat.nice,
j_dbs_info->prev_cpu_nice);
- /*
- * Assumption: nice time between sampling periods will
- * be less than 2^32 jiffies for 32 bit sys
- */
- cur_nice_jiffies = (unsigned long)
- cputime64_to_jiffies64(cur_nice);
-
j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
- idle_time += jiffies_to_usecs(cur_nice_jiffies);
}
if (unlikely(!wall_time || wall_time < idle_time))
diff -uprN linux-2.6.git/drivers/cpufreq/cpufreq_ondemand.c linux/drivers/cpufreq/cpufreq_ondemand.c
--- linux-2.6.git/drivers/cpufreq/cpufreq_ondemand.c 2009-10-10 16:00:40.380595816 +0200
+++ linux/drivers/cpufreq/cpufreq_ondemand.c 2009-10-10 20:56:52.195596654 +0200
@@ -488,20 +488,10 @@ static void dbs_check_cpu(struct cpu_dbs
j_dbs_info->prev_cpu_idle = cur_idle_time;
if (dbs_tuners_ins.ignore_nice) {
- cputime64_t cur_nice;
- unsigned long cur_nice_jiffies;
-
- cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
+ idle_time += (unsigned int) cputime64_sub(
+ kstat_cpu(j).cpustat.nice,
j_dbs_info->prev_cpu_nice);
- /*
- * Assumption: nice time between sampling periods will
- * be less than 2^32 jiffies for 32 bit sys
- */
- cur_nice_jiffies = (unsigned long)
- cputime64_to_jiffies64(cur_nice);
-
j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
- idle_time += jiffies_to_usecs(cur_nice_jiffies);
}
if (unlikely(!wall_time || wall_time < idle_time))
>-----Original Message-----
>From: [email protected]
>[mailto:[email protected]] On Behalf Of Alexander Miller
>Sent: Friday, November 06, 2009 8:27 AM
>To: [email protected]
>Cc: [email protected]; Dave Jones
>Subject: [PATCH] cpufreq: fix conservative/ondemand behaviour
>with ignore_nice_load
>
>Remove conversion of nice load to microseconds which caused addition
>of times measured in different units and thus unreasonable behaviour
>with both governors.
Can you describe the "unreasonable behavior" you are seeing? Is it
with NO_HZ enabled or disabled?
I see there can be a problem with this code when NO_HZ is disabled.
But the patch below is not the right solution, as it will result in
adding times in different units when NO_HZ is enabled.
Thanks,
Venki
>
>Signed-off-by: Alexander Miller <[email protected]>
>---
>diff -uprN
>linux-2.6.git/drivers/cpufreq/cpufreq_conservative.c
>linux/drivers/cpufreq/cpufreq_conservative.c
>--- linux-2.6.git/drivers/cpufreq/cpufreq_conservative.c
>2009-10-10 15:56:58.010595257 +0200
>+++ linux/drivers/cpufreq/cpufreq_conservative.c
>2009-10-10 20:56:52.194598889 +0200
>@@ -400,20 +400,10 @@ static void dbs_check_cpu(struct cpu_dbs
> j_dbs_info->prev_cpu_idle = cur_idle_time;
>
> if (dbs_tuners_ins.ignore_nice) {
>- cputime64_t cur_nice;
>- unsigned long cur_nice_jiffies;
>-
>- cur_nice =
>cputime64_sub(kstat_cpu(j).cpustat.nice,
>+ idle_time += (unsigned int) cputime64_sub(
>+ kstat_cpu(j).cpustat.nice,
> j_dbs_info->prev_cpu_nice);
>- /*
>- * Assumption: nice time between
>sampling periods will
>- * be less than 2^32 jiffies for 32 bit sys
>- */
>- cur_nice_jiffies = (unsigned long)
>-
>cputime64_to_jiffies64(cur_nice);
>-
> j_dbs_info->prev_cpu_nice =
>kstat_cpu(j).cpustat.nice;
>- idle_time += jiffies_to_usecs(cur_nice_jiffies);
> }
>
> if (unlikely(!wall_time || wall_time < idle_time))
>diff -uprN linux-2.6.git/drivers/cpufreq/cpufreq_ondemand.c
>linux/drivers/cpufreq/cpufreq_ondemand.c
>--- linux-2.6.git/drivers/cpufreq/cpufreq_ondemand.c
>2009-10-10 16:00:40.380595816 +0200
>+++ linux/drivers/cpufreq/cpufreq_ondemand.c 2009-10-10
>20:56:52.195596654 +0200
>@@ -488,20 +488,10 @@ static void dbs_check_cpu(struct cpu_dbs
> j_dbs_info->prev_cpu_idle = cur_idle_time;
>
> if (dbs_tuners_ins.ignore_nice) {
>- cputime64_t cur_nice;
>- unsigned long cur_nice_jiffies;
>-
>- cur_nice =
>cputime64_sub(kstat_cpu(j).cpustat.nice,
>+ idle_time += (unsigned int) cputime64_sub(
>+ kstat_cpu(j).cpustat.nice,
> j_dbs_info->prev_cpu_nice);
>- /*
>- * Assumption: nice time between
>sampling periods will
>- * be less than 2^32 jiffies for 32 bit sys
>- */
>- cur_nice_jiffies = (unsigned long)
>-
>cputime64_to_jiffies64(cur_nice);
>-
> j_dbs_info->prev_cpu_nice =
>kstat_cpu(j).cpustat.nice;
>- idle_time += jiffies_to_usecs(cur_nice_jiffies);
> }
>
> if (unlikely(!wall_time || wall_time < idle_time))
>--
>To unsubscribe from this list: send the line "unsubscribe cpufreq" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>-
On Tue, Nov 10, 2009 at 11:42:02AM -0800, Pallipadi, Venkatesh wrote:
> >-----Original Message-----
> >From: [email protected]
> >[mailto:[email protected]] On Behalf Of Alexander Miller
> >Sent: Friday, November 06, 2009 8:27 AM
> >To: [email protected]
> >Cc: [email protected]; Dave Jones
> >Subject: [PATCH] cpufreq: fix conservative/ondemand behaviour
> >with ignore_nice_load
> >
> >Remove conversion of nice load to microseconds which caused addition
> >of times measured in different units and thus unreasonable behaviour
> >with both governors.
>
> Can you describe the "unresonable behavior" you are seeing. Is it
> with NO_HZ enabled or disabled?
>
> I see there can be a problem with this code when NO_HZ is disabled.
> But, the patch below is not the right solution as it will result in
> Adding times in different units with NO_HZ enabled.
>
> Thanks,
> Venki
Does the below test patch (only compile tested) resolve the problem you
are seeing?
Thanks,
Venki
---
drivers/cpufreq/cpufreq_conservative.c | 4 ++--
drivers/cpufreq/cpufreq_ondemand.c | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index bc33ddc..c7b081b 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -116,9 +116,9 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
idle_time = cputime64_sub(cur_wall_time, busy_time);
if (wall)
- *wall = cur_wall_time;
+ *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
- return idle_time;
+ return (cputime64_t)jiffies_to_usecs(idle_time);;
}
static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 071699d..4b34ade 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -133,9 +133,9 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
idle_time = cputime64_sub(cur_wall_time, busy_time);
if (wall)
- *wall = cur_wall_time;
+ *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
- return idle_time;
+ return (cputime64_t)jiffies_to_usecs(idle_time);
}
static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
--
1.6.0.6
On Tue, Nov 10, 2009, Pallipadi, Venkatesh wrote:
> > Can you describe the "unresonable behavior" you are seeing. Is it
> > with NO_HZ enabled or disabled?
$ zgrep NO_HZ /proc/config.gz
# CONFIG_NO_HZ is not set
When there are two cpu-intense processes, one with nice 19 and the other
with nice 0, then the latter will use almost 100% cpu time, of course.
But the cpu has been stuck at the lowest frequency without the patch.
To be exact, it would change the freq sometimes, but return to the
lowest freq within a fraction of a second.
I would expect it to select a freq such that the non-nice processes
take <80% or the highest freq (which it does with the patch).
> > I see there can be a problem with this code when NO_HZ is disabled.
> > But, the patch below is not the right solution as it will result in
> > Adding times in different units with NO_HZ enabled.
Yes, you are right. Looks like I've patched the wrong half of the
inconsistency :-(
I think it's a bit irritating you are using cputime64_t to store
microseconds. At least it fooled me (I'm no kernel guy though) into
thinking that get_cpu_idle_time_jiffy() returning jiffies was the
intended behaviour.
> Does the below test patch (only compile tested) resolve the problem you
> are seeing?
I've just rebooted the machine with the new patched kernel, and
it looks good.
Thank you,
Alex
> ---
> drivers/cpufreq/cpufreq_conservative.c | 4 ++--
> drivers/cpufreq/cpufreq_ondemand.c | 4 ++--
> 2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
> index bc33ddc..c7b081b 100644
> --- a/drivers/cpufreq/cpufreq_conservative.c
> +++ b/drivers/cpufreq/cpufreq_conservative.c
> @@ -116,9 +116,9 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
>
> idle_time = cputime64_sub(cur_wall_time, busy_time);
> if (wall)
> - *wall = cur_wall_time;
> + *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
>
> - return idle_time;
> + return (cputime64_t)jiffies_to_usecs(idle_time);;
> }
>
> static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
> index 071699d..4b34ade 100644
> --- a/drivers/cpufreq/cpufreq_ondemand.c
> +++ b/drivers/cpufreq/cpufreq_ondemand.c
> @@ -133,9 +133,9 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
>
> idle_time = cputime64_sub(cur_wall_time, busy_time);
> if (wall)
> - *wall = cur_wall_time;
> + *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
>
> - return idle_time;
> + return (cputime64_t)jiffies_to_usecs(idle_time);
> }
>
> static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
> --
> 1.6.0.6
On Wed, 2009-11-11 at 14:10 -0800, Alexander Miller wrote:
> On Tue, Nov 10, 2009, Pallipadi, Venkatesh wrote:
> > > Can you describe the "unresonable behavior" you are seeing. Is it
> > > with NO_HZ enabled or disabled?
> $ zgrep NO_HZ /proc/config.gz
> # CONFIG_NO_HZ is not set
>
> When there are two cpu-intense processes, one with nice 19 and the other
> with nice 0, then the latter will use almost 100% cpu time, of course.
> But the cpu has been stuck at the lowest frequency without the patch.
> To be exact, it would change the freq sometimes, but return to the
> lowest freq within a fraction of a second.
> I would expect it to select a freq such that the non-nice processes
> take <80% or the highest freq (which it does with the patch).
>
> > > I see there can be a problem with this code when NO_HZ is disabled.
> > > But, the patch below is not the right solution as it will result in
> > > Adding times in different units with NO_HZ enabled.
>
> Yes, you are right. Looks like I've patched the wrong half of the
> inconsistency :-(
> I think it's a bit irritating you are using cputime64_t to store
> microseconds. At least it fooled me (I'm no kernel guy though) into
> thinking that get_cpu_idle_time_jiffy() returning jiffies was the
> intended behaviour.
Agreed. That use of cputime64_t is ugly. There is also some confusion in the
naming of get_cpu_idle_time_jiffy and get_cpu_idle_time_us, and these routines
are redundantly duplicated across the two files. I will add these to my todo
queue.
>
> > Does the below test patch (only compile tested) resolve the problem you
> > are seeing?
>
> I've just rebooted the machine with the new patched kernel, and
> it looks good.
Ok. Thanks for reporting (and diagnosing as well :)) and verifying the
patch. I will resend the patch with a bit more description and your
Reported/Tested-by.
Thanks,
Venki
Dave,
Here is the fix for the bug reported on this thread. Please apply. It looks
to be a stable candidate as well.
Thanks,
Venki
The ondemand and conservative governors are mixing up time units in the
code path where NO_HZ is not enabled and ignore_nice is set. The wall time
and idle time are stored in jiffies, while the nice time calculation is done
in microseconds.
The problem was reported and diagnosed by Alexander here.
http://marc.info/?l=linux-kernel&m=125752550404513&w=2
The patch below fixes this thinko by making get_cpu_idle_time_jiffy() report
the wall time and idle time in microseconds.
Reported-by: Alexander Miller <[email protected]>
Tested-by: Alexander Miller <[email protected]>
Signed-off-by: Venkatesh Pallipadi <[email protected]>
---
drivers/cpufreq/cpufreq_conservative.c | 4 ++--
drivers/cpufreq/cpufreq_ondemand.c | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index bc33ddc..c7b081b 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -116,9 +116,9 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
idle_time = cputime64_sub(cur_wall_time, busy_time);
if (wall)
- *wall = cur_wall_time;
+ *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
- return idle_time;
+ return (cputime64_t)jiffies_to_usecs(idle_time);;
}
static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 071699d..4b34ade 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -133,9 +133,9 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
idle_time = cputime64_sub(cur_wall_time, busy_time);
if (wall)
- *wall = cur_wall_time;
+ *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
- return idle_time;
+ return (cputime64_t)jiffies_to_usecs(idle_time);
}
static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
--
1.6.0.6