The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
as found in the conservative governor, to choose the frequency increase step,
expressed as percentage (default = 100 is previous behaviour).
This allows fine-tuning of power saving on mobile CPUs, since smaller steps make it possible to:
* absorb brief load spikes
* stabilize at the needed frequency, without passing through more power-consuming states
Signed-off-by: Corrado Zoccolo [email protected]
---
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index e741c33..baa7b5e 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -83,6 +83,7 @@ struct cpu_dbs_info_s {
unsigned int freq_lo;
unsigned int freq_lo_jiffies;
unsigned int freq_hi_jiffies;
+ int requested_delta;
int cpu;
unsigned int enable:1,
sample_type:1;
@@ -112,11 +113,13 @@ static struct dbs_tuners {
unsigned int down_differential;
unsigned int ignore_nice;
unsigned int powersave_bias;
+ unsigned int freq_step;
} dbs_tuners_ins = {
.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
.ignore_nice = 0,
.powersave_bias = 0,
+ .freq_step = 100,
};
static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
@@ -261,6 +264,7 @@ show_one(sampling_rate, sampling_rate);
show_one(up_threshold, up_threshold);
show_one(ignore_nice_load, ignore_nice);
show_one(powersave_bias, powersave_bias);
+show_one(freq_step, freq_step);
static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
const char *buf, size_t count)
@@ -358,6 +362,28 @@ static ssize_t store_powersave_bias(struct cpufreq_policy *unused,
return count;
}
+static ssize_t store_freq_step(struct cpufreq_policy *policy,
+ const char *buf, size_t count)
+{
+ unsigned int input;
+ int ret;
+ ret = sscanf(buf, "%u", &input);
+
+ if (ret != 1)
+ return -EINVAL;
+
+ if (input > 100)
+ input = 100;
+
+ /* no need to test here if freq_step is zero as the user might actually
+ * want this, they would be crazy though :) */
+ mutex_lock(&dbs_mutex);
+ dbs_tuners_ins.freq_step = input;
+ mutex_unlock(&dbs_mutex);
+
+ return count;
+}
+
#define define_one_rw(_name) \
static struct freq_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)
@@ -366,6 +392,7 @@ define_one_rw(sampling_rate);
define_one_rw(up_threshold);
define_one_rw(ignore_nice_load);
define_one_rw(powersave_bias);
+define_one_rw(freq_step);
static struct attribute *dbs_attributes[] = {
&sampling_rate_max.attr,
@@ -374,6 +401,7 @@ static struct attribute *dbs_attributes[] = {
&up_threshold.attr,
&ignore_nice_load.attr,
&powersave_bias.attr,
+ &freq_step.attr,
NULL
};
@@ -464,19 +492,30 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
/* Check for frequency increase */
if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
+ unsigned int freq_target = this_dbs_info->requested_delta
+ + policy->cur;
+ unsigned int freq_step;
+
/* if we are already at full speed then break out early */
- if (!dbs_tuners_ins.powersave_bias) {
- if (policy->cur == policy->max)
- return;
+ if (freq_target == policy->max)
+ return;
+
+ freq_step = (dbs_tuners_ins.freq_step * (policy->max-policy->min))
+ / 100;
- __cpufreq_driver_target(policy, policy->max,
- CPUFREQ_RELATION_H);
+ freq_target += max(freq_step, 5U);
+ freq_target = max(policy->min, min(policy->max, freq_target));
+
+ if (!dbs_tuners_ins.powersave_bias) {
+ __cpufreq_driver_target(policy, freq_target,
+ CPUFREQ_RELATION_H);
} else {
- int freq = powersave_bias_target(policy, policy->max,
- CPUFREQ_RELATION_H);
+ unsigned int freq = powersave_bias_target(policy, freq_target,
+ CPUFREQ_RELATION_H);
__cpufreq_driver_target(policy, freq,
CPUFREQ_RELATION_L);
}
+ this_dbs_info->requested_delta = freq_target - policy->cur;
return;
}
@@ -507,6 +546,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
__cpufreq_driver_target(policy, freq,
CPUFREQ_RELATION_L);
}
+ this_dbs_info->requested_delta = freq_next - policy->cur;
}
}
On Wed July 8 2009, Corrado Zoccolo wrote:
> The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
> as found in conservative governor, to chose the frequency increase step,
> expressed as percentage (default = 100 is previous behaviour).
>
> This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
> * absorb punctual load spikes
> * stabilize at the needed frequency, without passing for more power consuming states, and
>
Has this been tested on VIA C7-M and similar VIA products?
Reason I ask is because they only step in increments of the
clock multiplier - which varies among the models.
Also, the factory recommendation is to stay on the freq/voltage
curve for each product.
Only the programmer-accessible VID (Voltage IDentifier) codes
are (on-silicon) lookup table mapped to the VRM (Voltage Regulator Module)
control lines - not all products provide an "on curve" mapping
for each possible multiplier step.
How about a few conditional statements in this bit of code, please.
Mike
> Signed-off-by: Corrado Zoccolo [email protected]
>
> ---
> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
> index e741c33..baa7b5e 100644
> --- a/drivers/cpufreq/cpufreq_ondemand.c
> +++ b/drivers/cpufreq/cpufreq_ondemand.c
> @@ -83,6 +83,7 @@ struct cpu_dbs_info_s {
> unsigned int freq_lo;
> unsigned int freq_lo_jiffies;
> unsigned int freq_hi_jiffies;
> + int requested_delta;
> int cpu;
> unsigned int enable:1,
> sample_type:1;
> @@ -112,11 +113,13 @@ static struct dbs_tuners {
> unsigned int down_differential;
> unsigned int ignore_nice;
> unsigned int powersave_bias;
> + unsigned int freq_step;
> } dbs_tuners_ins = {
> .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
> .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
> .ignore_nice = 0,
> .powersave_bias = 0,
> + .freq_step = 100,
> };
>
> static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
> @@ -261,6 +264,7 @@ show_one(sampling_rate, sampling_rate);
> show_one(up_threshold, up_threshold);
> show_one(ignore_nice_load, ignore_nice);
> show_one(powersave_bias, powersave_bias);
> +show_one(freq_step, freq_step);
>
> static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
> const char *buf, size_t count)
> @@ -358,6 +362,28 @@ static ssize_t store_powersave_bias(struct cpufreq_policy *unused,
> return count;
> }
>
> +static ssize_t store_freq_step(struct cpufreq_policy *policy,
> + const char *buf, size_t count)
> +{
> + unsigned int input;
> + int ret;
> + ret = sscanf(buf, "%u", &input);
> +
> + if (ret != 1)
> + return -EINVAL;
> +
> + if (input > 100)
> + input = 100;
> +
> + /* no need to test here if freq_step is zero as the user might actually
> + * want this, they would be crazy though :) */
> + mutex_lock(&dbs_mutex);
> + dbs_tuners_ins.freq_step = input;
> + mutex_unlock(&dbs_mutex);
> +
> + return count;
> +}
> +
> #define define_one_rw(_name) \
> static struct freq_attr _name = \
> __ATTR(_name, 0644, show_##_name, store_##_name)
> @@ -366,6 +392,7 @@ define_one_rw(sampling_rate);
> define_one_rw(up_threshold);
> define_one_rw(ignore_nice_load);
> define_one_rw(powersave_bias);
> +define_one_rw(freq_step);
>
> static struct attribute *dbs_attributes[] = {
> &sampling_rate_max.attr,
> @@ -374,6 +401,7 @@ static struct attribute *dbs_attributes[] = {
> &up_threshold.attr,
> &ignore_nice_load.attr,
> &powersave_bias.attr,
> + &freq_step.attr,
> NULL
> };
>
> @@ -464,19 +492,30 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
>
> /* Check for frequency increase */
> if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
> + unsigned int freq_target = this_dbs_info->requested_delta
> + + policy->cur;
> + unsigned int freq_step;
> +
> /* if we are already at full speed then break out early */
> - if (!dbs_tuners_ins.powersave_bias) {
> - if (policy->cur == policy->max)
> - return;
> + if (freq_target == policy->max)
> + return;
> +
> + freq_step = (dbs_tuners_ins.freq_step * (policy->max-policy->min))
> + / 100;
>
> - __cpufreq_driver_target(policy, policy->max,
> - CPUFREQ_RELATION_H);
> + freq_target += max(freq_step, 5U);
> + freq_target = max(policy->min, min(policy->max, freq_target));
> +
> + if (!dbs_tuners_ins.powersave_bias) {
> + __cpufreq_driver_target(policy, freq_target,
> + CPUFREQ_RELATION_H);
> } else {
> - int freq = powersave_bias_target(policy, policy->max,
> - CPUFREQ_RELATION_H);
> + unsigned int freq = powersave_bias_target(policy, freq_target,
> + CPUFREQ_RELATION_H);
> __cpufreq_driver_target(policy, freq,
> CPUFREQ_RELATION_L);
> }
> + this_dbs_info->requested_delta = freq_target - policy->cur;
> return;
> }
>
> @@ -507,6 +546,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
> __cpufreq_driver_target(policy, freq,
> CPUFREQ_RELATION_L);
> }
> + this_dbs_info->requested_delta = freq_next - policy->cur;
> }
> }
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
>
On Wed, Jul 08, 2009 at 03:56:33PM +0200, Corrado Zoccolo wrote:
> The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
> as found in conservative governor, to chose the frequency increase step,
> expressed as percentage (default = 100 is previous behaviour).
>
> This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
> * absorb punctual load spikes
> * stabilize at the needed frequency, without passing for more power consuming states, and
Is this a measured powersaving? The ondemand model is based on the
assumption that the idle state is disproportionately lower in power than
any running state, and therefore it's more sensible to run flat out for
short periods of time than run at half speed for longer. Is this
inherently flawed, or is it an artifact of differences in your processor
design?
--
Matthew Garrett | [email protected]
On Wed, 2009-07-08 at 09:10 -0700, Matthew Garrett wrote:
> On Wed, Jul 08, 2009 at 03:56:33PM +0200, Corrado Zoccolo wrote:
> > The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
> > as found in conservative governor, to chose the frequency increase step,
> > expressed as percentage (default = 100 is previous behaviour).
> >
> > This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
> > * absorb punctual load spikes
> > * stabilize at the needed frequency, without passing for more power consuming states, and
>
> Is this a measured powersaving? The ondemand model is based on the
> assumption that the idle state is disproportionately lower in power than
> any running state, and therefore it's more sensible to run flat out for
> short periods of time than run at half speed for longer. Is this
> inherently flawed, or is it an artifact of differences in your processor
> design?
>
As Matthew mentioned, ondemand governor wants to run at highest speed
and get to idle sooner. Another aspect of ondemand governor is to have
very low response time for freq increase on sudden increase in load.
With freq_step, it may take long time before we can respond to sudden
increase of load from idle to full busy.
Even though you have default step as 100, as soon as we have this
variable, there will be users/distros setting it in a wrong way.
So, it will be interesting to see any data you have with and without
this change.
Alternatives to explore would be:
- Can we identify some characteristics of this system and turn this on
automatically instead of user tunable.
- Long standing goal of combining conservative and ondemand with a
mode_switch at the driver load, instead of run time tunables.
Thanks,
Venki
Hi Matthew,
On Wed, Jul 8, 2009 at 6:10 PM, Matthew Garrett<[email protected]> wrote:
> On Wed, Jul 08, 2009 at 03:56:33PM +0200, Corrado Zoccolo wrote:
>> The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
>> as found in conservative governor, to chose the frequency increase step,
>> expressed as percentage (default = 100 is previous behaviour).
>>
>> This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
>> * absorb punctual load spikes
>> * stabilize at the needed frequency, without passing for more power consuming states, and
>
> Is this a measured powersaving? The ondemand model is based on the
> assumption that the idle state is disproportionately lower in power than
> any running state, and therefore it's more sensible to run flat out for
> short periods of time than run at half speed for longer. Is this
> inherently flawed, or is it an artifact of differences in your processor
> design?
The flawed assumption is that running at doubled frequency halves the
completion time.
On CPUs that can change the core speed without impacting the
memory-cache bandwidth
(e.g. the Pentium M), workloads that access a lot of memory run at the
same speed at
maximum and minimum frequency.
Now I see new CPUs that can flush their cache during deep idle states (Atoms);
this aggravates the aforementioned problem, rendering the high
frequency state much less appealing.
Corrado
>
> --
> Matthew Garrett | [email protected]
>
--
__________________________________________________________________________
dott. Corrado Zoccolo mailto:[email protected]
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
Hi Michael,
On Wed, Jul 8, 2009 at 4:18 PM, Michael S. Zick<[email protected]> wrote:
> On Wed July 8 2009, Corrado Zoccolo wrote:
>> The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
>> as found in conservative governor, to chose the frequency increase step,
>> expressed as percentage (default = 100 is previous behaviour).
>>
>> This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
>> * absorb punctual load spikes
>> * stabilize at the needed frequency, without passing for more power consuming states, and
>>
>
> Has this been tested on VIA C7-M and similar VIA products?
> Reason I ask is because they only step in increments of the
> clock multiplier - which varies among the models.
>
Using the correct clock multipliers and voltages is the driver's duty.
This change instead affects the governor, which selects, according to
a policy, which of the available frequencies will be used.
Corrado
> Also, the factory recommendation is to stay on the freq/voltage
> curve for each product.
> Only the programmer accessable VID (Voltage IDentifier) codes
> are (on-silicon) lookup table mapped to the VRM (Voltage Regulator Module)
> control lines - not all products provide an "on curve" mapping
> for each possible multiplier step.
>
> How about a few conditional statements in this bit of code, please.
>
> Mike
>> Signed-off-by: Corrado Zoccolo [email protected]
>>
>> ---
>> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
>> index e741c33..baa7b5e 100644
>> --- a/drivers/cpufreq/cpufreq_ondemand.c
>> +++ b/drivers/cpufreq/cpufreq_ondemand.c
>> @@ -83,6 +83,7 @@ struct cpu_dbs_info_s {
>> unsigned int freq_lo;
>> unsigned int freq_lo_jiffies;
>> unsigned int freq_hi_jiffies;
>> + int requested_delta;
>> int cpu;
>> unsigned int enable:1,
>> sample_type:1;
>> @@ -112,11 +113,13 @@ static struct dbs_tuners {
>> unsigned int down_differential;
>> unsigned int ignore_nice;
>> unsigned int powersave_bias;
>> + unsigned int freq_step;
>> } dbs_tuners_ins = {
>> .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
>> .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
>> .ignore_nice = 0,
>> .powersave_bias = 0,
>> + .freq_step = 100,
>> };
>>
>> static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
>> @@ -261,6 +264,7 @@ show_one(sampling_rate, sampling_rate);
>> show_one(up_threshold, up_threshold);
>> show_one(ignore_nice_load, ignore_nice);
>> show_one(powersave_bias, powersave_bias);
>> +show_one(freq_step, freq_step);
>>
>> static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
>> const char *buf, size_t count)
>> @@ -358,6 +362,28 @@ static ssize_t store_powersave_bias(struct cpufreq_policy *unused,
>> return count;
>> }
>>
>> +static ssize_t store_freq_step(struct cpufreq_policy *policy,
>> + const char *buf, size_t count)
>> +{
>> + unsigned int input;
>> + int ret;
>> + ret = sscanf(buf, "%u", &input);
>> +
>> + if (ret != 1)
>> + return -EINVAL;
>> +
>> + if (input > 100)
>> + input = 100;
>> +
>> + /* no need to test here if freq_step is zero as the user might actually
>> + * want this, they would be crazy though :) */
>> + mutex_lock(&dbs_mutex);
>> + dbs_tuners_ins.freq_step = input;
>> + mutex_unlock(&dbs_mutex);
>> +
>> + return count;
>> +}
>> +
>> #define define_one_rw(_name) \
>> static struct freq_attr _name = \
>> __ATTR(_name, 0644, show_##_name, store_##_name)
>> @@ -366,6 +392,7 @@ define_one_rw(sampling_rate);
>> define_one_rw(up_threshold);
>> define_one_rw(ignore_nice_load);
>> define_one_rw(powersave_bias);
>> +define_one_rw(freq_step);
>>
>> static struct attribute *dbs_attributes[] = {
>> &sampling_rate_max.attr,
>> @@ -374,6 +401,7 @@ static struct attribute *dbs_attributes[] = {
>> &up_threshold.attr,
>> &ignore_nice_load.attr,
>> &powersave_bias.attr,
>> + &freq_step.attr,
>> NULL
>> };
>>
>> @@ -464,19 +492,30 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
>>
>> /* Check for frequency increase */
>> if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
>> + unsigned int freq_target = this_dbs_info->requested_delta
>> + + policy->cur;
>> + unsigned int freq_step;
>> +
>> /* if we are already at full speed then break out early */
>> - if (!dbs_tuners_ins.powersave_bias) {
>> - if (policy->cur == policy->max)
>> - return;
>> + if (freq_target == policy->max)
>> + return;
>> +
>> + freq_step = (dbs_tuners_ins.freq_step * (policy->max-policy->min))
>> + / 100;
>>
>> - __cpufreq_driver_target(policy, policy->max,
>> - CPUFREQ_RELATION_H);
>> + freq_target += max(freq_step, 5U);
>> + freq_target = max(policy->min, min(policy->max, freq_target));
>> +
>> + if (!dbs_tuners_ins.powersave_bias) {
>> + __cpufreq_driver_target(policy, freq_target,
>> + CPUFREQ_RELATION_H);
>> } else {
>> - int freq = powersave_bias_target(policy, policy->max,
>> - CPUFREQ_RELATION_H);
>> + unsigned int freq = powersave_bias_target(policy, freq_target,
>> + CPUFREQ_RELATION_H);
>> __cpufreq_driver_target(policy, freq,
>> CPUFREQ_RELATION_L);
>> }
>> + this_dbs_info->requested_delta = freq_target - policy->cur;
>> return;
>> }
>>
>> @@ -507,6 +546,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
>> __cpufreq_driver_target(policy, freq,
>> CPUFREQ_RELATION_L);
>> }
>> + this_dbs_info->requested_delta = freq_next - policy->cur;
>> }
>> }
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>>
>>
>
>
>
--
__________________________________________________________________________
dott. Corrado Zoccolo mailto:[email protected]
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
On Wed, Jul 08, 2009 at 07:41:23PM +0200, Corrado Zoccolo wrote:
> Hi Matthew,
> > Is this a measured powersaving? The ondemand model is based on the
> > assumption that the idle state is disproportionately lower in power than
> > any running state, and therefore it's more sensible to run flat out for
> > short periods of time than run at half speed for longer. Is this
> > inherently flawed, or is it an artifact of differences in your processor
> > design?
>
> The flawed assumption is that running at doubled frequency halves the
> completion time.
> On cpus that can change the core speed without impacting the
> memory-cache bandwidth
> (i.e. the Pentium M), workloads that access lot of memory go at the
> same speed at
> maximum and minimum frequency.
> Now I see new CPUs that can flush their cache during deep idle states (Atoms),
> this aggravates the aforementioned problem, rendering the high
> frequency state much less appetible.
Do you have numbers to support this? What effect does the ramping up
have on user-visible latency?
--
Matthew Garrett | [email protected]
Hi Venki,
On Wed, Jul 8, 2009 at 6:33 PM, Pallipadi,
Venkatesh<[email protected]> wrote:
> On Wed, 2009-07-08 at 09:10 -0700, Matthew Garrett wrote:
>> On Wed, Jul 08, 2009 at 03:56:33PM +0200, Corrado Zoccolo wrote:
>> > The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
>> > as found in conservative governor, to chose the frequency increase step,
>> > expressed as percentage (default = 100 is previous behaviour).
>> >
>> > This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
>> > * absorb punctual load spikes
>> > * stabilize at the needed frequency, without passing for more power consuming states, and
>>
>> Is this a measured powersaving? The ondemand model is based on the
>> assumption that the idle state is disproportionately lower in power than
>> any running state, and therefore it's more sensible to run flat out for
>> short periods of time than run at half speed for longer. Is this
>> inherently flawed, or is it an artifact of differences in your processor
>> design?
>>
>
> As Matthew mentioned, ondemand governor wants to run at highest speed
> and get to idle sooner. Another aspect of ondemand governor is to have
> very low response time for freq increase on sudden increase in load.
> With freq_step, it may take long time before we can respond to sudden
> increase of load from idle to full busy.
Yes. freq_step is a tunable that allows trading latency for power saving.
>
> Even though you have default step as 100, as soon as we have this
> variable, there will be users/distros setting it in a wrong way.
>
I think having it as a tunable adds some value, since power-managing
applications could use various inputs to choose the best value at run
time.
Some of them allow specifying complex policies, such as switching to the
powersave governor when the battery charge is below a threshold.
With this change, one can gradually transition from current ondemand
to powersave, passing through intermediate states,
by feeding back the battery charge % into it, so the system is more
responsive when fully charged, and tries to save more power when it is
running low.
> So, it will be interesting to see any data you have with and without
> this change.
Ok, I'll collect some data.
>
> Alternatives to explore would be:
> - Can we identify some characteristics of this system and turn this on
> automatically instead of user tunable.
I think the user tunable has some reason to exist, to implement more
complex user policies as explained above.
We can identify, though, cases in which this should never be enabled,
like the P4s, in which the clock modulation reduces also the
memory-cache bandwidth. In those cases, the additional latency is very
noticeable, and can also be a loss in terms of power.
> - Long standing goal of combining conservative and ondemand with a
> mode_switch at the driver load, instead of run time tunables.
>
> Thanks,
> Venki
>
>
--
__________________________________________________________________________
dott. Corrado Zoccolo mailto:[email protected]
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
On Wed 2009-07-08 19:41:23, Corrado Zoccolo wrote:
> Hi Matthew,
>
> On Wed, Jul 8, 2009 at 6:10 PM, Matthew Garrett<[email protected]> wrote:
> > On Wed, Jul 08, 2009 at 03:56:33PM +0200, Corrado Zoccolo wrote:
> >> The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
> >> as found in conservative governor, to chose the frequency increase step,
> >> expressed as percentage (default = 100 is previous behaviour).
> >>
> >> This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
> >> * absorb punctual load spikes
> >> * stabilize at the needed frequency, without passing for more power consuming states, and
> >
> > Is this a measured powersaving? The ondemand model is based on the
> > assumption that the idle state is disproportionately lower in power than
> > any running state, and therefore it's more sensible to run flat out for
> > short periods of time than run at half speed for longer. Is this
> > inherently flawed, or is it an artifact of differences in your processor
> > design?
Different processors behave differently -- that assumption is wrong at
least for old athlon64s... Those have power-hungry idle states, and 4x
power consumption at 2x frequency....
(Original Intel speedstep was similar iirc).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Wed, Jul 8, 2009 at 7:47 PM, Matthew Garrett<[email protected]> wrote:
> On Wed, Jul 08, 2009 at 07:41:23PM +0200, Corrado Zoccolo wrote:
>> Hi Matthew,
>> > Is this a measured powersaving? The ondemand model is based on the
>> > assumption that the idle state is disproportionately lower in power than
>> > any running state, and therefore it's more sensible to run flat out for
>> > short periods of time than run at half speed for longer. Is this
>> > inherently flawed, or is it an artifact of differences in your processor
>> > design?
>>
>> The flawed assumption is that running at doubled frequency halves the
>> completion time.
>> On cpus that can change the core speed without impacting the
>> memory-cache bandwidth
>> (i.e. the Pentium M), workloads that access lot of memory go at the
>> same speed at
>> maximum and minimum frequency.
>> Now I see new CPUs that can flush their cache during deep idle states (Atoms),
>> this aggravates the aforementioned problem, rendering the high
>> frequency state much less appetible.
>
> Do you have numbers to support this? What effect does the ramping up
> have on user-visible latency?
I have the numbers now (see attached).
On my Pentium M machine, I ran the following test twice, once with
freq_step = 100 and once with 5, sampling the remaining capacity every
20 minutes, for 12 samples.
* booted from battery after full discharge and full recharge.
* started firefox (with empty page)
* run the script:
# Battery-drain test script. The first argument ($1) is the freq_step
# value under test; it is written to the ondemand tunable and also used
# in the names of the output files.
for cpu in /sys/devices/system/cpu/cpu[0]/; do
# reset to defaults for my system
cat $cpu/cpufreq/phc_default_vids > $cpu/cpufreq/phc_vids
echo ondemand > $cpu/cpufreq/scaling_governor
echo 0 > $cpu/cpufreq/ondemand/ignore_nice_load
echo 20000 > $cpu/cpufreq/ondemand/sampling_rate
done
# Apply the freq_step value passed as $1.
for cpu in /sys/devices/system/cpu/cpu[0]/; do
echo $1 > $cpu/cpufreq/ondemand/freq_step;
done
# Stop the screensaver, flush pending writes and switch the LVDS output
# off before sampling starts.
killall xscreensaver
sync
xrandr --output LVDS --off
# Take 12 snapshots (0..11) of the battery state, 1200 s (20 minutes)
# apart; each snapshot is saved as test.<freq_step>.<sample>.bat.
for i in `seq 0 11`; do
cat /proc/acpi/battery/BAT*/state > test.$1.$i.bat
sleep 1200
done
# Switch the LVDS output back on, then power the machine off.
xrandr --output LVDS --auto
shutdown -h now
The attached tsv shows that freq_step=5 saves around 1%-2% of power
with respect to freq_step=100 (default cpufreq behaviour).
>
> --
> Matthew Garrett | [email protected]
>
--
__________________________________________________________________________
dott. Corrado Zoccolo mailto:[email protected]
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
On Fri, Jul 10, 2009 at 1:34 AM, Pavel Machek<[email protected]> wrote:
> On Wed 2009-07-08 19:41:23, Corrado Zoccolo wrote:
>> Hi Matthew,
>>
>> On Wed, Jul 8, 2009 at 6:10 PM, Matthew Garrett<[email protected]> wrote:
>> > On Wed, Jul 08, 2009 at 03:56:33PM +0200, Corrado Zoccolo wrote:
>> >> The patch introduces a new sysfs tunable cpufreq/ondemand/freq_step,
>> >> as found in conservative governor, to chose the frequency increase step,
>> >> expressed as percentage (default = 100 is previous behaviour).
>> >>
>> >> This allows fine tuning powersaving on mobile CPUs, since smaller steps will allow to:
>> >> * absorb punctual load spikes
>> >> * stabilize at the needed frequency, without passing for more power consuming states, and
>> >
>> > Is this a measured powersaving? The ondemand model is based on the
>> > assumption that the idle state is disproportionately lower in power than
>> > any running state, and therefore it's more sensible to run flat out for
>> > short periods of time than run at half speed for longer. Is this
>> > inherently flawed, or is it an artifact of differences in your processor
>> > design?
>
> Different processors behave differently -- that assumption is wrong at
> least for old athlon64s... Those have power-hungry idle states, and 4x
> power consumption at 2x frequency....
>
> (Original Intel speedstep was similar iirc).
Actually, I think that if the assumption was completely fulfilled by
some hardware, the 'performance' governor would be better than
cpufreq, since it would have less latency, and reach idle faster.
However, even for recent hardware, at least states C1/C2 drain more
power at higher frequencies, so cpufreq is actually useful, and
avoiding unnecessary peaks (as this patch intends to do) can be useful
as well.
Moreover, as I said before, for some processors, performance is
not directly proportional to the core frequency, but depends on the
workload.
For example: on Pentium M (banias), with frequency ranging from 600 to 1300MHz,
the following program, compiled with -O3 -funroll-loops:
#include <stdlib.h>
/* Number of array elements: 16 * 1024 * 1024. The expansion is not
 * parenthesized; its only arithmetic use below is `SIZE-1`, which still
 * evaluates as intended because `*` binds tighter than `-`. */
#define SIZE 16*1024*1024
/* Element type of the benchmark array. */
#define TYPE long
/*
 * Micro-benchmark: allocates a zero-initialized array of SIZE elements
 * and sweeps over it 20 times, adding both neighbours to every interior
 * element. Always returns 0.
 *
 * NOTE(review): the calloc() result is not checked for NULL and the
 * buffer is never freed before exit. Left as-is so the program matches
 * the one whose timings are quoted in this message.
 */
int main() {
TYPE * memory=(TYPE*)calloc(SIZE,sizeof(TYPE));
int i, j;
/* 20 full passes over the array. */
for (j=0; j<20; ++j)
/* Start at 1 and stop before SIZE-1 so that i-1 and i+1 stay in bounds. */
for (i=1; i<SIZE-1; ++i)
memory[i] += memory[i-1] + memory[i+1];
return 0;
}
achieves:
[root@localhost hd]# echo powersave >
/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
[root@localhost hd]# time ./time_me
3.19user 0.11system 0:03.36elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+16506minor)pagefaults 0swaps
[root@localhost hd]# echo performance >
/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
[root@localhost hd]# time ./time_me
3.06user 0.07system 0:03.18elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+16506minor)pagefaults 0swaps
[root@localhost hd]# echo ondemand >
/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
[root@localhost hd]# echo 5 >
/sys/devices/system/cpu/cpu0/cpufreq/ondemand/freq_step
[root@localhost hd]# sleep 5; time ./time_me
2.90user 0.11system 0:03.18elapsed 94%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+16506minor)pagefaults 0swaps
it has only 9% performance decrease passing from 1300MHz to 600MHz,
and the difference between performance and ondemand with
(freq_step=5%) is not measurable.
The same program on an Atom doesn't show the same effect. It seems
that Atoms scale also the FSB frequency together with core frequency.
Corrado
>
> Pavel
> --
> (english) http://www.livejournal.com/~pavelmachek
> (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
>
--
__________________________________________________________________________
dott. Corrado Zoccolo mailto:[email protected]
PhD - Department of Computer Science - University of Pisa, Italy
--------------------------------------------------------------------------
On Tue, Jul 14, 2009 at 04:44:57PM +0200, Corrado Zoccolo wrote:
> The same program on an Atom doesn't show the same effect. It seems
> that Atoms scale also the FSB frequency together with core frequency.
I believe that this is true of all current Intel mobile chipsets - the
lowest P-state will also reduce the FSB.
--
Matthew Garrett | [email protected]