Hi,
These two patches result from the discussion at the OSPM-summit last week
and are targeted at alleviating some issues related to the frequency change
rate limiting in schedutil. They are intended to be used along with
https://patchwork.kernel.org/patch/9655173/ and are based on the current
linux-next branch of the linux-pm.git tree (which should be equivalent to
linux-next from the PM material perspective).
The first one allows scaling drivers to specify their own (smaller) latency
multipliers used for computing the default value of rate_limit_us.
The second one makes schedutil store the maximum CPU utilization value seen
since the previous frequency update and use it for computing the new frequency
next time, to address the problem of intermediate utilization values being
discarded.
They have been lightly tested on a Dell Vostro laptop with an Intel Sandy Bridge
processor.
Thanks,
Rafael
From: Rafael J. Wysocki <[email protected]>
Due to the rate limiting of frequency changes, the schedutil governor
carries out a complete CPU utilization estimation only when it is
about to update the frequency for the corresponding cpufreq policy.
As a result, intermediate utilization values are discarded by it,
which is not appropriate in general (for example, when tasks migrate
from one CPU to another or exit, the utilization measured by PELT may
change abruptly between frequency updates).
For this reason, modify schedutil to estimate CPU utilization
completely whenever it is invoked for the given CPU and to store the
maximum value encountered as input for subsequent frequency
computations. This way the new frequency is always based on the
maximum utilization value seen by the governor since the previous
frequency update, which effectively prevents intermittent utilization
variations from causing the frequency to be reduced unnecessarily.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
kernel/sched/cpufreq_schedutil.c | 90 +++++++++++++++++++++------------------
1 file changed, 50 insertions(+), 40 deletions(-)
Index: linux-pm/kernel/sched/cpufreq_schedutil.c
===================================================================
--- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
+++ linux-pm/kernel/sched/cpufreq_schedutil.c
@@ -57,7 +57,6 @@ struct sugov_cpu {
unsigned long iowait_boost_max;
u64 last_update;
- /* The fields below are only needed when sharing a policy. */
unsigned long util;
unsigned long max;
unsigned int flags;
@@ -154,22 +153,30 @@ static unsigned int get_next_freq(struct
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
{
+ unsigned long cfs_util, cfs_max;
struct rq *rq = this_rq();
- unsigned long cfs_max;
- cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
+ if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+ return;
- *util = min(rq->cfs.avg.util_avg, cfs_max);
- *max = cfs_max;
+ cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ cfs_util = min(rq->cfs.avg.util_avg, cfs_max);
+ if (sg_cpu->util * cfs_max < sg_cpu->max * cfs_util) {
+ sg_cpu->util = cfs_util;
+ sg_cpu->max = cfs_max;
+ }
}
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- unsigned int flags)
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
{
+ unsigned long boost_util, boost_max = sg_cpu->iowait_boost_max;
+
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ sg_cpu->iowait_boost = boost_max;
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;
@@ -177,22 +184,15 @@ static void sugov_set_iowait_boost(struc
if (delta_ns > TICK_NSEC)
sg_cpu->iowait_boost = 0;
}
-}
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
- unsigned long *max)
-{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
-
- if (!boost_util)
+ boost_util = sg_cpu->iowait_boost;
+ if (!boost_util || sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
return;
- if (*util * boost_max < *max * boost_util) {
- *util = boost_util;
- *max = boost_max;
+ if (sg_cpu->util * boost_max < sg_cpu->max * boost_util) {
+ sg_cpu->util = boost_util;
+ sg_cpu->max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -208,30 +208,42 @@ static bool sugov_cpu_is_busy(struct sug
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */
+static void sugov_cpu_update(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ sugov_get_util(sg_cpu, flags);
+ sugov_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+}
+
+static void sugov_reset_util(struct sugov_cpu *sg_cpu)
+{
+ sg_cpu->util = 0;
+ sg_cpu->max = 1;
+ sg_cpu->flags = 0;
+ sg_cpu->iowait_boost >>= 1;
+}
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util, max;
unsigned int next_f;
bool busy;
- sugov_set_iowait_boost(sg_cpu, time, flags);
- sg_cpu->last_update = time;
+ sugov_cpu_update(sg_cpu, time, flags);
if (!sugov_should_update_freq(sg_policy, time))
return;
busy = sugov_cpu_is_busy(sg_cpu);
- if (flags & SCHED_CPUFREQ_RT_DL) {
+ if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max);
- sugov_iowait_boost(sg_cpu, &util, &max);
- next_f = get_next_freq(sg_policy, util, max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
@@ -240,6 +252,7 @@ static void sugov_update_single(struct u
next_f = sg_policy->next_freq;
}
sugov_update_commit(sg_policy, time, next_f);
+ sugov_reset_util(sg_cpu);
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
@@ -276,8 +289,6 @@ static unsigned int sugov_next_freq_shar
util = j_util;
max = j_max;
}
-
- sugov_iowait_boost(j_sg_cpu, &util, &max);
}
return get_next_freq(sg_policy, util, max);
@@ -288,27 +299,25 @@ static void sugov_update_shared(struct u
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- unsigned long util, max;
- unsigned int next_f;
-
- sugov_get_util(&util, &max);
raw_spin_lock(&sg_policy->update_lock);
- sg_cpu->util = util;
- sg_cpu->max = max;
- sg_cpu->flags = flags;
-
- sugov_set_iowait_boost(sg_cpu, time, flags);
- sg_cpu->last_update = time;
+ sugov_cpu_update(sg_cpu, time, flags);
if (sugov_should_update_freq(sg_policy, time)) {
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int next_f;
+ unsigned int j;
+
if (flags & SCHED_CPUFREQ_RT_DL)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu);
sugov_update_commit(sg_policy, time, next_f);
+
+ for_each_cpu(j, policy->cpus)
+ sugov_reset_util(&per_cpu(sugov_cpu, j));
}
raw_spin_unlock(&sg_policy->update_lock);
@@ -606,6 +615,7 @@ static int sugov_start(struct cpufreq_po
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ sugov_reset_util(sg_cpu);
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
From: Rafael J. Wysocki <[email protected]>
Make the schedutil governor compute the initial (default) value of
the rate_limit_us sysfs attribute by multiplying the transition
latency by a multiplier depending on the policy and set by the
scaling driver (instead of using a constant for this purpose).
That will allow scaling drivers to make schedutil use smaller default
values of rate_limit_us and reduce the default average time interval
between consecutive frequency changes.
Make intel_pstate use the opportunity to reduce the rate limit
somewhat.
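For example, with intel_pstate's 20000 ns transition latency the
default LATENCY_MULTIPLIER of 1000 gives rate_limit_us = 20 * 1000 =
20000 us (20 ms), while the 250 multiplier set below gives
20 * 250 = 5000 us (5 ms).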
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
drivers/cpufreq/cpufreq.c | 1 +
drivers/cpufreq/intel_pstate.c | 2 ++
include/linux/cpufreq.h | 8 ++++++++
kernel/sched/cpufreq_schedutil.c | 2 +-
4 files changed, 12 insertions(+), 1 deletion(-)
Index: linux-pm/drivers/cpufreq/cpufreq.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq.c
+++ linux-pm/drivers/cpufreq/cpufreq.c
@@ -1072,6 +1072,7 @@ static struct cpufreq_policy *cpufreq_po
init_waitqueue_head(&policy->transition_wait);
init_completion(&policy->kobj_unregister);
INIT_WORK(&policy->update, handle_update);
+ policy->latency_multiplier = LATENCY_MULTIPLIER;
policy->cpu = cpu;
return policy;
Index: linux-pm/include/linux/cpufreq.h
===================================================================
--- linux-pm.orig/include/linux/cpufreq.h
+++ linux-pm/include/linux/cpufreq.h
@@ -120,6 +120,14 @@ struct cpufreq_policy {
bool fast_switch_possible;
bool fast_switch_enabled;
+ /*
+ * Multiplier to apply to the transition latency to obtain the preferred
+ * average time interval between consecutive invocations of the driver
+ * to set the frequency for this policy. Initialized by the core to the
+ * LATENCY_MULTIPLIER value.
+ */
+ unsigned int latency_multiplier;
+
/* Cached frequency lookup from cpufreq_driver_resolve_freq. */
unsigned int cached_target_freq;
int cached_resolved_idx;
Index: linux-pm/kernel/sched/cpufreq_schedutil.c
===================================================================
--- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
+++ linux-pm/kernel/sched/cpufreq_schedutil.c
@@ -530,7 +530,7 @@ static int sugov_init(struct cpufreq_pol
goto stop_kthread;
}
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
+ tunables->rate_limit_us = policy->latency_multiplier;
lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
if (lat)
tunables->rate_limit_us *= lat;
Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -41,6 +41,7 @@
#define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC)
#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
+#define INTEL_CPUFREQ_LATENCY_MULTIPLIER 250
#ifdef CONFIG_ACPI
#include <acpi/processor.h>
@@ -2237,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct
return ret;
policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
+ policy->latency_multiplier = INTEL_CPUFREQ_LATENCY_MULTIPLIER;
/* This reflects the intel_pstate_get_cpu_pstates() setting. */
policy->cur = policy->cpuinfo.min_freq;
Hi Rafael,
On Sun, Apr 9, 2017 at 5:11 PM, Rafael J. Wysocki <[email protected]> wrote:
> From: Rafael J. Wysocki <[email protected]>
>
> Due to the rate limiting of frequency changes, the schedutil governor
> carries out a complete CPU utilization estimation only when it is
> about to update the frequency for the corresponding cpufreq policy.
> As a result, intermediate utilization values are discarded by it,
> which is not appropriate in general (for example, when tasks migrate
> from one CPU to another or exit, the utilization measured by PELT may
> change abruptly between frequency updates).
>
> For this reason, modify schedutil to estimate CPU utilization
> completely whenever it is invoked for the given CPU and to store the
> maximum value encountered as input for subsequent frequency
> computations. This way the new frequency is always based on the
> maximum utilization value seen by the governor since the previous
> frequency update, which effectively prevents intermittent utilization
> variations from causing the frequency to be reduced unnecessarily.
>
> Signed-off-by: Rafael J. Wysocki <[email protected]>
> ---
> kernel/sched/cpufreq_schedutil.c | 90 +++++++++++++++++++++------------------
> 1 file changed, 50 insertions(+), 40 deletions(-)
>
> Index: linux-pm/kernel/sched/cpufreq_schedutil.c
> ===================================================================
> --- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
> +++ linux-pm/kernel/sched/cpufreq_schedutil.c
> @@ -57,7 +57,6 @@ struct sugov_cpu {
> unsigned long iowait_boost_max;
> u64 last_update;
>
> - /* The fields below are only needed when sharing a policy. */
> unsigned long util;
> unsigned long max;
> unsigned int flags;
> @@ -154,22 +153,30 @@ static unsigned int get_next_freq(struct
> return cpufreq_driver_resolve_freq(policy, freq);
> }
>
> -static void sugov_get_util(unsigned long *util, unsigned long *max)
> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
> {
> + unsigned long cfs_util, cfs_max;
> struct rq *rq = this_rq();
> - unsigned long cfs_max;
>
> - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
> + sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
> + if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
> + return;
>
> - *util = min(rq->cfs.avg.util_avg, cfs_max);
> - *max = cfs_max;
> + cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
> + cfs_util = min(rq->cfs.avg.util_avg, cfs_max);
> + if (sg_cpu->util * cfs_max < sg_cpu->max * cfs_util) {
Assuming all CPUs have equal compute capacity, doesn't this mean that
sg_cpu->util is updated only if cfs_util > sg_cpu->util?
Maybe I missed something, but wouldn't we want sg_cpu->util to be
reduced as well when cfs_util reduces? Doesn't this condition
basically discard all updates to sg_cpu->util that could have reduced
it?
> + sg_cpu->util = cfs_util;
> + sg_cpu->max = cfs_max;
> + }
> }
Thanks,
Joel
Hi Rafael,
On Mon, Apr 10 2017 at 00:10, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <[email protected]>
>
> Make the schedutil governor compute the initial (default) value of
> the rate_limit_us sysfs attribute by multiplying the transition
> latency by a multiplier depending on the policy and set by the
> scaling driver (instead of using a constant for this purpose).
>
> That will allow scaling drivers to make schedutil use smaller default
> values of rate_limit_us and reduce the default average time interval
> between consecutive frequency changes.
>
I've been thinking about this over the last couple of days, and I now
think (in opposition to what I said at OSPM Pisa) that allowing
drivers to specify a _multiplier_ isn't ideal, since you lose
granularity when you want your rate limit to be close to your transition
latency (e.g. if your multiplier would be 2.5 or something).
Can we instead just have an independent field
policy->default_rate_limit_us or similar? Drivers know the transition
latency so intel_pstate can still use a multiplier if it wants.
Cheers,
Brendan
> Make intel_pstate use the opportunity to reduce the rate limit
> somewhat.
>
> Signed-off-by: Rafael J. Wysocki <[email protected]>
> ---
> drivers/cpufreq/cpufreq.c | 1 +
> drivers/cpufreq/intel_pstate.c | 2 ++
> include/linux/cpufreq.h | 8 ++++++++
> kernel/sched/cpufreq_schedutil.c | 2 +-
> 4 files changed, 12 insertions(+), 1 deletion(-)
>
> Index: linux-pm/drivers/cpufreq/cpufreq.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/cpufreq.c
> +++ linux-pm/drivers/cpufreq/cpufreq.c
> @@ -1072,6 +1072,7 @@ static struct cpufreq_policy *cpufreq_po
> init_waitqueue_head(&policy->transition_wait);
> init_completion(&policy->kobj_unregister);
> INIT_WORK(&policy->update, handle_update);
> + policy->latency_multiplier = LATENCY_MULTIPLIER;
>
> policy->cpu = cpu;
> return policy;
> Index: linux-pm/include/linux/cpufreq.h
> ===================================================================
> --- linux-pm.orig/include/linux/cpufreq.h
> +++ linux-pm/include/linux/cpufreq.h
> @@ -120,6 +120,14 @@ struct cpufreq_policy {
> bool fast_switch_possible;
> bool fast_switch_enabled;
>
> + /*
> + * Multiplier to apply to the transition latency to obtain the preferred
> + * average time interval between consecutive invocations of the driver
> + * to set the frequency for this policy. Initialized by the core to the
> + * LATENCY_MULTIPLIER value.
> + */
> + unsigned int latency_multiplier;
> +
> /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
> unsigned int cached_target_freq;
> int cached_resolved_idx;
> Index: linux-pm/kernel/sched/cpufreq_schedutil.c
> ===================================================================
> --- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
> +++ linux-pm/kernel/sched/cpufreq_schedutil.c
> @@ -530,7 +530,7 @@ static int sugov_init(struct cpufreq_pol
> goto stop_kthread;
> }
>
> - tunables->rate_limit_us = LATENCY_MULTIPLIER;
> + tunables->rate_limit_us = policy->latency_multiplier;
> lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
> if (lat)
> tunables->rate_limit_us *= lat;
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -41,6 +41,7 @@
> #define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC)
>
> #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
> +#define INTEL_CPUFREQ_LATENCY_MULTIPLIER 250
>
> #ifdef CONFIG_ACPI
> #include <acpi/processor.h>
> @@ -2237,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct
> return ret;
>
> policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
> + policy->latency_multiplier = INTEL_CPUFREQ_LATENCY_MULTIPLIER;
> /* This reflects the intel_pstate_get_cpu_pstates() setting. */
> policy->cur = policy->cpuinfo.min_freq;
>
On Mon, Apr 10, 2017 at 12:38 PM, Brendan Jackman
<[email protected]> wrote:
> Hi Rafael,
>
> On Mon, Apr 10 2017 at 00:10, Rafael J. Wysocki wrote:
>> From: Rafael J. Wysocki <[email protected]>
>>
>> Make the schedutil governor compute the initial (default) value of
>> the rate_limit_us sysfs attribute by multiplying the transition
>> latency by a multiplier depending on the policy and set by the
>> scaling driver (instead of using a constant for this purpose).
>>
>> That will allow scaling drivers to make schedutil use smaller default
>> values of rate_limit_us and reduce the default average time interval
>> between consecutive frequency changes.
>>
>
> I've been thinking about this over the last couple of days, and I now
> think (in opposition to what I said at OSPM Pisa) that allowing
> drivers to specify a _multiplier_ isn't ideal, since you lose
> granularity when you want your rate limit to be close to your transition
> latency (e.g. if your multiplier would be 2.5 or something).
>
> Can we instead just have an independent field
> policy->default_rate_limit_us or similar?
Yes, we can.
Let me cut a v2 of this, shouldn't be too hard. :-)
Thanks,
Rafael
Hi Rafael,
thanks for this set. I'll give it a try (together with your previous
patch) in the next few days.
A question below.
On 10/04/17 02:11, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <[email protected]>
>
> Due to the rate limiting of frequency changes, the schedutil governor
> carries out a complete CPU utilization estimation only when it is
> about to update the frequency for the corresponding cpufreq policy.
> As a result, intermediate utilization values are discarded by it,
> which is not appropriate in general (for example, when tasks migrate
> from one CPU to another or exit, the utilization measured by PELT may
> change abruptly between frequency updates).
>
> For this reason, modify schedutil to estimate CPU utilization
> completely whenever it is invoked for the given CPU and to store the
> maximum value encountered as input for subsequent frequency
> computations. This way the new frequency is always based on the
> maximum utilization value seen by the governor since the previous
> frequency update, which effectively prevents intermittent utilization
> variations from causing the frequency to be reduced unnecessarily.
>
> Signed-off-by: Rafael J. Wysocki <[email protected]>
> ---
[...]
> -static void sugov_get_util(unsigned long *util, unsigned long *max)
> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
> {
> + unsigned long cfs_util, cfs_max;
> struct rq *rq = this_rq();
> - unsigned long cfs_max;
>
> - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
> + sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
> + if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
> + return;
>
IIUC, with this you also keep track of any RT/DL tasks that woke up
during the last throttling period, and react accordingly as soon as a
triggering event happens after the throttling period elapses.
Given that for RT (and still for DL as well) the next event is a
periodic tick, couldn't it happen that the required frequency transition
for an RT task that unfortunately woke up before the end of a throttling
period gets delayed by a tick interval (at least 4 ms on ARM)?
Don't we need to treat such wake up events (RT/DL) in a special way and
maybe set a timer to fire and process them as soon as the current
throttling period elapses? Might be a patch on top of this I guess.
Best,
- Juri
On Mon, Apr 10, 2017 at 8:39 AM, Joel Fernandes <[email protected]> wrote:
> Hi Rafael,
Hi,
> On Sun, Apr 9, 2017 at 5:11 PM, Rafael J. Wysocki <[email protected]> wrote:
>> From: Rafael J. Wysocki <[email protected]>
>>
[cut]
>> @@ -154,22 +153,30 @@ static unsigned int get_next_freq(struct
>> return cpufreq_driver_resolve_freq(policy, freq);
>> }
>>
>> -static void sugov_get_util(unsigned long *util, unsigned long *max)
>> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
>> {
>> + unsigned long cfs_util, cfs_max;
>> struct rq *rq = this_rq();
>> - unsigned long cfs_max;
>>
>> - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
>> + sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
>> + if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
>> + return;
>>
>> - *util = min(rq->cfs.avg.util_avg, cfs_max);
>> - *max = cfs_max;
>> + cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
>> + cfs_util = min(rq->cfs.avg.util_avg, cfs_max);
>> + if (sg_cpu->util * cfs_max < sg_cpu->max * cfs_util) {
>
> Assuming all CPUs have equal compute capacity, doesn't this mean that
> sg_cpu->util is updated only if cfs_util > sg_cpu->util?
Yes, it does.
> Maybe I missed something, but wouldn't we want sg_cpu->util to be
> reduced as well when cfs_util reduces? Doesn't this condition
> basically discard all updates to sg_cpu->util that could have reduced
> it?
>
>> + sg_cpu->util = cfs_util;
>> + sg_cpu->max = cfs_max;
>> + }
>> }
Well, that's the idea. :-)
During the discussion at the OSPM-summit we concluded that discarding
all of the utilization changes between the points at which frequency
updates actually happened was not a good idea, so they needed to be
aggregated somehow.
There are a few ways to aggregate them, but the most straightforward
one (and one which actually makes sense) is to take the maximum as the
aggregate value.
Of course, this means that we skew things towards performance here,
but I'm not worried that much. :-)
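As a worked example: say the governor is invoked three times between
two frequency updates and sees util_avg values of 600, 200 and 350
(with cfs_max = 1024 throughout). sg_cpu->util then ends up as 600,
so the next frequency is computed from 600/1024 rather than from
whatever value happens to be current at update time.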
Thanks,
Rafael
On Mon, Apr 10, 2017 at 1:26 PM, Juri Lelli <[email protected]> wrote:
> Hi Rafael,
Hi,
> thanks for this set. I'll give it a try (together with your previous
> patch) in the next few days.
>
> A question below.
>
> On 10/04/17 02:11, Rafael J. Wysocki wrote:
>> From: Rafael J. Wysocki <[email protected]>
>>
>> Due to the rate limiting of frequency changes, the schedutil governor
>> carries out a complete CPU utilization estimation only when it is
>> about to update the frequency for the corresponding cpufreq policy.
>> As a result, intermediate utilization values are discarded by it,
>> which is not appropriate in general (for example, when tasks migrate
>> from one CPU to another or exit, the utilization measured by PELT may
>> change abruptly between frequency updates).
>>
>> For this reason, modify schedutil to estimate CPU utilization
>> completely whenever it is invoked for the given CPU and to store the
>> maximum value encountered as input for subsequent frequency
>> computations. This way the new frequency is always based on the
>> maximum utilization value seen by the governor since the previous
>> frequency update, which effectively prevents intermittent utilization
>> variations from causing the frequency to be reduced unnecessarily.
>>
>> Signed-off-by: Rafael J. Wysocki <[email protected]>
>> ---
>
> [...]
>
>> -static void sugov_get_util(unsigned long *util, unsigned long *max)
>> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
>> {
>> + unsigned long cfs_util, cfs_max;
>> struct rq *rq = this_rq();
>> - unsigned long cfs_max;
>>
>> - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
>> + sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
>> + if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
>> + return;
>>
>
> IIUC, with this you also keep track of any RT/DL tasks that woke up
> during the last throttling period, and react accordingly as soon as a
> triggering event happens after the throttling period elapses.
Right (that's the idea at least).
> Given that for RT (and still for DL as well) the next event is a
> periodic tick, couldn't it happen that the required frequency transition
> for an RT task that unfortunately woke up before the end of a throttling
> period gets delayed by a tick interval (at least 4 ms on ARM)?
No, that won't be an entire tick unless it wakes up exactly at the
update time AFAICS.
> Don't we need to treat such wake up events (RT/DL) in a special way and
> maybe set a timer to fire and process them as soon as the current
> throttling period elapses? Might be a patch on top of this I guess.
Setting a timer won't be a good idea at all, as it would need to be a
deferrable one and Thomas would not like that (I'm sure).
We could in principle add some special casing around that, like for
example pass flags to sugov_should_update_freq() and opportunistically
ignore freq_update_delay_ns if SCHED_CPUFREQ_RT_DL is set in there,
but that would lead to extra overhead on systems where frequency
updates happen in-context.
Also, the case looks somewhat like a corner case to me, to be honest.
Thanks,
Rafael
From: Rafael J. Wysocki <[email protected]>
Make the schedutil governor take the initial (default) value of the
rate_limit_us sysfs attribute from the (new) transition_delay_us
policy parameter (to be set by the scaling driver).
That will allow scaling drivers to make schedutil use smaller default
values of rate_limit_us and reduce the default average time interval
between consecutive frequency changes.
Make intel_pstate set transition_delay_us to 500.
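(For comparison, the old default for intel_pstate was
LATENCY_MULTIPLIER (1000) times the 20 us transition latency, or
20 ms, so this reduces the default rate limit to 0.5 ms.)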
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
This is a replacement for https://patchwork.kernel.org/patch/9671831/
---
drivers/cpufreq/intel_pstate.c | 2 ++
include/linux/cpufreq.h | 7 +++++++
kernel/sched/cpufreq_schedutil.c | 15 ++++++++++-----
3 files changed, 19 insertions(+), 5 deletions(-)
Index: linux-pm/include/linux/cpufreq.h
===================================================================
--- linux-pm.orig/include/linux/cpufreq.h
+++ linux-pm/include/linux/cpufreq.h
@@ -120,6 +120,13 @@ struct cpufreq_policy {
bool fast_switch_possible;
bool fast_switch_enabled;
+ /*
+ * Preferred average time interval between consecutive invocations of
+ * the driver to set the frequency for this policy. To be set by the
+ * scaling driver (0, which is the default, means no preference).
+ */
+ unsigned int transition_delay_us;
+
/* Cached frequency lookup from cpufreq_driver_resolve_freq. */
unsigned int cached_target_freq;
int cached_resolved_idx;
Index: linux-pm/kernel/sched/cpufreq_schedutil.c
===================================================================
--- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
+++ linux-pm/kernel/sched/cpufreq_schedutil.c
@@ -491,7 +491,6 @@ static int sugov_init(struct cpufreq_pol
{
struct sugov_policy *sg_policy;
struct sugov_tunables *tunables;
- unsigned int lat;
int ret = 0;
/* State should be equivalent to EXIT */
@@ -530,10 +529,16 @@ static int sugov_init(struct cpufreq_pol
goto stop_kthread;
}
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
+ if (policy->transition_delay_us) {
+ tunables->rate_limit_us = policy->transition_delay_us;
+ } else {
+ unsigned int lat;
+
+ tunables->rate_limit_us = LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat)
+ tunables->rate_limit_us *= lat;
+ }
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -41,6 +41,7 @@
#define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC)
#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
+#define INTEL_CPUFREQ_TRANSITION_DELAY 500
#ifdef CONFIG_ACPI
#include <acpi/processor.h>
@@ -2237,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct
return ret;
policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
+ policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
/* This reflects the intel_pstate_get_cpu_pstates() setting. */
policy->cur = policy->cpuinfo.min_freq;
On Mon, Apr 10, 2017 at 1:59 PM, Rafael J. Wysocki <[email protected]> wrote:
[..]
>>> + sg_cpu->util = cfs_util;
>>> + sg_cpu->max = cfs_max;
>>> + }
>>> }
>
>
> Well, that's the idea. :-)
>
> During the discussion at the OSPM-summit we concluded that discarding
> all of the utilization changes between the points at which frequency
> updates actually happened was not a good idea, so they needed to be
> aggregated somehow.
>
> There are a few ways to aggregate them, but the most straightforward
> one (and one which actually makes sense) is to take the maximum as the
> aggregate value.
>
> Of course, this means that we skew things towards performance here,
> but I'm not worried that much. :-)
Does this increase the chance of going to idle at higher frequency?
Say in the last rate limit window, we have a high request followed by
a low request. After the window closes, by this algorithm we ignore
the low request and take the higher valued request, and then enter
idle. Then, wouldn't we be idling at higher frequency? I guess if you
enter "cluster-idle" then probably this isn't a big deal (like on the
ARM64 platforms I am working on). But I wasn't sure how expensive
entering C-states at a higher frequency on Intel platforms is, or if it
is even a concern. :-D
Thanks,
Joel
On 10/04/17 23:13, Rafael J. Wysocki wrote:
> On Mon, Apr 10, 2017 at 1:26 PM, Juri Lelli <[email protected]> wrote:
[...]
> > Given that for RT (and still for DL as well) the next event is a
> > periodic tick, couldn't it happen that the required frequency transition
> > for an RT task that unfortunately woke up before the end of a throttling
> > period gets delayed by a tick interval (at least 4 ms on ARM)?
>
> No, that won't be an entire tick unless it wakes up exactly at the
> update time AFAICS.
>
Right. I was trying to think about worst case, as I'm considering RT
type of tasks.
> > Don't we need to treat such wake up events (RT/DL) in a special way and
> > maybe set a timer to fire and process them as soon as the current
> > throttling period elapses? Might be a patch on top of this I guess.
>
> Setting a timer won't be a good idea at all, as it would need to be a
> deferrable one and Thomas would not like that (I'm sure).
>
Why deferrable? IMHO, we should be servicing RT requests as soon as the
HW is capable of it. Even a small delay of, say, a couple of ms could be
causing deadline misses.
> We could in principle add some special casing around that, like for
> example pass flags to sugov_should_update_freq() and opportunistically
> ignore freq_update_delay_ns if SCHED_CPUFREQ_RT_DL is set in there,
> but that would lead to extra overhead on systems where frequency
> updates happen in-context.
>
Also, it still looks event-driven to me. If the RT task is the only
thing running, nothing will trigger a potential frequency change
re-evaluation before the next tick.
> Also, the case looks somewhat like a corner case to me, to be honest.
>
Sure. Only thinking about potential problems here. However, playing with
my DL patches I noticed that this can actually be a problem, as for DL,
for example, we trigger a frequency switch when the task wakes up, but
then we don't do anything during the tick (because it doesn't seem to
make sense to do anything :). So, if we missed the opportunity to
increase frequency at enqueue time, the task is hopelessly done. :(
Anyway, since this looks like something that we might want on top of
your patches anyway, I'll play with the idea when refreshing my set and see
what I get.
Thanks,
- Juri
On 11-04-17, 00:20, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <[email protected]>
>
> Make the schedutil governor take the initial (default) value of the
> rate_limit_us sysfs attribute from the (new) transition_delay_us
> policy parameter (to be set by the scaling driver).
>
> That will allow scaling drivers to make schedutil use smaller default
> values of rate_limit_us and reduce the default average time interval
> between consecutive frequency changes.
>
> Make intel_pstate set transition_delay_us to 500.
>
> Signed-off-by: Rafael J. Wysocki <[email protected]>
> ---
>
> This is a replacement for https://patchwork.kernel.org/patch/9671831/
>
> ---
> drivers/cpufreq/intel_pstate.c | 2 ++
> include/linux/cpufreq.h | 7 +++++++
> kernel/sched/cpufreq_schedutil.c | 15 ++++++++++-----
> 3 files changed, 19 insertions(+), 5 deletions(-)
Should we use this new value for the ondemand/conservative governors as well?
--
viresh
On Tue, Apr 11, 2017 at 1:14 PM, Viresh Kumar <[email protected]> wrote:
> On 11-04-17, 00:20, Rafael J. Wysocki wrote:
>> From: Rafael J. Wysocki <[email protected]>
>>
>> Make the schedutil governor take the initial (default) value of the
>> rate_limit_us sysfs attribute from the (new) transition_delay_us
>> policy parameter (to be set by the scaling driver).
>>
>> That will allow scaling drivers to make schedutil use smaller default
>> values of rate_limit_us and reduce the default average time interval
>> between consecutive frequency changes.
>>
>> Make intel_pstate set transition_delay_us to 500.
>>
>> Signed-off-by: Rafael J. Wysocki <[email protected]>
>> ---
>>
>> This is a replacement for https://patchwork.kernel.org/patch/9671831/
>>
>> ---
>> drivers/cpufreq/intel_pstate.c | 2 ++
>> include/linux/cpufreq.h | 7 +++++++
>> kernel/sched/cpufreq_schedutil.c | 15 ++++++++++-----
>> 3 files changed, 19 insertions(+), 5 deletions(-)
>
> Should we use this new value for the ondemand/conservative governors as well?
We might, but it is mostly for schedutil.
Thanks,
Rafael
On Tue, Apr 11, 2017 at 3:57 AM, Joel Fernandes <[email protected]> wrote:
> On Mon, Apr 10, 2017 at 1:59 PM, Rafael J. Wysocki <[email protected]> wrote:
> [..]
>>>> + sg_cpu->util = cfs_util;
>>>> + sg_cpu->max = cfs_max;
>>>> + }
>>>> }
>>
>>
>> Well, that's the idea. :-)
>>
>> During the discussion at the OSPM-summit we concluded that discarding
>> all of the utilization changes between the points at which frequency
>> updates actually happened was not a good idea, so they needed to be
>> aggregated somehow.
>>
>> There are a few ways to aggregate them, but the most straightforward
>> one (and one which actually makes sense) is to take the maximum as the
>> aggregate value.
>>
>> Of course, this means that we skew things towards performance here,
>> but I'm not worried that much. :-)
>
> Does this increase the chance of going to idle at higher frequency?
> Say in the last rate limit window, we have a high request followed by
> a low request. After the window closes, by this algorithm we ignore
> the low request and take the higher valued request, and then enter
> idle. Then, wouldn't we be idling at higher frequency? I guess if you
> enter "cluster-idle" then probably this isn't a big deal (like on the
> ARM64 platforms I am working on). But I wasn't sure how expensive
> entering C-states at a higher frequency on Intel platforms is, or if it
> is even a concern. :-D
It isn't a concern at all AFAICS.
Thanks,
Rafael
On Tue, Apr 11, 2017 at 9:00 AM, Juri Lelli <[email protected]> wrote:
> On 10/04/17 23:13, Rafael J. Wysocki wrote:
>> On Mon, Apr 10, 2017 at 1:26 PM, Juri Lelli <[email protected]> wrote:
>
> [...]
>
>> > Given that for RT (and still for DL as well) the next event is a
>> > periodic tick, couldn't it happen that the required frequency transition
>> > for an RT task that unfortunately woke up before the end of a throttling
>> > period gets delayed by a tick interval (at least 4 ms on ARM)?
>>
>> No, that won't be an entire tick unless it wakes up exactly at the
>> update time AFAICS.
>>
>
> Right. I was trying to think about worst case, as I'm considering RT
> type of tasks.
>
>> > Don't we need to treat such wake up events (RT/DL) in a special way and
>> > maybe set a timer to fire and process them as soon as the current
>> > throttling period elapses? Might be a patch on top of this I guess.
>>
>> Setting a timer won't be a good idea at all, as it would need to be a
>> deferrable one and Thomas would not like that (I'm sure).
>>
>
> Why deferrable? IMHO, we should be servicing RT requests as soon as the
> HW is capable of it. Even a small delay of, say, a couple of ms could be
> causing deadline misses.
If it is not deferrable, it will wake up the CPU from idle, but that's
not a concern here, because we're assuming that the CPU is not idle
anyway, so fair enough.
>> We could in principle add some special casing around that, like for
>> example pass flags to sugov_should_update_freq() and opportunistically
>> ignore freq_update_delay_ns if SCHED_CPUFREQ_RT_DL is set in there,
>> but that would lead to extra overhead on systems where frequency
>> updates happen in-context.
>>
>
> Also, it still looks event-driven to me. If the RT task is the only
> thing running, nothing will trigger a potential frequency change
> re-evaluation before the next tick.
If freq_update_delay_ns is opportunistically ignored for
SCHED_CPUFREQ_RT_DL set in the flags by sugov_should_update_freq(),
then all of the updates with that flag set will cause a frequency
update to happen immediately *except* *for* the ones that require us
to wait for work_in_progress to become false, but in that case the
kthread might trigger an update (e.g. by scheduling an irq_work) after
it has cleared work_in_progress.
No timers needed I guess after all? :-)
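Roughly like this, that is (an untested sketch just to illustrate the
idea, assuming the current sugov_policy fields and with the
need_freq_update handling and other details omitted):

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
                                     unsigned int flags)
{
        s64 delta_ns;

        /* Still defer to the kthread if a frequency change is in flight. */
        if (sg_policy->work_in_progress)
                return false;

        /* Opportunistically ignore the rate limit for RT/DL requests. */
        if (flags & SCHED_CPUFREQ_RT_DL)
                return true;

        delta_ns = time - sg_policy->last_freq_update_time;
        return delta_ns >= sg_policy->freq_update_delay_ns;
}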
>> Also, the case looks somewhat like a corner case to me, to be honest.
>>
>
> Sure. Only thinking about potential problems here. However, playing with
> my DL patches I noticed that this can actually be a problem, as for DL,
> for example, we trigger a frequency switch when the task wakes up, but
> then we don't do anything during the tick (because it doesn't seem to
> make sense to do anything :). So, if we missed the opportunity to
> increase frequency at enqueue time, the task is hopelessly done. :(
>
> Anyway, since this looks like something that we might want on top of
> your patches anyway, I'll play with the idea when refreshing my set and see
> what I get.
Sounds good.
Thanks,
Rafael
On Fri, Apr 14, 2017 at 3:51 PM, Rafael J. Wysocki <[email protected]> wrote:
> On Tuesday, April 11, 2017 12:20:41 AM Rafael J. Wysocki wrote:
>> From: Rafael J. Wysocki <[email protected]>
>>
>> Make the schedutil governor take the initial (default) value of the
>> rate_limit_us sysfs attribute from the (new) transition_delay_us
>> policy parameter (to be set by the scaling driver).
>>
>> That will allow scaling drivers to make schedutil use smaller default
>> values of rate_limit_us and reduce the default average time interval
>> between consecutive frequency changes.
>>
>> Make intel_pstate set transition_delay_us to 500.
>>
>> Signed-off-by: Rafael J. Wysocki <[email protected]>
>> ---
>>
>> This is a replacement for https://patchwork.kernel.org/patch/9671831/
>
> Any concerns about this one?
Looks good to me.
Thanks,
Joel
On Tuesday, April 11, 2017 12:20:41 AM Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <[email protected]>
>
> Make the schedutil governor take the initial (default) value of the
> rate_limit_us sysfs attribute from the (new) transition_delay_us
> policy parameter (to be set by the scaling driver).
>
> That will allow scaling drivers to make schedutil use smaller default
> values of rate_limit_us and reduce the default average time interval
> between consecutive frequency changes.
>
> Make intel_pstate set transition_delay_us to 500.
>
> Signed-off-by: Rafael J. Wysocki <[email protected]>
> ---
>
> This is a replacement for https://patchwork.kernel.org/patch/9671831/
Any concerns about this one?
>
> ---
> drivers/cpufreq/intel_pstate.c | 2 ++
> include/linux/cpufreq.h | 7 +++++++
> kernel/sched/cpufreq_schedutil.c | 15 ++++++++++-----
> 3 files changed, 19 insertions(+), 5 deletions(-)
>
> Index: linux-pm/include/linux/cpufreq.h
> ===================================================================
> --- linux-pm.orig/include/linux/cpufreq.h
> +++ linux-pm/include/linux/cpufreq.h
> @@ -120,6 +120,13 @@ struct cpufreq_policy {
> bool fast_switch_possible;
> bool fast_switch_enabled;
>
> + /*
> + * Preferred average time interval between consecutive invocations of
> + * the driver to set the frequency for this policy. To be set by the
> + * scaling driver (0, which is the default, means no preference).
> + */
> + unsigned int transition_delay_us;
> +
> /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
> unsigned int cached_target_freq;
> int cached_resolved_idx;
> Index: linux-pm/kernel/sched/cpufreq_schedutil.c
> ===================================================================
> --- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
> +++ linux-pm/kernel/sched/cpufreq_schedutil.c
> @@ -491,7 +491,6 @@ static int sugov_init(struct cpufreq_pol
> {
> struct sugov_policy *sg_policy;
> struct sugov_tunables *tunables;
> - unsigned int lat;
> int ret = 0;
>
> /* State should be equivalent to EXIT */
> @@ -530,10 +529,16 @@ static int sugov_init(struct cpufreq_pol
> goto stop_kthread;
> }
>
> - tunables->rate_limit_us = LATENCY_MULTIPLIER;
> - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
> - if (lat)
> - tunables->rate_limit_us *= lat;
> + if (policy->transition_delay_us) {
> + tunables->rate_limit_us = policy->transition_delay_us;
> + } else {
> + unsigned int lat;
> +
> + tunables->rate_limit_us = LATENCY_MULTIPLIER;
> + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
> + if (lat)
> + tunables->rate_limit_us *= lat;
> + }
>
> policy->governor_data = sg_policy;
> sg_policy->tunables = tunables;
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -41,6 +41,7 @@
> #define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC)
>
> #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
> +#define INTEL_CPUFREQ_TRANSITION_DELAY 500
>
> #ifdef CONFIG_ACPI
> #include <acpi/processor.h>
> @@ -2237,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct
> return ret;
>
> policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
> + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
> /* This reflects the intel_pstate_get_cpu_pstates() setting. */
> policy->cur = policy->cpuinfo.min_freq;
>
>
On 11-04-17, 00:20, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <[email protected]>
>
> Make the schedutil governor take the initial (default) value of the
> rate_limit_us sysfs attribute from the (new) transition_delay_us
> policy parameter (to be set by the scaling driver).
>
> That will allow scaling drivers to make schedutil use smaller default
> values of rate_limit_us and reduce the default average time interval
> between consecutive frequency changes.
>
> Make intel_pstate set transition_delay_us to 500.
>
> Signed-off-by: Rafael J. Wysocki <[email protected]>
> ---
>
> This is a replacement for https://patchwork.kernel.org/patch/9671831/
Acked-by: Viresh Kumar <[email protected]>
--
viresh
Hi Rafael,
On Fri, Apr 14 2017 at 22:51, Rafael J. Wysocki wrote:
> On Tuesday, April 11, 2017 12:20:41 AM Rafael J. Wysocki wrote:
>> From: Rafael J. Wysocki <[email protected]>
>>
>> Make the schedutil governor take the initial (default) value of the
>> rate_limit_us sysfs attribute from the (new) transition_delay_us
>> policy parameter (to be set by the scaling driver).
>>
>> That will allow scaling drivers to make schedutil use smaller default
>> values of rate_limit_us and reduce the default average time interval
>> between consecutive frequency changes.
>>
>> Make intel_pstate set transition_delay_us to 500.
>>
>> Signed-off-by: Rafael J. Wysocki <[email protected]>
>> ---
>>
>> This is a replacement for https://patchwork.kernel.org/patch/9671831/
>
> Any concerns about this one?
Sorry for the delay. This looked good to me.
Cheers
Brendan