LinuxLists.cc - [PATCH 3/4] sched/schedutil: Ignore update requests for short running tasks

2023-12-08 01:53:19

Subject: [PATCH 3/4] sched/schedutil: Ignore update requests for short running tasks

Ignore freq updates to honour uclamp requests if the task is short
running. It won't run long enough to see the changes, so avoid the
unnecessary work and noise.

Make sure SCHED_CPUFREQ_PERF_HINTS flag is set in task_tick_fair() so
that we can do correction action if the task continued to run such that
it is no longer considered a short task.

Should address the problem of noisy short running tasks unnecessary
causing frequency spikes when waking up on a CPU that is running a busy
task capped by UCLAMP_MAX.

Move helper functions to access task_util_est() and related attributes
to sched.h to enable using it from cpufreq_schedutil.c

Signed-off-by: Qais Yousef (Google) <[email protected]>
---
kernel/sched/cpufreq_schedutil.c | 59 ++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 24 +------------
kernel/sched/sched.h | 22 ++++++++++++
3 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 137636f62593..04a5cfcdbbf2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -8,11 +8,18 @@

#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)

+/*
+ * Min runtime in us a task should run for above rate_limit_us so that we don't
+ * ignore it in ignore_short_tasks().
+ */
+#define SHORT_TASK_MIN 500
+
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, response_time_mult);

struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
+ unsigned long rate_limit_util;
unsigned int response_time_ms;
};

@@ -127,6 +134,49 @@ sugov_apply_response_time(unsigned long util, int cpu)
return mult >> SCHED_CAPACITY_SHIFT;
}

+/*
+ * Ignore updates if current task's runtime is too short for the rate limit.
+ * The task must run for an average of rate_limit_us + SHORT_TASK_MIN for it
+ * not to be ignored.
+ *
+ * If we made a wrong decision and the task has changed characteristic such
+ * that it is no longer a short task, we should detect that at tick. Which can
+ * be a high penalty if the tick value is too high.
+ *
+ * XXX: can we take TICK_US into account somehow when verifying if we can
+ * ignore it?
+ *
+ * This is only valid for requests containing SCHED_CPUFREQ_PERF_HINTS flag,
+ * ie: uclamp hints requests at context switches.
+ *
+ * This flag is expected to be passed on context switch and tick. Only fair
+ * tasks are considered now as we use util to approximate its average runtime.
+ * We can't do the same without tracking the average runtime of the RT task in
+ * our accounting. And it might be risky to temporarily ignore the RT task's
+ * perf requirements as a mistake could have higher consequence.
+ *
+ * Once fair gains the concept of latency sensitive tasks, we might need to
+ * consider the consequence of ignoring them here too. For the same reason
+ * ignoring RT tasks is risky.
+ */
+static inline bool ignore_short_tasks(int cpu,
+ struct sugov_policy *sg_policy,
+ unsigned int flags)
+{
+ struct task_struct *p = cpu_rq(cpu)->curr;
+ unsigned long task_util;
+
+ if (!(flags & SCHED_CPUFREQ_PERF_HINTS))
+ return false;
+
+ if (!fair_policy(p->policy))
+ return false;
+
+ task_util = task_util_est(p);
+
+ return task_util < sg_policy->tunables->rate_limit_util;
+}
+
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
@@ -396,8 +446,12 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
u64 time, unsigned long max_cap,
unsigned int flags)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long boost;

+ if (ignore_short_tasks(sg_cpu->cpu, sg_policy, flags))
+ return false;
+
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;

@@ -526,6 +580,9 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int next_f;

+ if (ignore_short_tasks(sg_cpu->cpu, sg_policy, flags))
+ return;
+
raw_spin_lock(&sg_policy->update_lock);

sugov_iowait_boost(sg_cpu, time, flags);
@@ -612,6 +669,7 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
return -EINVAL;

tunables->rate_limit_us = rate_limit_us;
+ tunables->rate_limit_util = approximate_util_avg(0, rate_limit_us + SHORT_TASK_MIN);

list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {

@@ -853,6 +911,7 @@ static int sugov_init(struct cpufreq_policy *policy)
sg_policy->tunables = tunables;

tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+ tunables->rate_limit_util = approximate_util_avg(0, tunables->rate_limit_us + SHORT_TASK_MIN);
tunables->response_time_ms = sugov_calc_freq_response_ms(sg_policy);
sugov_update_response_time_mult(sg_policy);

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 74326179658c..b824e633ac2a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4754,28 +4754,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)

static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);

-static inline unsigned long task_util(struct task_struct *p)
-{
- return READ_ONCE(p->se.avg.util_avg);
-}
-
-static inline unsigned long task_runnable(struct task_struct *p)
-{
- return READ_ONCE(p->se.avg.runnable_avg);
-}
-
-static inline unsigned long _task_util_est(struct task_struct *p)
-{
- struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
- return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
-}
-
-static inline unsigned long task_util_est(struct task_struct *p)
-{
- return max(task_util(p), _task_util_est(p));
-}
-
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -12544,7 +12522,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
- cpufreq_update_util(rq, 0);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_PERF_HINTS);

task_tick_core(rq, curr);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f05a0674a036..b7a8cd768bef 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2952,6 +2952,28 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
unsigned long approximate_util_avg(unsigned long util, u64 delta);
u64 approximate_runtime(unsigned long util);

+static inline unsigned long task_util(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long task_runnable(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.runnable_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+ struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+ return max(task_util(p), _task_util_est(p));
+}
+
/*
* Any governor that relies on util signal to drive DVFS, must populate these
* percpu dvfs_update_delay variables.
--
2.34.1

2023-12-08 10:43:41

by Hongyan Xia

[permalink] [raw]

Subject: Re: [PATCH 3/4] sched/schedutil: Ignore update requests for short running tasks

Hi Qais,

On 08/12/2023 01:52, Qais Yousef wrote:
> Ignore freq updates to honour uclamp requests if the task is short
> running. It won't run long enough to see the changes, so avoid the
> unnecessary work and noise.
>
> Make sure SCHED_CPUFREQ_PERF_HINTS flag is set in task_tick_fair() so
> that we can do correction action if the task continued to run such that
> it is no longer considered a short task.
>
> Should address the problem of noisy short running tasks unnecessary
> causing frequency spikes when waking up on a CPU that is running a busy
> task capped by UCLAMP_MAX.

Actually, an occasional spike is not a big problem to me.

What is a big concern is a normal task and a uclamp_max task running on
the same rq. If the uclamp_max task is 1024 but capped by uclamp_max at
the lowest OPP, and the normal task has no uclamp but a duty cycle, then
when the normal task wakes up on the rq, it'll be the highest OPP. When
it sleeps, the ulamp_max is back and at the lowest OPP. This square-wave
problem to me is a much bigger concern than an infrequent spike. If
CONFIG_HZ is 1000, this square wave's frequency is 500 switching between
highest and lowest OPP, which is definitely unacceptable.

The problem I think with filtering is, under this condition, should we
filter out the lowest OPP or the highest? Neither sounds like a good
answer because neither is a short-running task and the correct answer
might be somewhere in between.

Sorry to ramble on this again and again, but I think filtering is
addressing the symptom, not the cause. The cause is we have no idea
under what condition a util_avg was achieved. The 1024 task in the
previous example would be much better if we extend it into

[1024, achieved at uclamp_min 0, achieved at uclamp_max 300]

If we know 1024 was done under uclamp_max of 300, then we know we don't
need to raise to the max OPP. So far, we carry around a lot of different
new variables but not these two which we really need.

>
> Move helper functions to access task_util_est() and related attributes
> to sched.h to enable using it from cpufreq_schedutil.c
>
> Signed-off-by: Qais Yousef (Google) <[email protected]>
> ---
> kernel/sched/cpufreq_schedutil.c | 59 ++++++++++++++++++++++++++++++++
> kernel/sched/fair.c | 24 +------------
> kernel/sched/sched.h | 22 ++++++++++++
> 3 files changed, 82 insertions(+), 23 deletions(-)
>
> [...]

2023-12-10 22:55:04

by Qais Yousef

[permalink] [raw]

Subject: Re: [PATCH 3/4] sched/schedutil: Ignore update requests for short running tasks

Hi Hongyan

On 12/08/23 10:42, Hongyan Xia wrote:

> What is a big concern is a normal task and a uclamp_max task running on the
> same rq. If the uclamp_max task is 1024 but capped by uclamp_max at the
> lowest OPP, and the normal task has no uclamp but a duty cycle, then when

You mean util_avg is 1024 but capped to lowest OPP? uclamp_max is repeated but
couldn't decipher what you meant to write instead.

> the normal task wakes up on the rq, it'll be the highest OPP. When it
> sleeps, the ulamp_max is back and at the lowest OPP. This square-wave
> problem to me is a much bigger concern than an infrequent spike. If
> CONFIG_HZ is 1000, this square wave's frequency is 500 switching between

If the rq->util_avg is 1024, then for any task that is running, the requested
frequency should be max. If there's a task that is capped by uclamp_max, then
this task should not run at max.

So other tasks should have run at max; why you don't want them to run at max?

> highest and lowest OPP, which is definitely unacceptable.

How come so definitive? How did you reach this definitive conclusion?

> The problem I think with filtering is, under this condition, should we
> filter out the lowest OPP or the highest? Neither sounds like a good answer
> because neither is a short-running task and the correct answer might be
> somewhere in between.

We only ignore uclamp requirement with the filter. schedutil is drive by the rq
utilization signal as normal. It is only the fact should we apply
uclamp_min/max or not.

It seems you think we need to modify the rq->util_avg. And this should not be
the case. uclamp should not change how PELT accounting works; just modify how
some decisions based on it are taken.

It is true there's a corner case where util_avg could be wrong under the
documented limitation. But this is not related to max-aggregation and its
solution needs some creativity in handling pelt accounting under these
conditions.

Generally; capping that hard stirs question why userspace is doing this. We
don't want to cripple tasks, but prevent them from consuming inefficient energy
points. Otherwise they should make adequate progress. I'd expect uclamp_max to
be more meaningful for tasks that actually need to run at those higher
expensive frequencies.

So the corner case warrants fixing, but it is not a nuance in practice yet. And
it is a problem of failing to calculate the stolen idle time as we don't have
any under these corner conditions (Vincent can make a more accurate statement
than me here). It has nothing to do with how to handle performance requirements
of multiple RUNNABLE tasks.

> Sorry to ramble on this again and again, but I think filtering is addressing
> the symptom, not the cause. The cause is we have no idea under what
> condition a util_avg was achieved. The 1024 task in the previous example
> would be much better if we extend it into

I think the other way around :-) I think you're mixing the root cause of that
limitation with how uclamp hints for multiple tasks should be handled - which
is what is being fixed here.

I wrote the documentation and did the experiments to see how PELT behaves under
extreme conditions. And it says *it can break PELT*.

> [1024, achieved at uclamp_min 0, achieved at uclamp_max 300]

Why you think this is the dominant use case? And why do you think this is
caused by max-aggregation? This is a limitation of PELT accounting and has
nothing to do with max-aggregation which is how multiple uclamp hints for
RUNNABLE tasks are handled.

Have you actually seen it practice? We haven't come across this problem yet. We
just want to avoid using expensive OPPs, but capping too had is actually
a problem as it can cause starvation for those tasks.

Is it only the documentation what triggered this concern about this corner
case? I'm curious what have you seen.

> If we know 1024 was done under uclamp_max of 300, then we know we don't need
> to raise to the max OPP. So far, we carry around a lot of different new
> variables but not these two which we really need.

This problem is independent of how uclamp hint of multiple tasks should be
accounted for by the governor. This is a limitation of how PELT accounting
works. And to be honest, if you think more about it, this 300 tasks is already
a 1024 on current littles that has a capacity of 200 or less. And the capacity
of the mids at lowest OPP usually starts at a capacity around 100 or something.
Can't see it hit this problem while running on middle. I think this 300 tasks
will already run near lowest OPP at the big even without uclamp_max being
0 - it is that small for it.

So not sure on what systems you saw this problem on, and whether at all this is
a problem in practice. Like priority/nice and other sched attributes; you can
pick a wrong combination and shoot your self in the foot.

As I put in the documentation, this limitation will only hit if the actual task
capacity reaches some magical ratio. I'd expect practically these tasks to
still see idle time and get their util_avg corrected eventually.

So worth a fix, not related to handling performance requests for multiple
tasks, and not urgently needed as nothing is falling apart because of it for
the time being at least.

Cheers

--
Qais Yousef

2023-12-11 11:23:04

by Hongyan Xia

[permalink] [raw]

Subject: Re: [PATCH 3/4] sched/schedutil: Ignore update requests for short running tasks

On 10/12/2023 22:22, Qais Yousef wrote:
> Hi Hongyan
>
> On 12/08/23 10:42, Hongyan Xia wrote:
>
>> What is a big concern is a normal task and a uclamp_max task running on the
>> same rq. If the uclamp_max task is 1024 but capped by uclamp_max at the
>> lowest OPP, and the normal task has no uclamp but a duty cycle, then when
>
> You mean util_avg is 1024 but capped to lowest OPP? uclamp_max is repeated but
> couldn't decipher what you meant to write instead.
>
>> the normal task wakes up on the rq, it'll be the highest OPP. When it
>> sleeps, the ulamp_max is back and at the lowest OPP. This square-wave
>> problem to me is a much bigger concern than an infrequent spike. If
>> CONFIG_HZ is 1000, this square wave's frequency is 500 switching between
>
> If the rq->util_avg is 1024, then for any task that is running, the requested
> frequency should be max. If there's a task that is capped by uclamp_max, then
> this task should not run at max.
>
> So other tasks should have run at max; why you don't want them to run at max?

Because it saves power. If there's a 1024 task capped at 300 and a true
300 task without uclamp on the same rq, there's no need to run the CPU
at more than 600. Running it at 1024 ignores the uclamp_max and burns
battery when it's not needed.

>> highest and lowest OPP, which is definitely unacceptable.
>
> How come so definitive? How did you reach this definitive conclusion?

You are right. After talking to our firmware and silicon engineers they
don't think switching between the highest and lowest OPP 500 times a
second can have damaging effects, so I retract the 'unacceptable' comment.

>> The problem I think with filtering is, under this condition, should we
>> filter out the lowest OPP or the highest? Neither sounds like a good answer
>> because neither is a short-running task and the correct answer might be
>> somewhere in between.
>
> We only ignore uclamp requirement with the filter. schedutil is drive by the rq
> utilization signal as normal. It is only the fact should we apply
> uclamp_min/max or not.
>
> It seems you think we need to modify the rq->util_avg. And this should not be
> the case. uclamp should not change how PELT accounting works; just modify how
> some decisions based on it are taken.

I agree, uclamp shouldn't change PELT, but my series doesn't. Just like
util_est which boosts util_avg, my patches don't touch util_avg but
simply introduces util_min, util_max on the side of util_avg. I fail to
see why it's okay for util_est to bias util_avg but not okay for me to
do so. If this is the case, then the 'util_guest' proposal should also
be right out rejected on the same ground.

> It is true there's a corner case where util_avg could be wrong under the
> documented limitation. But this is not related to max-aggregation and its
> solution needs some creativity in handling pelt accounting under these
> conditions.
>
> Generally; capping that hard stirs question why userspace is doing this. We
> don't want to cripple tasks, but prevent them from consuming inefficient energy
> points. Otherwise they should make adequate progress. I'd expect uclamp_max to
> be more meaningful for tasks that actually need to run at those higher
> expensive frequencies.
>
> So the corner case warrants fixing, but it is not a nuance in practice yet. And
> it is a problem of failing to calculate the stolen idle time as we don't have
> any under these corner conditions (Vincent can make a more accurate statement
> than me here). It has nothing to do with how to handle performance requirements
> of multiple RUNNABLE tasks.
>
>> Sorry to ramble on this again and again, but I think filtering is addressing
>> the symptom, not the cause. The cause is we have no idea under what
>> condition a util_avg was achieved. The 1024 task in the previous example
>> would be much better if we extend it into
>
> I think the other way around :-) I think you're mixing the root cause of that
> limitation with how uclamp hints for multiple tasks should be handled - which
> is what is being fixed here.
>
> I wrote the documentation and did the experiments to see how PELT behaves under
> extreme conditions. And it says *it can break PELT*.
>
>> [1024, achieved at uclamp_min 0, achieved at uclamp_max 300]
>
> Why you think this is the dominant use case? And why do you think this is
> caused by max-aggregation? This is a limitation of PELT accounting and has
> nothing to do with max-aggregation which is how multiple uclamp hints for
> RUNNABLE tasks are handled.
>
> Have you actually seen it practice? We haven't come across this problem yet. We
> just want to avoid using expensive OPPs, but capping too had is actually
> a problem as it can cause starvation for those tasks.
>
> Is it only the documentation what triggered this concern about this corner
> case? I'm curious what have you seen.

This is not a corner case. Having a uclamp_max task and a normal task on
the same rq is fairly common. My concern isn't the 'frequency spike'
problem from documentation. My concern comes from benchmarks, which is
high-frequency square waves. An occasional spike isn't worrying, but the
square waves are.

>> If we know 1024 was done under uclamp_max of 300, then we know we don't need
>> to raise to the max OPP. So far, we carry around a lot of different new
>> variables but not these two which we really need.
>
> This problem is independent of how uclamp hint of multiple tasks should be
> accounted for by the governor. This is a limitation of how PELT accounting
> works. And to be honest, if you think more about it, this 300 tasks is already
> a 1024 on current littles that has a capacity of 200 or less. And the capacity
> of the mids at lowest OPP usually starts at a capacity around 100 or something.
> Can't see it hit this problem while running on middle. I think this 300 tasks
> will already run near lowest OPP at the big even without uclamp_max being
> 0 - it is that small for it.
>
> So not sure on what systems you saw this problem on, and whether at all this is
> a problem in practice. Like priority/nice and other sched attributes; you can
> pick a wrong combination and shoot your self in the foot.
>
> As I put in the documentation, this limitation will only hit if the actual task
> capacity reaches some magical ratio. I'd expect practically these tasks to
> still see idle time and get their util_avg corrected eventually.

Like in the previous comment, it's square waves that happen 500 times a
second I saw in benchmarks that's worrying, not the occasional spike in
documentation. I doubt we can say that a uclamp_max task and a normal
task running on the same rq is a corner case.

> So worth a fix, not related to handling performance requests for multiple
> tasks, and not urgently needed as nothing is falling apart because of it for
> the time being at least.

Also, I think there's still an unanswered question. If there's a 1024
task with a uclamp_min of 300 and a 300-util task with default uclamp on
the same rq, which currently under max aggregation switches between
highest and lowest OPP, should we filter out the high OPP or the low
one? Neither is a short-running task. We could designate this as a
corner case (though I don't think so), but wouldn't it be nice if we
don't have any of these problems in the first place?

Hongyan

2023-12-12 12:24:02

by Qais Yousef

[permalink] [raw]

Subject: Re: [PATCH 3/4] sched/schedutil: Ignore update requests for short running tasks

On 12/11/23 11:15, Hongyan Xia wrote:
> > If the rq->util_avg is 1024, then for any task that is running, the requested
> > frequency should be max. If there's a task that is capped by uclamp_max, then
> > this task should not run at max.
> >
> > So other tasks should have run at max; why you don't want them to run at max?
>
> Because it saves power. If there's a 1024 task capped at 300 and a true 300
> task without uclamp on the same rq, there's no need to run the CPU at more
> than 600. Running it at 1024 ignores the uclamp_max and burns battery when
> it's not needed.

To fix this problem of tasks that are not 1024 but appearing 1024 the solution
doesn't lie on how the combined tasks perf hints are treated.

Note that tasks stuck on a CPU with small capacity (due to affinity or
extremely long balance_interval) can still appear as 1024 the same way
UCLAMP_MAX induces.

> > Is it only the documentation what triggered this concern about this corner
> > case? I'm curious what have you seen.
>
> This is not a corner case. Having a uclamp_max task and a normal task on the
> same rq is fairly common. My concern isn't the 'frequency spike' problem
> from documentation. My concern comes from benchmarks, which is
> high-frequency square waves. An occasional spike isn't worrying, but the
> square waves are.

Fairly common in practice or you synthetic test setup to trigger it? We haven't
hit this problem in practice. Again, the solution is not related to how the
performance hints are applied.

Note if you have busy tasks running and sharing the CPU, the CPU should run at
max for non capped tasks. Please differentiate between the two problems.

>
> > So worth a fix, not related to handling performance requests for multiple
> > tasks, and not urgently needed as nothing is falling apart because of it for
> > the time being at least.
>
> Also, I think there's still an unanswered question. If there's a 1024 task

If there's a 1024 tasks on the rq then it'd run at max frequency and the system
will be overutilized and EAS disabled and work is spread according to load.

Cheers

--
Qais Yousef

> with a uclamp_min of 300 and a 300-util task with default uclamp on the same
> rq, which currently under max aggregation switches between highest and
> lowest OPP, should we filter out the high OPP or the low one? Neither is a
> short-running task. We could designate this as a corner case (though I don't
> think so), but wouldn't it be nice if we don't have any of these problems in
> the first place?
>
> Hongyan